This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Use in-line 'no overloading' for speed
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
81714fb9 135 HV *paren_names; /* Paren names */
1f1031fe 136
40d049e4
YO
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
3dab1dad 143#ifdef DEBUGGING
be8e71aa 144 const char *lastparse;
3dab1dad 145 I32 lastnum;
1f1031fe 146 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
147#define RExC_lastparse (pRExC_state->lastparse)
148#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 149#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 150#endif
830247a4
IZ
151} RExC_state_t;
152
e2509266 153#define RExC_flags (pRExC_state->flags)
830247a4 154#define RExC_precomp (pRExC_state->precomp)
288b8c02 155#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 156#define RExC_rx (pRExC_state->rx)
f8fc2ecf 157#define RExC_rxi (pRExC_state->rxi)
fac92740 158#define RExC_start (pRExC_state->start)
830247a4
IZ
159#define RExC_end (pRExC_state->end)
160#define RExC_parse (pRExC_state->parse)
161#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
162#ifdef RE_TRACK_PATTERN_OFFSETS
163#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
164#endif
830247a4 165#define RExC_emit (pRExC_state->emit)
fac92740 166#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 167#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
168#define RExC_naughty (pRExC_state->naughty)
169#define RExC_sawback (pRExC_state->sawback)
170#define RExC_seen (pRExC_state->seen)
171#define RExC_size (pRExC_state->size)
172#define RExC_npar (pRExC_state->npar)
e2e6a0f1 173#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
174#define RExC_extralen (pRExC_state->extralen)
175#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 177#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 178#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
179#define RExC_open_parens (pRExC_state->open_parens)
180#define RExC_close_parens (pRExC_state->close_parens)
181#define RExC_opend (pRExC_state->opend)
81714fb9 182#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
183#define RExC_recurse (pRExC_state->recurse)
184#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 185
cde0cee5 186
a687059c
LW
187#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
a687059c 190
35c8bce7
LW
191#ifdef SPSTART
192#undef SPSTART /* dratted cpp namespace... */
193#endif
a687059c
LW
194/*
195 * Flags to be passed up and down.
196 */
a687059c 197#define WORST 0 /* Worst case. */
a3b492c3
YO
198#define HASWIDTH 0x01 /* Known to match non-null strings. */
199#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200#define SPSTART 0x04 /* Starts with * or +. */
201#define TRYAGAIN 0x08 /* Weeded out a declaration. */
202#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 203
3dab1dad
YO
204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205
07be1b83
YO
206/* whether trie related optimizations are enabled */
207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208#define TRIE_STUDY_OPT
786e8c11 209#define FULL_TRIE_STUDY
07be1b83
YO
210#define TRIE_STCLASS
211#endif
1de06328
YO
212
213
40d049e4
YO
214
215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216#define PBITVAL(paren) (1 << ((paren) & 7))
217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220
221
1de06328
YO
222/* About scan_data_t.
223
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
230 in the pattern:
231
232 /FOO[xX]A.*B[xX]BAR/
233
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
237
238 The strings can be composites, for instance
239
240 /(f)(o)(o)/
241
242 will result in a composite fixed substring 'foo'.
243
244 For each string some basic information is maintained:
245
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
252 found.
253
254 - max_offset
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
258
259 - minlenp
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
263 involved. Consider
264
265 /(?=FOO).*F/
266
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
277
278 - lookbehind
279
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
290 associated string.
291
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
294
295*/
2c2d71f5
JH
296
297typedef struct scan_data_t {
1de06328
YO
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
2c2d71f5
JH
300 I32 pos_min;
301 I32 pos_delta;
302 SV *last_found;
1de06328 303 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
304 I32 last_start_min;
305 I32 last_start_max;
1de06328
YO
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
316 I32 flags;
317 I32 whilem_c;
cb434fcc 318 I32 *last_closep;
653099ff 319 struct regnode_charclass_class *start_class;
2c2d71f5
JH
320} scan_data_t;
321
a687059c 322/*
e50aee73 323 * Forward declarations for pregcomp()'s friends.
a687059c 324 */
a0d0e21e 325
27da23d5 326static const scan_data_t zero_scan_data =
1de06328 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
328
329#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
330#define SF_BEFORE_SEOL 0x0001
331#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
332#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334
09b7f37c
CB
335#ifdef NO_UNARY_PLUS
336# define SF_FIX_SHIFT_EOL (0+2)
337# define SF_FL_SHIFT_EOL (0+4)
338#else
339# define SF_FIX_SHIFT_EOL (+2)
340# define SF_FL_SHIFT_EOL (+4)
341#endif
c277df42
IZ
342
343#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345
346#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
348#define SF_IS_INF 0x0040
349#define SF_HAS_PAR 0x0080
350#define SF_IN_PAR 0x0100
351#define SF_HAS_EVAL 0x0200
352#define SCF_DO_SUBSTR 0x0400
653099ff
GS
353#define SCF_DO_STCLASS_AND 0x0800
354#define SCF_DO_STCLASS_OR 0x1000
355#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 356#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 357
786e8c11 358#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 359#define SCF_SEEN_ACCEPT 0x8000
07be1b83 360
eb160463 361#define UTF (RExC_utf8 != 0)
bbe252da
YO
362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 364
ffc61ed2 365#define OOB_UNICODE 12345678
93733859 366#define OOB_NAMEDCLASS -1
b8c5462f 367
a0ed51b3
LW
368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370
8615cb43 371
b45f050a
JF
372/* length of regex to show in messages that don't mark a position within */
373#define RegexLengthToShowInErrorMessages 127
374
375/*
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
379 */
7253e4e3
RK
380#define MARKER1 "<-- HERE" /* marker as it appears in the description */
381#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 382
7253e4e3 383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
384
385/*
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
388 * "...".
389 */
58e23c8d 390#define _FAIL(code) STMT_START { \
bfed75c6 391 const char *ellipses = ""; \
ccb2c380
MP
392 IV len = RExC_end - RExC_precomp; \
393 \
394 if (!SIZE_ONLY) \
288b8c02 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
399 ellipses = "..."; \
400 } \
58e23c8d 401 code; \
ccb2c380 402} STMT_END
8615cb43 403
58e23c8d
YO
404#define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
407
408#define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
411
b45f050a 412/*
b45f050a
JF
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414 */
ccb2c380 415#define Simple_vFAIL(m) STMT_START { \
a28509cc 416 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423 */
ccb2c380
MP
424#define vFAIL(m) STMT_START { \
425 if (!SIZE_ONLY) \
288b8c02 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
427 Simple_vFAIL(m); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Like Simple_vFAIL(), but accepts two arguments.
432 */
ccb2c380 433#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 434 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441 */
ccb2c380
MP
442#define vFAIL2(m,a1) STMT_START { \
443 if (!SIZE_ONLY) \
288b8c02 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
445 Simple_vFAIL2(m, a1); \
446} STMT_END
b45f050a
JF
447
448
449/*
450 * Like Simple_vFAIL(), but accepts three arguments.
451 */
ccb2c380 452#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460 */
ccb2c380
MP
461#define vFAIL3(m,a1,a2) STMT_START { \
462 if (!SIZE_ONLY) \
288b8c02 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
464 Simple_vFAIL3(m, a1, a2); \
465} STMT_END
b45f050a
JF
466
467/*
468 * Like Simple_vFAIL(), but accepts four arguments.
469 */
ccb2c380 470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 471 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
b45f050a 475
668c081a 476#define ckWARNreg(loc,m) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
480} STMT_END
481
668c081a 482#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
d1d15184 484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
485 m REPORT_LOCATION, \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
487} STMT_END
488
668c081a 489#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
668c081a 491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
668c081a
NC
501#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
ccb2c380 507#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
512
668c081a
NC
513#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
ccb2c380 519#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 520 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
523} STMT_END
9d1d55b5 524
8615cb43 525
cd439c50 526/* Allow for side effects in s */
ccb2c380
MP
527#define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
529} STMT_END
cd439c50 530
fac92740
MJD
531/* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
07be1b83 535 * Position is 1 indexed.
fac92740 536 */
7122b237
YO
537#ifndef RE_TRACK_PATTERN_OFFSETS
538#define Set_Node_Offset_To_R(node,byte)
539#define Set_Node_Offset(node,byte)
540#define Set_Cur_Node_Offset
541#define Set_Node_Length_To_R(node,len)
542#define Set_Node_Length(node,len)
543#define Set_Node_Cur_Length(node)
544#define Node_Offset(n)
545#define Node_Length(n)
546#define Set_Node_Offset_Length(node,offset,len)
547#define ProgLen(ri) ri->u.proglen
548#define SetProgLen(ri,x) ri->u.proglen = x
549#else
550#define ProgLen(ri) ri->u.offsets[0]
551#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
552#define Set_Node_Offset_To_R(node,byte) STMT_START { \
553 if (! SIZE_ONLY) { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 555 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 556 if((node) < 0) { \
551405c4 557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
558 } else { \
559 RExC_offsets[2*(node)-1] = (byte); \
560 } \
561 } \
562} STMT_END
563
564#define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
567
568#define Set_Node_Length_To_R(node,len) STMT_START { \
569 if (! SIZE_ONLY) { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 571 __LINE__, (int)(node), (int)(len))); \
ccb2c380 572 if((node) < 0) { \
551405c4 573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
574 } else { \
575 RExC_offsets[2*(node)] = (len); \
576 } \
577 } \
578} STMT_END
579
580#define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583#define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
585
586/* Get offsets and lengths */
587#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
589
07be1b83
YO
590#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
593} STMT_END
7122b237 594#endif
07be1b83
YO
595
596#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597#define EXPERIMENTAL_INPLACESCAN
7122b237 598#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 599
304ee84b
YO
600#define DEBUG_STUDYDATA(str,data,depth) \
601DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 602 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
304ee84b 608 (UV)((data)->flags), \
1de06328 609 (IV)((data)->whilem_c), \
304ee84b
YO
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
1de06328
YO
612 ); \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
630 ); \
631 PerlIO_printf(Perl_debug_log,"\n"); \
632});
633
acfe0abc 634static void clear_re(pTHX_ void *r);
4327152a 635
653099ff 636/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 637 Update the longest found anchored substring and the longest found
653099ff
GS
638 floating substrings if needed. */
639
4327152a 640STATIC void
304ee84b 641S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 642{
e1ec3a88
AL
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 645 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 646
7918f24d
NC
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
648
c277df42 649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 650 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
b81d288d 654 data->flags
c277df42
IZ
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
656 else
657 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
a0ed51b3 660 }
304ee84b 661 else { /* *data->longest == data->longest_float */
c277df42 662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
663 data->offset_float_max = (l
664 ? data->last_start_max
c277df42 665 : data->pos_min + data->pos_delta);
304ee84b 666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 667 data->offset_float_max = I32_MAX;
c277df42 668 if (data->flags & SF_BEFORE_EOL)
b81d288d 669 data->flags
c277df42
IZ
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
671 else
672 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
c277df42
IZ
675 }
676 }
677 SvCUR_set(data->last_found, 0);
0eda9292 678 {
a28509cc 679 SV * const sv = data->last_found;
097eb12c
AL
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
682 if (mg)
683 mg->mg_len = 0;
684 }
0eda9292 685 }
c277df42
IZ
686 data->last_end = -1;
687 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 688 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
689}
690
653099ff
GS
691/* Can match anything (initialization) */
692STATIC void
097eb12c 693S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 694{
7918f24d
NC
695 PERL_ARGS_ASSERT_CL_ANYTHING;
696
653099ff 697 ANYOF_CLASS_ZERO(cl);
f8bef550 698 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
700 if (LOC)
701 cl->flags |= ANYOF_LOCALE;
702}
703
704/* Can match anything (initialization) */
705STATIC int
5f66b61c 706S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
707{
708 int value;
709
7918f24d
NC
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
711
aaa51d5e 712 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
714 return 1;
1aa99e6b
IH
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
716 return 0;
10edeb5d 717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 718 return 0;
653099ff
GS
719 return 1;
720}
721
722/* Can match anything (initialization) */
723STATIC void
097eb12c 724S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 725{
7918f24d
NC
726 PERL_ARGS_ASSERT_CL_INIT;
727
8ecf7187 728 Zero(cl, 1, struct regnode_charclass_class);
653099ff 729 cl->type = ANYOF;
830247a4 730 cl_anything(pRExC_state, cl);
653099ff
GS
731}
732
733STATIC void
097eb12c 734S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 735{
7918f24d
NC
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
737
8ecf7187 738 Zero(cl, 1, struct regnode_charclass_class);
653099ff 739 cl->type = ANYOF;
830247a4 740 cl_anything(pRExC_state, cl);
653099ff
GS
741 if (LOC)
742 cl->flags |= ANYOF_LOCALE;
743}
744
745/* 'And' a given class with another one. Can create false positives */
746/* We assume that cl is not inverted */
747STATIC void
5f66b61c 748S_cl_and(struct regnode_charclass_class *cl,
a28509cc 749 const struct regnode_charclass_class *and_with)
653099ff 750{
7918f24d 751 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
752
753 assert(and_with->type == ANYOF);
653099ff
GS
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
759 int i;
760
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
764 else
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
1aa99e6b 770
14ebb1a2
JH
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
776 }
14ebb1a2
JH
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 779 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 782 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
783}
784
785/* 'OR' a given class with another one. Can create false positives */
786/* We assume that cl is not inverted */
787STATIC void
097eb12c 788S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 789{
7918f24d
NC
790 PERL_ARGS_ASSERT_CL_OR;
791
653099ff
GS
792 if (or_with->flags & ANYOF_INVERT) {
793 /* We do not use
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
801 */
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
805 int i;
806
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
810 else {
830247a4 811 cl_anything(pRExC_state, cl);
653099ff
GS
812 }
813 } else {
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 816 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
817 || (cl->flags & ANYOF_FOLD)) ) {
818 int i;
819
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
827 }
828 }
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 830 cl_anything(pRExC_state, cl);
653099ff
GS
831 }
832 }
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
835
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
840 }
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
844 }
653099ff
GS
845}
846
a3621e74
YO
847#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
851
3dab1dad
YO
852
853#ifdef DEBUGGING
07be1b83 854/*
2b8b4781
NC
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
858
859 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
863
3dab1dad
YO
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
07be1b83
YO
866
867*/
3dab1dad
YO
868
869/*
3dab1dad
YO
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
872*/
873
874STATIC void
2b8b4781
NC
875S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
3dab1dad
YO
877{
878 U32 state;
ab3bbdeb 879 SV *sv=sv_newmortal();
55eed653 880 int colwidth= widecharmap ? 6 : 4;
2e64971a 881 U16 word;
3dab1dad
YO
882 GET_RE_DEBUG_FLAGS_DECL;
883
7918f24d 884 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 885
3dab1dad
YO
886 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
887 (int)depth * 2 + 2,"",
888 "Match","Base","Ofs" );
889
890 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 891 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 892 if ( tmp ) {
ab3bbdeb
YO
893 PerlIO_printf( Perl_debug_log, "%*s",
894 colwidth,
ddc5bc0f 895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
899 )
900 );
3dab1dad
YO
901 }
902 }
903 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
904 (int)depth * 2 + 2,"");
905
906 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
908 PerlIO_printf( Perl_debug_log, "\n");
909
1e2e3d02 910 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 911 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
912
913 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
914
915 if ( trie->states[ state ].wordnum ) {
916 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
917 } else {
918 PerlIO_printf( Perl_debug_log, "%6s", "" );
919 }
920
921 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
922
923 if ( base ) {
924 U32 ofs = 0;
925
926 while( ( base + ofs < trie->uniquecharcount ) ||
927 ( base + ofs - trie->uniquecharcount < trie->lasttrans
928 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
929 ofs++;
930
931 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
932
933 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
934 if ( ( base + ofs >= trie->uniquecharcount ) &&
935 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
936 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
937 {
ab3bbdeb
YO
938 PerlIO_printf( Perl_debug_log, "%*"UVXf,
939 colwidth,
3dab1dad
YO
940 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
941 } else {
ab3bbdeb 942 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
943 }
944 }
945
946 PerlIO_printf( Perl_debug_log, "]");
947
948 }
949 PerlIO_printf( Perl_debug_log, "\n" );
950 }
2e64971a
DM
951 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
952 for (word=1; word <= trie->wordcount; word++) {
953 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
954 (int)word, (int)(trie->wordinfo[word].prev),
955 (int)(trie->wordinfo[word].len));
956 }
957 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
958}
959/*
3dab1dad
YO
960 Dumps a fully constructed but uncompressed trie in list form.
961 List tries normally only are used for construction when the number of
962 possible chars (trie->uniquecharcount) is very high.
963 Used for debugging make_trie().
964*/
965STATIC void
55eed653 966S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
967 HV *widecharmap, AV *revcharmap, U32 next_alloc,
968 U32 depth)
3dab1dad
YO
969{
970 U32 state;
ab3bbdeb 971 SV *sv=sv_newmortal();
55eed653 972 int colwidth= widecharmap ? 6 : 4;
3dab1dad 973 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
974
975 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
976
3dab1dad 977 /* print out the table precompression. */
ab3bbdeb
YO
978 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
979 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
980 "------:-----+-----------------\n" );
3dab1dad
YO
981
982 for( state=1 ; state < next_alloc ; state ++ ) {
983 U16 charid;
984
ab3bbdeb 985 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
986 (int)depth * 2 + 2,"", (UV)state );
987 if ( ! trie->states[ state ].wordnum ) {
988 PerlIO_printf( Perl_debug_log, "%5s| ","");
989 } else {
990 PerlIO_printf( Perl_debug_log, "W%4x| ",
991 trie->states[ state ].wordnum
992 );
993 }
994 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 995 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
996 if ( tmp ) {
997 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
998 colwidth,
ddc5bc0f 999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1000 PL_colors[0], PL_colors[1],
1001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002 PERL_PV_ESCAPE_FIRSTCHAR
1003 ) ,
1e2e3d02
YO
1004 TRIE_LIST_ITEM(state,charid).forid,
1005 (UV)TRIE_LIST_ITEM(state,charid).newstate
1006 );
1007 if (!(charid % 10))
664e119d
RGS
1008 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1009 (int)((depth * 2) + 14), "");
1e2e3d02 1010 }
ab3bbdeb
YO
1011 }
1012 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1013 }
1014}
1015
1016/*
3dab1dad
YO
1017 Dumps a fully constructed but uncompressed trie in table form.
1018 This is the normal DFA style state transition table, with a few
1019 twists to facilitate compression later.
1020 Used for debugging make_trie().
1021*/
1022STATIC void
55eed653 1023S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1024 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1025 U32 depth)
3dab1dad
YO
1026{
1027 U32 state;
1028 U16 charid;
ab3bbdeb 1029 SV *sv=sv_newmortal();
55eed653 1030 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1031 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1032
1033 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1034
1035 /*
1036 print out the table precompression so that we can do a visual check
1037 that they are identical.
1038 */
1039
1040 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1041
1042 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1043 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1044 if ( tmp ) {
ab3bbdeb
YO
1045 PerlIO_printf( Perl_debug_log, "%*s",
1046 colwidth,
ddc5bc0f 1047 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1048 PL_colors[0], PL_colors[1],
1049 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1050 PERL_PV_ESCAPE_FIRSTCHAR
1051 )
1052 );
3dab1dad
YO
1053 }
1054 }
1055
1056 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1057
1058 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1059 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1060 }
1061
1062 PerlIO_printf( Perl_debug_log, "\n" );
1063
1064 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1065
1066 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1067 (int)depth * 2 + 2,"",
1068 (UV)TRIE_NODENUM( state ) );
1069
1070 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1071 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1072 if (v)
1073 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1074 else
1075 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1076 }
1077 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1078 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1079 } else {
1080 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1081 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082 }
1083 }
07be1b83 1084}
3dab1dad
YO
1085
1086#endif
1087
2e64971a 1088
786e8c11
YO
1089/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1090 startbranch: the first branch in the whole branch sequence
1091 first : start branch of sequence of branch-exact nodes.
1092 May be the same as startbranch
1093 last : Thing following the last branch.
1094 May be the same as tail.
1095 tail : item following the branch sequence
1096 count : words in the sequence
1097 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1098 depth : indent depth
3dab1dad 1099
786e8c11 1100Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1101
786e8c11
YO
1102A trie is an N'ary tree where the branches are determined by digital
1103decomposition of the key. IE, at the root node you look up the 1st character and
1104follow that branch repeat until you find the end of the branches. Nodes can be
1105marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1106
786e8c11 1107 /he|she|his|hers/
72f13be8 1108
786e8c11
YO
1109would convert into the following structure. Numbers represent states, letters
1110following numbers represent valid transitions on the letter from that state, if
1111the number is in square brackets it represents an accepting state, otherwise it
1112will be in parenthesis.
07be1b83 1113
786e8c11
YO
1114 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1115 | |
1116 | (2)
1117 | |
1118 (1) +-i->(6)-+-s->[7]
1119 |
1120 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1121
786e8c11
YO
1122 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1123
1124This shows that when matching against the string 'hers' we will begin at state 1
1125read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1126then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1127is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1128single traverse. We store a mapping from accepting to state to which word was
1129matched, and then when we have multiple possibilities we try to complete the
1130rest of the regex in the order in which they occured in the alternation.
1131
1132The only prior NFA like behaviour that would be changed by the TRIE support is
1133the silent ignoring of duplicate alternations which are of the form:
1134
1135 / (DUPE|DUPE) X? (?{ ... }) Y /x
1136
1137Thus EVAL blocks follwing a trie may be called a different number of times with
1138and without the optimisation. With the optimisations dupes will be silently
1139ignored. This inconsistant behaviour of EVAL type nodes is well established as
1140the following demonstrates:
1141
1142 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1143
1144which prints out 'word' three times, but
1145
1146 'words'=~/(word|word|word)(?{ print $1 })S/
1147
1148which doesnt print it out at all. This is due to other optimisations kicking in.
1149
1150Example of what happens on a structural level:
1151
1152The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1153
1154 1: CURLYM[1] {1,32767}(18)
1155 5: BRANCH(8)
1156 6: EXACT <ac>(16)
1157 8: BRANCH(11)
1158 9: EXACT <ad>(16)
1159 11: BRANCH(14)
1160 12: EXACT <ab>(16)
1161 16: SUCCEED(0)
1162 17: NOTHING(18)
1163 18: END(0)
1164
1165This would be optimizable with startbranch=5, first=5, last=16, tail=16
1166and should turn into:
1167
1168 1: CURLYM[1] {1,32767}(18)
1169 5: TRIE(16)
1170 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171 <ac>
1172 <ad>
1173 <ab>
1174 16: SUCCEED(0)
1175 17: NOTHING(18)
1176 18: END(0)
1177
1178Cases where tail != last would be like /(?foo|bar)baz/:
1179
1180 1: BRANCH(4)
1181 2: EXACT <foo>(8)
1182 4: BRANCH(7)
1183 5: EXACT <bar>(8)
1184 7: TAIL(8)
1185 8: EXACT <baz>(10)
1186 10: END(0)
1187
1188which would be optimizable with startbranch=1, first=1, last=7, tail=8
1189and would end up looking like:
1190
1191 1: TRIE(8)
1192 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1193 <foo>
1194 <bar>
1195 7: TAIL(8)
1196 8: EXACT <baz>(10)
1197 10: END(0)
1198
1199 d = uvuni_to_utf8_flags(d, uv, 0);
1200
1201is the recommended Unicode-aware way of saying
1202
1203 *(d++) = uv;
1204*/
1205
1e2e3d02 1206#define TRIE_STORE_REVCHAR \
786e8c11 1207 STMT_START { \
73031816
NC
1208 if (UTF) { \
1209 SV *zlopp = newSV(2); \
88c9ea1e
CB
1210 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1211 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1212 SvCUR_set(zlopp, kapow - flrbbbbb); \
1213 SvPOK_on(zlopp); \
1214 SvUTF8_on(zlopp); \
1215 av_push(revcharmap, zlopp); \
1216 } else { \
6bdeddd2 1217 char ooooff = (char)uvc; \
73031816
NC
1218 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1219 } \
1220 } STMT_END
786e8c11
YO
1221
1222#define TRIE_READ_CHAR STMT_START { \
1223 wordlen++; \
1224 if ( UTF ) { \
1225 if ( folder ) { \
1226 if ( foldlen > 0 ) { \
1227 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1228 foldlen -= len; \
1229 scan += len; \
1230 len = 0; \
1231 } else { \
1232 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1233 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1234 foldlen -= UNISKIP( uvc ); \
1235 scan = foldbuf + UNISKIP( uvc ); \
1236 } \
1237 } else { \
1238 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239 } \
1240 } else { \
1241 uvc = (U32)*uc; \
1242 len = 1; \
1243 } \
1244} STMT_END
1245
1246
1247
1248#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1249 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1250 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1251 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1252 } \
1253 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1254 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1255 TRIE_LIST_CUR( state )++; \
1256} STMT_END
07be1b83 1257
786e8c11
YO
1258#define TRIE_LIST_NEW(state) STMT_START { \
1259 Newxz( trie->states[ state ].trans.list, \
1260 4, reg_trie_trans_le ); \
1261 TRIE_LIST_CUR( state ) = 1; \
1262 TRIE_LIST_LEN( state ) = 4; \
1263} STMT_END
07be1b83 1264
786e8c11
YO
1265#define TRIE_HANDLE_WORD(state) STMT_START { \
1266 U16 dupe= trie->states[ state ].wordnum; \
1267 regnode * const noper_next = regnext( noper ); \
1268 \
786e8c11
YO
1269 DEBUG_r({ \
1270 /* store the word for dumping */ \
1271 SV* tmp; \
1272 if (OP(noper) != NOTHING) \
740cce10 1273 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1274 else \
740cce10 1275 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1276 av_push( trie_words, tmp ); \
786e8c11
YO
1277 }); \
1278 \
1279 curword++; \
2e64971a
DM
1280 trie->wordinfo[curword].prev = 0; \
1281 trie->wordinfo[curword].len = wordlen; \
1282 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1283 \
1284 if ( noper_next < tail ) { \
1285 if (!trie->jump) \
c944940b 1286 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1287 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1288 if (!jumper) \
1289 jumper = noper_next; \
1290 if (!nextbranch) \
1291 nextbranch= regnext(cur); \
1292 } \
1293 \
1294 if ( dupe ) { \
2e64971a
DM
1295 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1296 /* chain, so that when the bits of chain are later */\
1297 /* linked together, the dups appear in the chain */\
1298 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1299 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1300 } else { \
1301 /* we haven't inserted this word yet. */ \
1302 trie->states[ state ].wordnum = curword; \
1303 } \
1304} STMT_END
07be1b83 1305
3dab1dad 1306
786e8c11
YO
1307#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1308 ( ( base + charid >= ucharcount \
1309 && base + charid < ubound \
1310 && state == trie->trans[ base - ucharcount + charid ].check \
1311 && trie->trans[ base - ucharcount + charid ].next ) \
1312 ? trie->trans[ base - ucharcount + charid ].next \
1313 : ( state==1 ? special : 0 ) \
1314 )
3dab1dad 1315
786e8c11
YO
1316#define MADE_TRIE 1
1317#define MADE_JUMP_TRIE 2
1318#define MADE_EXACT_TRIE 4
3dab1dad 1319
a3621e74 1320STATIC I32
786e8c11 1321S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1322{
27da23d5 1323 dVAR;
a3621e74
YO
1324 /* first pass, loop through and scan words */
1325 reg_trie_data *trie;
55eed653 1326 HV *widecharmap = NULL;
2b8b4781 1327 AV *revcharmap = newAV();
a3621e74 1328 regnode *cur;
9f7f3913 1329 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1330 STRLEN len = 0;
1331 UV uvc = 0;
1332 U16 curword = 0;
1333 U32 next_alloc = 0;
786e8c11
YO
1334 regnode *jumper = NULL;
1335 regnode *nextbranch = NULL;
7f69552c 1336 regnode *convert = NULL;
2e64971a 1337 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1338 /* we just use folder as a flag in utf8 */
e1ec3a88 1339 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1340 ? PL_fold
1341 : ( flags == EXACTFL
1342 ? PL_fold_locale
1343 : NULL
1344 )
1345 );
1346
2b8b4781
NC
1347#ifdef DEBUGGING
1348 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1349 AV *trie_words = NULL;
1350 /* along with revcharmap, this only used during construction but both are
1351 * useful during debugging so we store them in the struct when debugging.
8e11feef 1352 */
2b8b4781
NC
1353#else
1354 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1355 STRLEN trie_charcount=0;
3dab1dad 1356#endif
2b8b4781 1357 SV *re_trie_maxbuff;
a3621e74 1358 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1359
1360 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1361#ifndef DEBUGGING
1362 PERL_UNUSED_ARG(depth);
1363#endif
a3621e74 1364
c944940b 1365 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1366 trie->refcount = 1;
3dab1dad 1367 trie->startstate = 1;
786e8c11 1368 trie->wordcount = word_count;
f8fc2ecf 1369 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1370 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1371 if (!(UTF && folder))
c944940b 1372 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1373 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1374 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1375
a3621e74 1376 DEBUG_r({
2b8b4781 1377 trie_words = newAV();
a3621e74 1378 });
a3621e74 1379
0111c4fd 1380 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1381 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1382 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1383 }
3dab1dad
YO
1384 DEBUG_OPTIMISE_r({
1385 PerlIO_printf( Perl_debug_log,
786e8c11 1386 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1387 (int)depth * 2 + 2, "",
1388 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1389 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1390 (int)depth);
3dab1dad 1391 });
7f69552c
YO
1392
1393 /* Find the node we are going to overwrite */
1394 if ( first == startbranch && OP( last ) != BRANCH ) {
1395 /* whole branch chain */
1396 convert = first;
1397 } else {
1398 /* branch sub-chain */
1399 convert = NEXTOPER( first );
1400 }
1401
a3621e74
YO
1402 /* -- First loop and Setup --
1403
1404 We first traverse the branches and scan each word to determine if it
1405 contains widechars, and how many unique chars there are, this is
1406 important as we have to build a table with at least as many columns as we
1407 have unique chars.
1408
1409 We use an array of integers to represent the character codes 0..255
38a44b82 1410 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1411 native representation of the character value as the key and IV's for the
1412 coded index.
1413
1414 *TODO* If we keep track of how many times each character is used we can
1415 remap the columns so that the table compression later on is more
1416 efficient in terms of memory by ensuring most common value is in the
1417 middle and the least common are on the outside. IMO this would be better
1418 than a most to least common mapping as theres a decent chance the most
1419 common letter will share a node with the least common, meaning the node
1420 will not be compressable. With a middle is most common approach the worst
1421 case is when we have the least common nodes twice.
1422
1423 */
1424
a3621e74 1425 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1426 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1427 const U8 *uc = (U8*)STRING( noper );
a28509cc 1428 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1429 STRLEN foldlen = 0;
1430 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1431 const U8 *scan = (U8*)NULL;
07be1b83 1432 U32 wordlen = 0; /* required init */
02daf0ab
YO
1433 STRLEN chars = 0;
1434 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1435
3dab1dad
YO
1436 if (OP(noper) == NOTHING) {
1437 trie->minlen= 0;
1438 continue;
1439 }
02daf0ab
YO
1440 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1441 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1442 regardless of encoding */
1443
a3621e74 1444 for ( ; uc < e ; uc += len ) {
3dab1dad 1445 TRIE_CHARCOUNT(trie)++;
a3621e74 1446 TRIE_READ_CHAR;
3dab1dad 1447 chars++;
a3621e74
YO
1448 if ( uvc < 256 ) {
1449 if ( !trie->charmap[ uvc ] ) {
1450 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1451 if ( folder )
1452 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1453 TRIE_STORE_REVCHAR;
a3621e74 1454 }
02daf0ab
YO
1455 if ( set_bit ) {
1456 /* store the codepoint in the bitmap, and if its ascii
1457 also store its folded equivelent. */
1458 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1459
1460 /* store the folded codepoint */
1461 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1462
1463 if ( !UTF ) {
1464 /* store first byte of utf8 representation of
1465 codepoints in the 127 < uvc < 256 range */
1466 if (127 < uvc && uvc < 192) {
1467 TRIE_BITMAP_SET(trie,194);
1468 } else if (191 < uvc ) {
1469 TRIE_BITMAP_SET(trie,195);
1470 /* && uvc < 256 -- we know uvc is < 256 already */
1471 }
1472 }
02daf0ab
YO
1473 set_bit = 0; /* We've done our bit :-) */
1474 }
a3621e74
YO
1475 } else {
1476 SV** svpp;
55eed653
NC
1477 if ( !widecharmap )
1478 widecharmap = newHV();
a3621e74 1479
55eed653 1480 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1481
1482 if ( !svpp )
e4584336 1483 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1484
1485 if ( !SvTRUE( *svpp ) ) {
1486 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1487 TRIE_STORE_REVCHAR;
a3621e74
YO
1488 }
1489 }
1490 }
3dab1dad
YO
1491 if( cur == first ) {
1492 trie->minlen=chars;
1493 trie->maxlen=chars;
1494 } else if (chars < trie->minlen) {
1495 trie->minlen=chars;
1496 } else if (chars > trie->maxlen) {
1497 trie->maxlen=chars;
1498 }
1499
a3621e74
YO
1500 } /* end first pass */
1501 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1502 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1503 (int)depth * 2 + 2,"",
55eed653 1504 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1505 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1506 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1507 );
a3621e74
YO
1508
1509 /*
1510 We now know what we are dealing with in terms of unique chars and
1511 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1512 representation using a flat table will take. If it's over a reasonable
1513 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1514 conservative but potentially much slower representation using an array
1515 of lists.
1516
1517 At the end we convert both representations into the same compressed
1518 form that will be used in regexec.c for matching with. The latter
1519 is a form that cannot be used to construct with but has memory
1520 properties similar to the list form and access properties similar
1521 to the table form making it both suitable for fast searches and
1522 small enough that its feasable to store for the duration of a program.
1523
1524 See the comment in the code where the compressed table is produced
1525 inplace from the flat tabe representation for an explanation of how
1526 the compression works.
1527
1528 */
1529
1530
2e64971a
DM
1531 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1532 prev_states[1] = 0;
1533
3dab1dad 1534 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1535 /*
1536 Second Pass -- Array Of Lists Representation
1537
1538 Each state will be represented by a list of charid:state records
1539 (reg_trie_trans_le) the first such element holds the CUR and LEN
1540 points of the allocated array. (See defines above).
1541
1542 We build the initial structure using the lists, and then convert
1543 it into the compressed table form which allows faster lookups
1544 (but cant be modified once converted).
a3621e74
YO
1545 */
1546
a3621e74
YO
1547 STRLEN transcount = 1;
1548
1e2e3d02
YO
1549 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1550 "%*sCompiling trie using list compiler\n",
1551 (int)depth * 2 + 2, ""));
446bd890 1552
c944940b
JH
1553 trie->states = (reg_trie_state *)
1554 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1555 sizeof(reg_trie_state) );
a3621e74
YO
1556 TRIE_LIST_NEW(1);
1557 next_alloc = 2;
1558
1559 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560
c445ea15
AL
1561 regnode * const noper = NEXTOPER( cur );
1562 U8 *uc = (U8*)STRING( noper );
1563 const U8 * const e = uc + STR_LEN( noper );
1564 U32 state = 1; /* required init */
1565 U16 charid = 0; /* sanity init */
1566 U8 *scan = (U8*)NULL; /* sanity init */
1567 STRLEN foldlen = 0; /* required init */
07be1b83 1568 U32 wordlen = 0; /* required init */
c445ea15
AL
1569 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1570
3dab1dad 1571 if (OP(noper) != NOTHING) {
786e8c11 1572 for ( ; uc < e ; uc += len ) {
c445ea15 1573
786e8c11 1574 TRIE_READ_CHAR;
c445ea15 1575
786e8c11
YO
1576 if ( uvc < 256 ) {
1577 charid = trie->charmap[ uvc ];
c445ea15 1578 } else {
55eed653 1579 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1580 if ( !svpp ) {
1581 charid = 0;
1582 } else {
1583 charid=(U16)SvIV( *svpp );
1584 }
c445ea15 1585 }
786e8c11
YO
1586 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1587 if ( charid ) {
a3621e74 1588
786e8c11
YO
1589 U16 check;
1590 U32 newstate = 0;
a3621e74 1591
786e8c11
YO
1592 charid--;
1593 if ( !trie->states[ state ].trans.list ) {
1594 TRIE_LIST_NEW( state );
c445ea15 1595 }
786e8c11
YO
1596 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1597 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1598 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1599 break;
1600 }
1601 }
1602 if ( ! newstate ) {
1603 newstate = next_alloc++;
2e64971a 1604 prev_states[newstate] = state;
786e8c11
YO
1605 TRIE_LIST_PUSH( state, charid, newstate );
1606 transcount++;
1607 }
1608 state = newstate;
1609 } else {
1610 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1611 }
a28509cc 1612 }
c445ea15 1613 }
3dab1dad 1614 TRIE_HANDLE_WORD(state);
a3621e74
YO
1615
1616 } /* end second pass */
1617
1e2e3d02
YO
1618 /* next alloc is the NEXT state to be allocated */
1619 trie->statecount = next_alloc;
c944940b
JH
1620 trie->states = (reg_trie_state *)
1621 PerlMemShared_realloc( trie->states,
1622 next_alloc
1623 * sizeof(reg_trie_state) );
a3621e74 1624
3dab1dad 1625 /* and now dump it out before we compress it */
2b8b4781
NC
1626 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1627 revcharmap, next_alloc,
1628 depth+1)
1e2e3d02 1629 );
a3621e74 1630
c944940b
JH
1631 trie->trans = (reg_trie_trans *)
1632 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1633 {
1634 U32 state;
a3621e74
YO
1635 U32 tp = 0;
1636 U32 zp = 0;
1637
1638
1639 for( state=1 ; state < next_alloc ; state ++ ) {
1640 U32 base=0;
1641
1642 /*
1643 DEBUG_TRIE_COMPILE_MORE_r(
1644 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1645 );
1646 */
1647
1648 if (trie->states[state].trans.list) {
1649 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1650 U16 maxid=minid;
a28509cc 1651 U16 idx;
a3621e74
YO
1652
1653 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1654 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1655 if ( forid < minid ) {
1656 minid=forid;
1657 } else if ( forid > maxid ) {
1658 maxid=forid;
1659 }
a3621e74
YO
1660 }
1661 if ( transcount < tp + maxid - minid + 1) {
1662 transcount *= 2;
c944940b
JH
1663 trie->trans = (reg_trie_trans *)
1664 PerlMemShared_realloc( trie->trans,
446bd890
NC
1665 transcount
1666 * sizeof(reg_trie_trans) );
a3621e74
YO
1667 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1668 }
1669 base = trie->uniquecharcount + tp - minid;
1670 if ( maxid == minid ) {
1671 U32 set = 0;
1672 for ( ; zp < tp ; zp++ ) {
1673 if ( ! trie->trans[ zp ].next ) {
1674 base = trie->uniquecharcount + zp - minid;
1675 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1676 trie->trans[ zp ].check = state;
1677 set = 1;
1678 break;
1679 }
1680 }
1681 if ( !set ) {
1682 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1683 trie->trans[ tp ].check = state;
1684 tp++;
1685 zp = tp;
1686 }
1687 } else {
1688 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1689 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1690 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1691 trie->trans[ tid ].check = state;
1692 }
1693 tp += ( maxid - minid + 1 );
1694 }
1695 Safefree(trie->states[ state ].trans.list);
1696 }
1697 /*
1698 DEBUG_TRIE_COMPILE_MORE_r(
1699 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1700 );
1701 */
1702 trie->states[ state ].trans.base=base;
1703 }
cc601c31 1704 trie->lasttrans = tp + 1;
a3621e74
YO
1705 }
1706 } else {
1707 /*
1708 Second Pass -- Flat Table Representation.
1709
1710 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1711 We know that we will need Charcount+1 trans at most to store the data
1712 (one row per char at worst case) So we preallocate both structures
1713 assuming worst case.
1714
1715 We then construct the trie using only the .next slots of the entry
1716 structs.
1717
1718 We use the .check field of the first entry of the node temporarily to
1719 make compression both faster and easier by keeping track of how many non
1720 zero fields are in the node.
1721
1722 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1723 transition.
1724
1725 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1726 number representing the first entry of the node, and state as a
1727 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1728 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1729 are 2 entrys per node. eg:
1730
1731 A B A B
1732 1. 2 4 1. 3 7
1733 2. 0 3 3. 0 5
1734 3. 0 0 5. 0 0
1735 4. 0 0 7. 0 0
1736
1737 The table is internally in the right hand, idx form. However as we also
1738 have to deal with the states array which is indexed by nodenum we have to
1739 use TRIE_NODENUM() to convert.
1740
1741 */
1e2e3d02
YO
1742 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1743 "%*sCompiling trie using table compiler\n",
1744 (int)depth * 2 + 2, ""));
3dab1dad 1745
c944940b
JH
1746 trie->trans = (reg_trie_trans *)
1747 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1748 * trie->uniquecharcount + 1,
1749 sizeof(reg_trie_trans) );
1750 trie->states = (reg_trie_state *)
1751 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1752 sizeof(reg_trie_state) );
a3621e74
YO
1753 next_alloc = trie->uniquecharcount + 1;
1754
3dab1dad 1755
a3621e74
YO
1756 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1757
c445ea15 1758 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1759 const U8 *uc = (U8*)STRING( noper );
1760 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1761
1762 U32 state = 1; /* required init */
1763
1764 U16 charid = 0; /* sanity init */
1765 U32 accept_state = 0; /* sanity init */
1766 U8 *scan = (U8*)NULL; /* sanity init */
1767
1768 STRLEN foldlen = 0; /* required init */
07be1b83 1769 U32 wordlen = 0; /* required init */
a3621e74
YO
1770 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1771
3dab1dad 1772 if ( OP(noper) != NOTHING ) {
786e8c11 1773 for ( ; uc < e ; uc += len ) {
a3621e74 1774
786e8c11 1775 TRIE_READ_CHAR;
a3621e74 1776
786e8c11
YO
1777 if ( uvc < 256 ) {
1778 charid = trie->charmap[ uvc ];
1779 } else {
55eed653 1780 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1781 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1782 }
786e8c11
YO
1783 if ( charid ) {
1784 charid--;
1785 if ( !trie->trans[ state + charid ].next ) {
1786 trie->trans[ state + charid ].next = next_alloc;
1787 trie->trans[ state ].check++;
2e64971a
DM
1788 prev_states[TRIE_NODENUM(next_alloc)]
1789 = TRIE_NODENUM(state);
786e8c11
YO
1790 next_alloc += trie->uniquecharcount;
1791 }
1792 state = trie->trans[ state + charid ].next;
1793 } else {
1794 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1795 }
1796 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1797 }
a3621e74 1798 }
3dab1dad
YO
1799 accept_state = TRIE_NODENUM( state );
1800 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1801
1802 } /* end second pass */
1803
3dab1dad 1804 /* and now dump it out before we compress it */
2b8b4781
NC
1805 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1806 revcharmap,
1807 next_alloc, depth+1));
a3621e74 1808
a3621e74
YO
1809 {
1810 /*
1811 * Inplace compress the table.*
1812
1813 For sparse data sets the table constructed by the trie algorithm will
1814 be mostly 0/FAIL transitions or to put it another way mostly empty.
1815 (Note that leaf nodes will not contain any transitions.)
1816
1817 This algorithm compresses the tables by eliminating most such
1818 transitions, at the cost of a modest bit of extra work during lookup:
1819
1820 - Each states[] entry contains a .base field which indicates the
1821 index in the state[] array wheres its transition data is stored.
1822
1823 - If .base is 0 there are no valid transitions from that node.
1824
1825 - If .base is nonzero then charid is added to it to find an entry in
1826 the trans array.
1827
1828 -If trans[states[state].base+charid].check!=state then the
1829 transition is taken to be a 0/Fail transition. Thus if there are fail
1830 transitions at the front of the node then the .base offset will point
1831 somewhere inside the previous nodes data (or maybe even into a node
1832 even earlier), but the .check field determines if the transition is
1833 valid.
1834
786e8c11 1835 XXX - wrong maybe?
a3621e74
YO
1836 The following process inplace converts the table to the compressed
1837 table: We first do not compress the root node 1,and mark its all its
1838 .check pointers as 1 and set its .base pointer as 1 as well. This
1839 allows to do a DFA construction from the compressed table later, and
1840 ensures that any .base pointers we calculate later are greater than
1841 0.
1842
1843 - We set 'pos' to indicate the first entry of the second node.
1844
1845 - We then iterate over the columns of the node, finding the first and
1846 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1847 and set the .check pointers accordingly, and advance pos
1848 appropriately and repreat for the next node. Note that when we copy
1849 the next pointers we have to convert them from the original
1850 NODEIDX form to NODENUM form as the former is not valid post
1851 compression.
1852
1853 - If a node has no transitions used we mark its base as 0 and do not
1854 advance the pos pointer.
1855
1856 - If a node only has one transition we use a second pointer into the
1857 structure to fill in allocated fail transitions from other states.
1858 This pointer is independent of the main pointer and scans forward
1859 looking for null transitions that are allocated to a state. When it
1860 finds one it writes the single transition into the "hole". If the
786e8c11 1861 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1862
1863 - Once compressed we can Renew/realloc the structures to release the
1864 excess space.
1865
1866 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1867 specifically Fig 3.47 and the associated pseudocode.
1868
1869 demq
1870 */
a3b680e6 1871 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1872 U32 state, charid;
a3621e74 1873 U32 pos = 0, zp=0;
1e2e3d02 1874 trie->statecount = laststate;
a3621e74
YO
1875
1876 for ( state = 1 ; state < laststate ; state++ ) {
1877 U8 flag = 0;
a28509cc
AL
1878 const U32 stateidx = TRIE_NODEIDX( state );
1879 const U32 o_used = trie->trans[ stateidx ].check;
1880 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1881 trie->trans[ stateidx ].check = 0;
1882
1883 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1884 if ( flag || trie->trans[ stateidx + charid ].next ) {
1885 if ( trie->trans[ stateidx + charid ].next ) {
1886 if (o_used == 1) {
1887 for ( ; zp < pos ; zp++ ) {
1888 if ( ! trie->trans[ zp ].next ) {
1889 break;
1890 }
1891 }
1892 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1893 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1894 trie->trans[ zp ].check = state;
1895 if ( ++zp > pos ) pos = zp;
1896 break;
1897 }
1898 used--;
1899 }
1900 if ( !flag ) {
1901 flag = 1;
1902 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1903 }
1904 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ pos ].check = state;
1906 pos++;
1907 }
1908 }
1909 }
cc601c31 1910 trie->lasttrans = pos + 1;
c944940b
JH
1911 trie->states = (reg_trie_state *)
1912 PerlMemShared_realloc( trie->states, laststate
1913 * sizeof(reg_trie_state) );
a3621e74 1914 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1915 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1916 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1917 (int)depth * 2 + 2,"",
1918 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1919 (IV)next_alloc,
1920 (IV)pos,
a3621e74
YO
1921 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1922 );
1923
1924 } /* end table compress */
1925 }
1e2e3d02
YO
1926 DEBUG_TRIE_COMPILE_MORE_r(
1927 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1928 (int)depth * 2 + 2, "",
1929 (UV)trie->statecount,
1930 (UV)trie->lasttrans)
1931 );
cc601c31 1932 /* resize the trans array to remove unused space */
c944940b
JH
1933 trie->trans = (reg_trie_trans *)
1934 PerlMemShared_realloc( trie->trans, trie->lasttrans
1935 * sizeof(reg_trie_trans) );
a3621e74 1936
3dab1dad 1937 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1938 U8 nodetype =(U8)(flags & 0xFF);
1939 char *str=NULL;
786e8c11 1940
07be1b83 1941#ifdef DEBUGGING
e62cc96a 1942 regnode *optimize = NULL;
7122b237
YO
1943#ifdef RE_TRACK_PATTERN_OFFSETS
1944
b57a0404
JH
1945 U32 mjd_offset = 0;
1946 U32 mjd_nodelen = 0;
7122b237
YO
1947#endif /* RE_TRACK_PATTERN_OFFSETS */
1948#endif /* DEBUGGING */
a3621e74 1949 /*
3dab1dad
YO
1950 This means we convert either the first branch or the first Exact,
1951 depending on whether the thing following (in 'last') is a branch
1952 or not and whther first is the startbranch (ie is it a sub part of
1953 the alternation or is it the whole thing.)
1954 Assuming its a sub part we conver the EXACT otherwise we convert
1955 the whole branch sequence, including the first.
a3621e74 1956 */
3dab1dad 1957 /* Find the node we are going to overwrite */
7f69552c 1958 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1959 /* branch sub-chain */
3dab1dad 1960 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1961#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1962 DEBUG_r({
1963 mjd_offset= Node_Offset((convert));
1964 mjd_nodelen= Node_Length((convert));
1965 });
7122b237 1966#endif
7f69552c 1967 /* whole branch chain */
7122b237
YO
1968 }
1969#ifdef RE_TRACK_PATTERN_OFFSETS
1970 else {
7f69552c
YO
1971 DEBUG_r({
1972 const regnode *nop = NEXTOPER( convert );
1973 mjd_offset= Node_Offset((nop));
1974 mjd_nodelen= Node_Length((nop));
1975 });
07be1b83
YO
1976 }
1977 DEBUG_OPTIMISE_r(
1978 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1979 (int)depth * 2 + 2, "",
786e8c11 1980 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1981 );
7122b237 1982#endif
3dab1dad
YO
1983 /* But first we check to see if there is a common prefix we can
1984 split out as an EXACT and put in front of the TRIE node. */
1985 trie->startstate= 1;
55eed653 1986 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1987 U32 state;
1e2e3d02 1988 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1989 U32 ofs = 0;
8e11feef
RGS
1990 I32 idx = -1;
1991 U32 count = 0;
1992 const U32 base = trie->states[ state ].trans.base;
a3621e74 1993
3dab1dad 1994 if ( trie->states[state].wordnum )
8e11feef 1995 count = 1;
a3621e74 1996
8e11feef 1997 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1998 if ( ( base + ofs >= trie->uniquecharcount ) &&
1999 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2000 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2001 {
3dab1dad 2002 if ( ++count > 1 ) {
2b8b4781 2003 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2004 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2005 if ( state == 1 ) break;
3dab1dad
YO
2006 if ( count == 2 ) {
2007 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2008 DEBUG_OPTIMISE_r(
8e11feef
RGS
2009 PerlIO_printf(Perl_debug_log,
2010 "%*sNew Start State=%"UVuf" Class: [",
2011 (int)depth * 2 + 2, "",
786e8c11 2012 (UV)state));
be8e71aa 2013 if (idx >= 0) {
2b8b4781 2014 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2015 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2016
3dab1dad 2017 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2018 if ( folder )
2019 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2020 DEBUG_OPTIMISE_r(
f1f66076 2021 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2022 );
8e11feef
RGS
2023 }
2024 }
2025 TRIE_BITMAP_SET(trie,*ch);
2026 if ( folder )
2027 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2028 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2029 }
2030 idx = ofs;
2031 }
3dab1dad
YO
2032 }
2033 if ( count == 1 ) {
2b8b4781 2034 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2035 STRLEN len;
2036 char *ch = SvPV( *tmp, len );
de734bd5
A
2037 DEBUG_OPTIMISE_r({
2038 SV *sv=sv_newmortal();
8e11feef
RGS
2039 PerlIO_printf( Perl_debug_log,
2040 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2041 (int)depth * 2 + 2, "",
de734bd5
A
2042 (UV)state, (UV)idx,
2043 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2044 PL_colors[0], PL_colors[1],
2045 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2046 PERL_PV_ESCAPE_FIRSTCHAR
2047 )
2048 );
2049 });
3dab1dad
YO
2050 if ( state==1 ) {
2051 OP( convert ) = nodetype;
2052 str=STRING(convert);
2053 STR_LEN(convert)=0;
2054 }
c490c714
YO
2055 STR_LEN(convert) += len;
2056 while (len--)
de734bd5 2057 *str++ = *ch++;
8e11feef 2058 } else {
f9049ba1 2059#ifdef DEBUGGING
8e11feef
RGS
2060 if (state>1)
2061 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2062#endif
8e11feef
RGS
2063 break;
2064 }
2065 }
2e64971a 2066 trie->prefixlen = (state-1);
3dab1dad 2067 if (str) {
8e11feef 2068 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2069 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2070 trie->startstate = state;
07be1b83
YO
2071 trie->minlen -= (state - 1);
2072 trie->maxlen -= (state - 1);
33809eae
JH
2073#ifdef DEBUGGING
2074 /* At least the UNICOS C compiler choked on this
2075 * being argument to DEBUG_r(), so let's just have
2076 * it right here. */
2077 if (
2078#ifdef PERL_EXT_RE_BUILD
2079 1
2080#else
2081 DEBUG_r_TEST
2082#endif
2083 ) {
2084 regnode *fix = convert;
2085 U32 word = trie->wordcount;
2086 mjd_nodelen++;
2087 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2088 while( ++fix < n ) {
2089 Set_Node_Offset_Length(fix, 0, 0);
2090 }
2091 while (word--) {
2092 SV ** const tmp = av_fetch( trie_words, word, 0 );
2093 if (tmp) {
2094 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2095 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2096 else
2097 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2098 }
2099 }
2100 }
2101#endif
8e11feef
RGS
2102 if (trie->maxlen) {
2103 convert = n;
2104 } else {
3dab1dad 2105 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2106 DEBUG_r(optimize= n);
3dab1dad
YO
2107 }
2108 }
2109 }
a5ca303d
YO
2110 if (!jumper)
2111 jumper = last;
3dab1dad 2112 if ( trie->maxlen ) {
8e11feef
RGS
2113 NEXT_OFF( convert ) = (U16)(tail - convert);
2114 ARG_SET( convert, data_slot );
786e8c11
YO
2115 /* Store the offset to the first unabsorbed branch in
2116 jump[0], which is otherwise unused by the jump logic.
2117 We use this when dumping a trie and during optimisation. */
2118 if (trie->jump)
7f69552c 2119 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2120
786e8c11
YO
2121 /* XXXX */
2122 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2123 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2124 {
2125 OP( convert ) = TRIEC;
2126 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2127 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2128 trie->bitmap= NULL;
2129 } else
2130 OP( convert ) = TRIE;
a3621e74 2131
3dab1dad
YO
2132 /* store the type in the flags */
2133 convert->flags = nodetype;
a5ca303d
YO
2134 DEBUG_r({
2135 optimize = convert
2136 + NODE_STEP_REGNODE
2137 + regarglen[ OP( convert ) ];
2138 });
2139 /* XXX We really should free up the resource in trie now,
2140 as we won't use them - (which resources?) dmq */
3dab1dad 2141 }
a3621e74 2142 /* needed for dumping*/
e62cc96a 2143 DEBUG_r(if (optimize) {
07be1b83 2144 regnode *opt = convert;
bcdf7404 2145
e62cc96a 2146 while ( ++opt < optimize) {
07be1b83
YO
2147 Set_Node_Offset_Length(opt,0,0);
2148 }
786e8c11
YO
2149 /*
2150 Try to clean up some of the debris left after the
2151 optimisation.
a3621e74 2152 */
786e8c11 2153 while( optimize < jumper ) {
07be1b83 2154 mjd_nodelen += Node_Length((optimize));
a3621e74 2155 OP( optimize ) = OPTIMIZED;
07be1b83 2156 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2157 optimize++;
2158 }
07be1b83 2159 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2160 });
2161 } /* end node insert */
2e64971a
DM
2162
2163 /* Finish populating the prev field of the wordinfo array. Walk back
2164 * from each accept state until we find another accept state, and if
2165 * so, point the first word's .prev field at the second word. If the
2166 * second already has a .prev field set, stop now. This will be the
2167 * case either if we've already processed that word's accept state,
2168 * or that that state had multiple words, and the overspill words
2169 * were already linked up earlier.
2170 */
2171 {
2172 U16 word;
2173 U32 state;
2174 U16 prev;
2175
2176 for (word=1; word <= trie->wordcount; word++) {
2177 prev = 0;
2178 if (trie->wordinfo[word].prev)
2179 continue;
2180 state = trie->wordinfo[word].accept;
2181 while (state) {
2182 state = prev_states[state];
2183 if (!state)
2184 break;
2185 prev = trie->states[state].wordnum;
2186 if (prev)
2187 break;
2188 }
2189 trie->wordinfo[word].prev = prev;
2190 }
2191 Safefree(prev_states);
2192 }
2193
2194
2195 /* and now dump out the compressed format */
2196 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2197
55eed653 2198 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2199#ifdef DEBUGGING
2200 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2201 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2202#else
2203 SvREFCNT_dec(revcharmap);
07be1b83 2204#endif
786e8c11
YO
2205 return trie->jump
2206 ? MADE_JUMP_TRIE
2207 : trie->startstate>1
2208 ? MADE_EXACT_TRIE
2209 : MADE_TRIE;
2210}
2211
2212STATIC void
2213S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2214{
2215/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2216
2217 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2218 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2219 ISBN 0-201-10088-6
2220
2221 We find the fail state for each state in the trie, this state is the longest proper
2222 suffix of the current states 'word' that is also a proper prefix of another word in our
2223 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2224 the DFA not to have to restart after its tried and failed a word at a given point, it
2225 simply continues as though it had been matching the other word in the first place.
2226 Consider
2227 'abcdgu'=~/abcdefg|cdgu/
2228 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2229 fail, which would bring use to the state representing 'd' in the second word where we would
2230 try 'g' and succeed, prodceding to match 'cdgu'.
2231 */
2232 /* add a fail transition */
3251b653
NC
2233 const U32 trie_offset = ARG(source);
2234 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2235 U32 *q;
2236 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2237 const U32 numstates = trie->statecount;
786e8c11
YO
2238 const U32 ubound = trie->lasttrans + ucharcount;
2239 U32 q_read = 0;
2240 U32 q_write = 0;
2241 U32 charid;
2242 U32 base = trie->states[ 1 ].trans.base;
2243 U32 *fail;
2244 reg_ac_data *aho;
2245 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2246 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2247
2248 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2249#ifndef DEBUGGING
2250 PERL_UNUSED_ARG(depth);
2251#endif
2252
2253
2254 ARG_SET( stclass, data_slot );
c944940b 2255 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2256 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2257 aho->trie=trie_offset;
446bd890
NC
2258 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2259 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2260 Newxz( q, numstates, U32);
c944940b 2261 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2262 aho->refcount = 1;
2263 fail = aho->fail;
2264 /* initialize fail[0..1] to be 1 so that we always have
2265 a valid final fail state */
2266 fail[ 0 ] = fail[ 1 ] = 1;
2267
2268 for ( charid = 0; charid < ucharcount ; charid++ ) {
2269 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2270 if ( newstate ) {
2271 q[ q_write ] = newstate;
2272 /* set to point at the root */
2273 fail[ q[ q_write++ ] ]=1;
2274 }
2275 }
2276 while ( q_read < q_write) {
2277 const U32 cur = q[ q_read++ % numstates ];
2278 base = trie->states[ cur ].trans.base;
2279
2280 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2281 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2282 if (ch_state) {
2283 U32 fail_state = cur;
2284 U32 fail_base;
2285 do {
2286 fail_state = fail[ fail_state ];
2287 fail_base = aho->states[ fail_state ].trans.base;
2288 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2289
2290 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2291 fail[ ch_state ] = fail_state;
2292 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2293 {
2294 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2295 }
2296 q[ q_write++ % numstates] = ch_state;
2297 }
2298 }
2299 }
2300 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2301 when we fail in state 1, this allows us to use the
2302 charclass scan to find a valid start char. This is based on the principle
2303 that theres a good chance the string being searched contains lots of stuff
2304 that cant be a start char.
2305 */
2306 fail[ 0 ] = fail[ 1 ] = 0;
2307 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2308 PerlIO_printf(Perl_debug_log,
2309 "%*sStclass Failtable (%"UVuf" states): 0",
2310 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2311 );
786e8c11
YO
2312 for( q_read=1; q_read<numstates; q_read++ ) {
2313 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2314 }
2315 PerlIO_printf(Perl_debug_log, "\n");
2316 });
2317 Safefree(q);
2318 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2319}
2320
786e8c11 2321
a3621e74 2322/*
5d1c421c
JH
2323 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2324 * These need to be revisited when a newer toolchain becomes available.
2325 */
2326#if defined(__sparc64__) && defined(__GNUC__)
2327# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2328# undef SPARC64_GCC_WORKAROUND
2329# define SPARC64_GCC_WORKAROUND 1
2330# endif
2331#endif
2332
07be1b83 2333#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2334 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2335 SV * const mysv=sv_newmortal(); \
2336 regnode *Next = regnext(scan); \
2337 regprop(RExC_rx, mysv, scan); \
7f69552c 2338 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2339 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2340 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2341 }});
07be1b83 2342
1de06328
YO
2343
2344
2345
2346
07be1b83
YO
2347#define JOIN_EXACT(scan,min,flags) \
2348 if (PL_regkind[OP(scan)] == EXACT) \
2349 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2350
be8e71aa 2351STATIC U32
07be1b83
YO
2352S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2353 /* Merge several consecutive EXACTish nodes into one. */
2354 regnode *n = regnext(scan);
2355 U32 stringok = 1;
2356 regnode *next = scan + NODE_SZ_STR(scan);
2357 U32 merged = 0;
2358 U32 stopnow = 0;
2359#ifdef DEBUGGING
2360 regnode *stop = scan;
72f13be8 2361 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2362#else
d47053eb
RGS
2363 PERL_UNUSED_ARG(depth);
2364#endif
7918f24d
NC
2365
2366 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2367#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2368 PERL_UNUSED_ARG(flags);
2369 PERL_UNUSED_ARG(val);
07be1b83 2370#endif
07be1b83
YO
2371 DEBUG_PEEP("join",scan,depth);
2372
2373 /* Skip NOTHING, merge EXACT*. */
2374 while (n &&
2375 ( PL_regkind[OP(n)] == NOTHING ||
2376 (stringok && (OP(n) == OP(scan))))
2377 && NEXT_OFF(n)
2378 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2379
2380 if (OP(n) == TAIL || n > next)
2381 stringok = 0;
2382 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2383 DEBUG_PEEP("skip:",n,depth);
2384 NEXT_OFF(scan) += NEXT_OFF(n);
2385 next = n + NODE_STEP_REGNODE;
2386#ifdef DEBUGGING
2387 if (stringok)
2388 stop = n;
2389#endif
2390 n = regnext(n);
2391 }
2392 else if (stringok) {
786e8c11 2393 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2394 regnode * const nnext = regnext(n);
2395
2396 DEBUG_PEEP("merg",n,depth);
2397
2398 merged++;
2399 if (oldl + STR_LEN(n) > U8_MAX)
2400 break;
2401 NEXT_OFF(scan) += NEXT_OFF(n);
2402 STR_LEN(scan) += STR_LEN(n);
2403 next = n + NODE_SZ_STR(n);
2404 /* Now we can overwrite *n : */
2405 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2406#ifdef DEBUGGING
2407 stop = next - 1;
2408#endif
2409 n = nnext;
2410 if (stopnow) break;
2411 }
2412
d47053eb
RGS
2413#ifdef EXPERIMENTAL_INPLACESCAN
2414 if (flags && !NEXT_OFF(n)) {
2415 DEBUG_PEEP("atch", val, depth);
2416 if (reg_off_by_arg[OP(n)]) {
2417 ARG_SET(n, val - n);
2418 }
2419 else {
2420 NEXT_OFF(n) = val - n;
2421 }
2422 stopnow = 1;
2423 }
07be1b83
YO
2424#endif
2425 }
2426
2427 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2428 /*
2429 Two problematic code points in Unicode casefolding of EXACT nodes:
2430
2431 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2432 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2433
2434 which casefold to
2435
2436 Unicode UTF-8
2437
2438 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2439 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2440
2441 This means that in case-insensitive matching (or "loose matching",
2442 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2443 length of the above casefolded versions) can match a target string
2444 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2445 This would rather mess up the minimum length computation.
2446
2447 What we'll do is to look for the tail four bytes, and then peek
2448 at the preceding two bytes to see whether we need to decrease
2449 the minimum length by four (six minus two).
2450
2451 Thanks to the design of UTF-8, there cannot be false matches:
2452 A sequence of valid UTF-8 bytes cannot be a subsequence of
2453 another valid sequence of UTF-8 bytes.
2454
2455 */
2456 char * const s0 = STRING(scan), *s, *t;
2457 char * const s1 = s0 + STR_LEN(scan) - 1;
2458 char * const s2 = s1 - 4;
e294cc5d
JH
2459#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2460 const char t0[] = "\xaf\x49\xaf\x42";
2461#else
07be1b83 2462 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2463#endif
07be1b83
YO
2464 const char * const t1 = t0 + 3;
2465
2466 for (s = s0 + 2;
2467 s < s2 && (t = ninstr(s, s1, t0, t1));
2468 s = t + 4) {
e294cc5d
JH
2469#ifdef EBCDIC
2470 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2471 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2472#else
07be1b83
YO
2473 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2474 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2475#endif
07be1b83
YO
2476 *min -= 4;
2477 }
2478 }
2479
2480#ifdef DEBUGGING
2481 /* Allow dumping */
2482 n = scan + NODE_SZ_STR(scan);
2483 while (n <= stop) {
2484 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2485 OP(n) = OPTIMIZED;
2486 NEXT_OFF(n) = 0;
2487 }
2488 n++;
2489 }
2490#endif
2491 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2492 return stopnow;
2493}
2494
653099ff
GS
2495/* REx optimizer. Converts nodes into quickier variants "in place".
2496 Finds fixed substrings. */
2497
a0288114 2498/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2499 to the position after last scanned or to NULL. */
2500
40d049e4
YO
2501#define INIT_AND_WITHP \
2502 assert(!and_withp); \
2503 Newx(and_withp,1,struct regnode_charclass_class); \
2504 SAVEFREEPV(and_withp)
07be1b83 2505
b515a41d
YO
2506/* this is a chain of data about sub patterns we are processing that
2507 need to be handled seperately/specially in study_chunk. Its so
2508 we can simulate recursion without losing state. */
2509struct scan_frame;
2510typedef struct scan_frame {
2511 regnode *last; /* last node to process in this frame */
2512 regnode *next; /* next node to process when last is reached */
2513 struct scan_frame *prev; /*previous frame*/
2514 I32 stop; /* what stopparen do we use */
2515} scan_frame;
2516
304ee84b
YO
2517
2518#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2519
e1d1eefb
YO
2520#define CASE_SYNST_FNC(nAmE) \
2521case nAmE: \
2522 if (flags & SCF_DO_STCLASS_AND) { \
2523 for (value = 0; value < 256; value++) \
2524 if (!is_ ## nAmE ## _cp(value)) \
2525 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2526 } \
2527 else { \
2528 for (value = 0; value < 256; value++) \
2529 if (is_ ## nAmE ## _cp(value)) \
2530 ANYOF_BITMAP_SET(data->start_class, value); \
2531 } \
2532 break; \
2533case N ## nAmE: \
2534 if (flags & SCF_DO_STCLASS_AND) { \
2535 for (value = 0; value < 256; value++) \
2536 if (is_ ## nAmE ## _cp(value)) \
2537 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2538 } \
2539 else { \
2540 for (value = 0; value < 256; value++) \
2541 if (!is_ ## nAmE ## _cp(value)) \
2542 ANYOF_BITMAP_SET(data->start_class, value); \
2543 } \
2544 break
2545
2546
2547
76e3520e 2548STATIC I32
40d049e4 2549S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2550 I32 *minlenp, I32 *deltap,
40d049e4
YO
2551 regnode *last,
2552 scan_data_t *data,
2553 I32 stopparen,
2554 U8* recursed,
2555 struct regnode_charclass_class *and_withp,
2556 U32 flags, U32 depth)
c277df42
IZ
2557 /* scanp: Start here (read-write). */
2558 /* deltap: Write maxlen-minlen here. */
2559 /* last: Stop before this one. */
40d049e4
YO
2560 /* data: string data about the pattern */
2561 /* stopparen: treat close N as END */
2562 /* recursed: which subroutines have we recursed into */
2563 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2564{
97aff369 2565 dVAR;
c277df42
IZ
2566 I32 min = 0, pars = 0, code;
2567 regnode *scan = *scanp, *next;
2568 I32 delta = 0;
2569 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2570 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2571 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2572 scan_data_t data_fake;
a3621e74 2573 SV *re_trie_maxbuff = NULL;
786e8c11 2574 regnode *first_non_open = scan;
e2e6a0f1 2575 I32 stopmin = I32_MAX;
8aa23a47 2576 scan_frame *frame = NULL;
a3621e74 2577 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2578
7918f24d
NC
2579 PERL_ARGS_ASSERT_STUDY_CHUNK;
2580
13a24bad 2581#ifdef DEBUGGING
40d049e4 2582 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2583#endif
40d049e4 2584
786e8c11 2585 if ( depth == 0 ) {
40d049e4 2586 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2587 first_non_open=regnext(first_non_open);
2588 }
2589
b81d288d 2590
8aa23a47
YO
2591 fake_study_recurse:
2592 while ( scan && OP(scan) != END && scan < last ){
2593 /* Peephole optimizer: */
304ee84b 2594 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2595 DEBUG_PEEP("Peep",scan,depth);
2596 JOIN_EXACT(scan,&min,0);
2597
2598 /* Follow the next-chain of the current node and optimize
2599 away all the NOTHINGs from it. */
2600 if (OP(scan) != CURLYX) {
2601 const int max = (reg_off_by_arg[OP(scan)]
2602 ? I32_MAX
2603 /* I32 may be smaller than U16 on CRAYs! */
2604 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2605 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2606 int noff;
2607 regnode *n = scan;
2608
2609 /* Skip NOTHING and LONGJMP. */
2610 while ((n = regnext(n))
2611 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2612 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2613 && off + noff < max)
2614 off += noff;
2615 if (reg_off_by_arg[OP(scan)])
2616 ARG(scan) = off;
2617 else
2618 NEXT_OFF(scan) = off;
2619 }
a3621e74 2620
c277df42 2621
8aa23a47
YO
2622
2623 /* The principal pseudo-switch. Cannot be a switch, since we
2624 look into several different things. */
2625 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2626 || OP(scan) == IFTHEN) {
2627 next = regnext(scan);
2628 code = OP(scan);
2629 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2630
2631 if (OP(next) == code || code == IFTHEN) {
2632 /* NOTE - There is similar code to this block below for handling
2633 TRIE nodes on a re-study. If you change stuff here check there
2634 too. */
2635 I32 max1 = 0, min1 = I32_MAX, num = 0;
2636 struct regnode_charclass_class accum;
2637 regnode * const startbranch=scan;
2638
2639 if (flags & SCF_DO_SUBSTR)
304ee84b 2640 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2641 if (flags & SCF_DO_STCLASS)
2642 cl_init_zero(pRExC_state, &accum);
2643
2644 while (OP(scan) == code) {
2645 I32 deltanext, minnext, f = 0, fake;
2646 struct regnode_charclass_class this_class;
2647
2648 num++;
2649 data_fake.flags = 0;
2650 if (data) {
2651 data_fake.whilem_c = data->whilem_c;
2652 data_fake.last_closep = data->last_closep;
2653 }
2654 else
2655 data_fake.last_closep = &fake;
58e23c8d
YO
2656
2657 data_fake.pos_delta = delta;
8aa23a47
YO
2658 next = regnext(scan);
2659 scan = NEXTOPER(scan);
2660 if (code != BRANCH)
c277df42 2661 scan = NEXTOPER(scan);
8aa23a47
YO
2662 if (flags & SCF_DO_STCLASS) {
2663 cl_init(pRExC_state, &this_class);
2664 data_fake.start_class = &this_class;
2665 f = SCF_DO_STCLASS_AND;
58e23c8d 2666 }
8aa23a47
YO
2667 if (flags & SCF_WHILEM_VISITED_POS)
2668 f |= SCF_WHILEM_VISITED_POS;
2669
2670 /* we suppose the run is continuous, last=next...*/
2671 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2672 next, &data_fake,
2673 stopparen, recursed, NULL, f,depth+1);
2674 if (min1 > minnext)
2675 min1 = minnext;
2676 if (max1 < minnext + deltanext)
2677 max1 = minnext + deltanext;
2678 if (deltanext == I32_MAX)
2679 is_inf = is_inf_internal = 1;
2680 scan = next;
2681 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2682 pars++;
2683 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2684 if ( stopmin > minnext)
2685 stopmin = min + min1;
2686 flags &= ~SCF_DO_SUBSTR;
2687 if (data)
2688 data->flags |= SCF_SEEN_ACCEPT;
2689 }
2690 if (data) {
2691 if (data_fake.flags & SF_HAS_EVAL)
2692 data->flags |= SF_HAS_EVAL;
2693 data->whilem_c = data_fake.whilem_c;
3dab1dad 2694 }
8aa23a47
YO
2695 if (flags & SCF_DO_STCLASS)
2696 cl_or(pRExC_state, &accum, &this_class);
2697 }
2698 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2699 min1 = 0;
2700 if (flags & SCF_DO_SUBSTR) {
2701 data->pos_min += min1;
2702 data->pos_delta += max1 - min1;
2703 if (max1 != min1 || is_inf)
2704 data->longest = &(data->longest_float);
2705 }
2706 min += min1;
2707 delta += max1 - min1;
2708 if (flags & SCF_DO_STCLASS_OR) {
2709 cl_or(pRExC_state, data->start_class, &accum);
2710 if (min1) {
2711 cl_and(data->start_class, and_withp);
2712 flags &= ~SCF_DO_STCLASS;
653099ff 2713 }
8aa23a47
YO
2714 }
2715 else if (flags & SCF_DO_STCLASS_AND) {
2716 if (min1) {
2717 cl_and(data->start_class, &accum);
2718 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2719 }
8aa23a47
YO
2720 else {
2721 /* Switch to OR mode: cache the old value of
2722 * data->start_class */
2723 INIT_AND_WITHP;
2724 StructCopy(data->start_class, and_withp,
2725 struct regnode_charclass_class);
2726 flags &= ~SCF_DO_STCLASS_AND;
2727 StructCopy(&accum, data->start_class,
2728 struct regnode_charclass_class);
2729 flags |= SCF_DO_STCLASS_OR;
2730 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2731 }
8aa23a47 2732 }
a3621e74 2733
8aa23a47
YO
2734 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2735 /* demq.
a3621e74 2736
8aa23a47
YO
2737 Assuming this was/is a branch we are dealing with: 'scan' now
2738 points at the item that follows the branch sequence, whatever
2739 it is. We now start at the beginning of the sequence and look
2740 for subsequences of
a3621e74 2741
8aa23a47
YO
2742 BRANCH->EXACT=>x1
2743 BRANCH->EXACT=>x2
2744 tail
a3621e74 2745
8aa23a47 2746 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2747
8aa23a47
YO
2748 If we can find such a subseqence we need to turn the first
2749 element into a trie and then add the subsequent branch exact
2750 strings to the trie.
a3621e74 2751
8aa23a47 2752 We have two cases
a3621e74 2753
8aa23a47 2754 1. patterns where the whole set of branch can be converted.
a3621e74 2755
8aa23a47 2756 2. patterns where only a subset can be converted.
a3621e74 2757
8aa23a47
YO
2758 In case 1 we can replace the whole set with a single regop
2759 for the trie. In case 2 we need to keep the start and end
2760 branchs so
a3621e74 2761
8aa23a47
YO
2762 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2763 becomes BRANCH TRIE; BRANCH X;
786e8c11 2764
8aa23a47
YO
2765 There is an additional case, that being where there is a
2766 common prefix, which gets split out into an EXACT like node
2767 preceding the TRIE node.
a3621e74 2768
8aa23a47
YO
2769 If x(1..n)==tail then we can do a simple trie, if not we make
2770 a "jump" trie, such that when we match the appropriate word
2771 we "jump" to the appopriate tail node. Essentailly we turn
2772 a nested if into a case structure of sorts.
b515a41d 2773
8aa23a47
YO
2774 */
2775
2776 int made=0;
2777 if (!re_trie_maxbuff) {
2778 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2779 if (!SvIOK(re_trie_maxbuff))
2780 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2781 }
2782 if ( SvIV(re_trie_maxbuff)>=0 ) {
2783 regnode *cur;
2784 regnode *first = (regnode *)NULL;
2785 regnode *last = (regnode *)NULL;
2786 regnode *tail = scan;
2787 U8 optype = 0;
2788 U32 count=0;
a3621e74
YO
2789
2790#ifdef DEBUGGING
8aa23a47 2791 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2792#endif
8aa23a47
YO
2793 /* var tail is used because there may be a TAIL
2794 regop in the way. Ie, the exacts will point to the
2795 thing following the TAIL, but the last branch will
2796 point at the TAIL. So we advance tail. If we
2797 have nested (?:) we may have to move through several
2798 tails.
2799 */
2800
2801 while ( OP( tail ) == TAIL ) {
2802 /* this is the TAIL generated by (?:) */
2803 tail = regnext( tail );
2804 }
a3621e74 2805
8aa23a47
YO
2806
2807 DEBUG_OPTIMISE_r({
2808 regprop(RExC_rx, mysv, tail );
2809 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2810 (int)depth * 2 + 2, "",
2811 "Looking for TRIE'able sequences. Tail node is: ",
2812 SvPV_nolen_const( mysv )
2813 );
2814 });
2815
2816 /*
2817
2818 step through the branches, cur represents each
2819 branch, noper is the first thing to be matched
2820 as part of that branch and noper_next is the
2821 regnext() of that node. if noper is an EXACT
2822 and noper_next is the same as scan (our current
2823 position in the regex) then the EXACT branch is
2824 a possible optimization target. Once we have
2825 two or more consequetive such branches we can
2826 create a trie of the EXACT's contents and stich
2827 it in place. If the sequence represents all of
2828 the branches we eliminate the whole thing and
2829 replace it with a single TRIE. If it is a
2830 subsequence then we need to stitch it in. This
2831 means the first branch has to remain, and needs
2832 to be repointed at the item on the branch chain
2833 following the last branch optimized. This could
2834 be either a BRANCH, in which case the
2835 subsequence is internal, or it could be the
2836 item following the branch sequence in which
2837 case the subsequence is at the end.
2838
2839 */
2840
2841 /* dont use tail as the end marker for this traverse */
2842 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2843 regnode * const noper = NEXTOPER( cur );
b515a41d 2844#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2845 regnode * const noper_next = regnext( noper );
b515a41d
YO
2846#endif
2847
8aa23a47
YO
2848 DEBUG_OPTIMISE_r({
2849 regprop(RExC_rx, mysv, cur);
2850 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2851 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2852
2853 regprop(RExC_rx, mysv, noper);
2854 PerlIO_printf( Perl_debug_log, " -> %s",
2855 SvPV_nolen_const(mysv));
2856
2857 if ( noper_next ) {
2858 regprop(RExC_rx, mysv, noper_next );
2859 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2860 SvPV_nolen_const(mysv));
2861 }
2862 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2863 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2864 });
2865 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2866 : PL_regkind[ OP( noper ) ] == EXACT )
2867 || OP(noper) == NOTHING )
786e8c11 2868#ifdef NOJUMPTRIE
8aa23a47 2869 && noper_next == tail
786e8c11 2870#endif
8aa23a47
YO
2871 && count < U16_MAX)
2872 {
2873 count++;
2874 if ( !first || optype == NOTHING ) {
2875 if (!first) first = cur;
2876 optype = OP( noper );
2877 } else {
2878 last = cur;
2879 }
2880 } else {
a0a388a1 2881/*
0abd0d78
YO
2882 Currently we do not believe that the trie logic can
2883 handle case insensitive matching properly when the
2884 pattern is not unicode (thus forcing unicode semantics).
2885
2886 If/when this is fixed the following define can be swapped
2887 in below to fully enable trie logic.
2888
a0a388a1 2889#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2890
2891*/
2892#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2893
a0a388a1 2894 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2895 make_trie( pRExC_state,
2896 startbranch, first, cur, tail, count,
2897 optype, depth+1 );
2898 }
2899 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2900#ifdef NOJUMPTRIE
8aa23a47 2901 && noper_next == tail
786e8c11 2902#endif
8aa23a47
YO
2903 ){
2904 count = 1;
2905 first = cur;
2906 optype = OP( noper );
2907 } else {
2908 count = 0;
2909 first = NULL;
2910 optype = 0;
2911 }
2912 last = NULL;
2913 }
2914 }
2915 DEBUG_OPTIMISE_r({
2916 regprop(RExC_rx, mysv, cur);
2917 PerlIO_printf( Perl_debug_log,
2918 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2919 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2920
2921 });
a0a388a1
YO
2922
2923 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2924 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2925#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2926 if ( ((made == MADE_EXACT_TRIE &&
2927 startbranch == first)
2928 || ( first_non_open == first )) &&
2929 depth==0 ) {
2930 flags |= SCF_TRIE_RESTUDY;
2931 if ( startbranch == first
2932 && scan == tail )
2933 {
2934 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2935 }
2936 }
3dab1dad 2937#endif
8aa23a47
YO
2938 }
2939 }
2940
2941 } /* do trie */
2942
653099ff 2943 }
8aa23a47
YO
2944 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2945 scan = NEXTOPER(NEXTOPER(scan));
2946 } else /* single branch is optimized. */
2947 scan = NEXTOPER(scan);
2948 continue;
2949 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2950 scan_frame *newframe = NULL;
2951 I32 paren;
2952 regnode *start;
2953 regnode *end;
2954
2955 if (OP(scan) != SUSPEND) {
2956 /* set the pointer */
2957 if (OP(scan) == GOSUB) {
2958 paren = ARG(scan);
2959 RExC_recurse[ARG2L(scan)] = scan;
2960 start = RExC_open_parens[paren-1];
2961 end = RExC_close_parens[paren-1];
2962 } else {
2963 paren = 0;
f8fc2ecf 2964 start = RExC_rxi->program + 1;
8aa23a47
YO
2965 end = RExC_opend;
2966 }
2967 if (!recursed) {
2968 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2969 SAVEFREEPV(recursed);
2970 }
2971 if (!PAREN_TEST(recursed,paren+1)) {
2972 PAREN_SET(recursed,paren+1);
2973 Newx(newframe,1,scan_frame);
2974 } else {
2975 if (flags & SCF_DO_SUBSTR) {
304ee84b 2976 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2977 data->longest = &(data->longest_float);
2978 }
2979 is_inf = is_inf_internal = 1;
2980 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2981 cl_anything(pRExC_state, data->start_class);
2982 flags &= ~SCF_DO_STCLASS;
2983 }
2984 } else {
2985 Newx(newframe,1,scan_frame);
2986 paren = stopparen;
2987 start = scan+2;
2988 end = regnext(scan);
2989 }
2990 if (newframe) {
2991 assert(start);
2992 assert(end);
2993 SAVEFREEPV(newframe);
2994 newframe->next = regnext(scan);
2995 newframe->last = last;
2996 newframe->stop = stopparen;
2997 newframe->prev = frame;
2998
2999 frame = newframe;
3000 scan = start;
3001 stopparen = paren;
3002 last = end;
3003
3004 continue;
3005 }
3006 }
3007 else if (OP(scan) == EXACT) {
3008 I32 l = STR_LEN(scan);
3009 UV uc;
3010 if (UTF) {
3011 const U8 * const s = (U8*)STRING(scan);
3012 l = utf8_length(s, s + l);
3013 uc = utf8_to_uvchr(s, NULL);
3014 } else {
3015 uc = *((U8*)STRING(scan));
3016 }
3017 min += l;
3018 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3019 /* The code below prefers earlier match for fixed
3020 offset, later match for variable offset. */
3021 if (data->last_end == -1) { /* Update the start info. */
3022 data->last_start_min = data->pos_min;
3023 data->last_start_max = is_inf
3024 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3025 }
8aa23a47
YO
3026 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3027 if (UTF)
3028 SvUTF8_on(data->last_found);
3029 {
3030 SV * const sv = data->last_found;
3031 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3032 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3033 if (mg && mg->mg_len >= 0)
3034 mg->mg_len += utf8_length((U8*)STRING(scan),
3035 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3036 }
8aa23a47
YO
3037 data->last_end = data->pos_min + l;
3038 data->pos_min += l; /* As in the first entry. */
3039 data->flags &= ~SF_BEFORE_EOL;
3040 }
3041 if (flags & SCF_DO_STCLASS_AND) {
3042 /* Check whether it is compatible with what we know already! */
3043 int compat = 1;
3044
3045 if (uc >= 0x100 ||
3046 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3047 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3048 && (!(data->start_class->flags & ANYOF_FOLD)
3049 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3050 )
3051 compat = 0;
3052 ANYOF_CLASS_ZERO(data->start_class);
3053 ANYOF_BITMAP_ZERO(data->start_class);
3054 if (compat)
3055 ANYOF_BITMAP_SET(data->start_class, uc);
3056 data->start_class->flags &= ~ANYOF_EOS;
3057 if (uc < 0x100)
3058 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3059 }
3060 else if (flags & SCF_DO_STCLASS_OR) {
3061 /* false positive possible if the class is case-folded */
3062 if (uc < 0x100)
3063 ANYOF_BITMAP_SET(data->start_class, uc);
3064 else
3065 data->start_class->flags |= ANYOF_UNICODE_ALL;
3066 data->start_class->flags &= ~ANYOF_EOS;
3067 cl_and(data->start_class, and_withp);
3068 }
3069 flags &= ~SCF_DO_STCLASS;
3070 }
3071 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3072 I32 l = STR_LEN(scan);
3073 UV uc = *((U8*)STRING(scan));
3074
3075 /* Search for fixed substrings supports EXACT only. */
3076 if (flags & SCF_DO_SUBSTR) {
3077 assert(data);
304ee84b 3078 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3079 }
3080 if (UTF) {
3081 const U8 * const s = (U8 *)STRING(scan);
3082 l = utf8_length(s, s + l);
3083 uc = utf8_to_uvchr(s, NULL);
3084 }
3085 min += l;
3086 if (flags & SCF_DO_SUBSTR)
3087 data->pos_min += l;
3088 if (flags & SCF_DO_STCLASS_AND) {
3089 /* Check whether it is compatible with what we know already! */
3090 int compat = 1;
3091
3092 if (uc >= 0x100 ||
3093 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3094 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3095 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3096 compat = 0;
3097 ANYOF_CLASS_ZERO(data->start_class);
3098 ANYOF_BITMAP_ZERO(data->start_class);
3099 if (compat) {
3100 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3101 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3102 data->start_class->flags |= ANYOF_FOLD;
3103 if (OP(scan) == EXACTFL)
3104 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3105 }
8aa23a47
YO
3106 }
3107 else if (flags & SCF_DO_STCLASS_OR) {
3108 if (data->start_class->flags & ANYOF_FOLD) {
3109 /* false positive possible if the class is case-folded.
3110 Assume that the locale settings are the same... */
1aa99e6b
IH
3111 if (uc < 0x100)
3112 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3113 data->start_class->flags &= ~ANYOF_EOS;
3114 }
8aa23a47 3115 cl_and(data->start_class, and_withp);
653099ff 3116 }
8aa23a47
YO
3117 flags &= ~SCF_DO_STCLASS;
3118 }
e52fc539 3119 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3120 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3121 I32 f = flags, pos_before = 0;
3122 regnode * const oscan = scan;
3123 struct regnode_charclass_class this_class;
3124 struct regnode_charclass_class *oclass = NULL;
3125 I32 next_is_eval = 0;
3126
3127 switch (PL_regkind[OP(scan)]) {
3128 case WHILEM: /* End of (?:...)* . */
3129 scan = NEXTOPER(scan);
3130 goto finish;
3131 case PLUS:
3132 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3133 next = NEXTOPER(scan);
3134 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3135 mincount = 1;
3136 maxcount = REG_INFTY;
3137 next = regnext(scan);
3138 scan = NEXTOPER(scan);
3139 goto do_curly;
3140 }
3141 }
3142 if (flags & SCF_DO_SUBSTR)
3143 data->pos_min++;
3144 min++;
3145 /* Fall through. */
3146 case STAR:
3147 if (flags & SCF_DO_STCLASS) {
3148 mincount = 0;
3149 maxcount = REG_INFTY;
3150 next = regnext(scan);
3151 scan = NEXTOPER(scan);
3152 goto do_curly;
3153 }
3154 is_inf = is_inf_internal = 1;
3155 scan = regnext(scan);
c277df42 3156 if (flags & SCF_DO_SUBSTR) {
304ee84b 3157 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3158 data->longest = &(data->longest_float);
c277df42 3159 }
8aa23a47
YO
3160 goto optimize_curly_tail;
3161 case CURLY:
3162 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3163 && (scan->flags == stopparen))
3164 {
3165 mincount = 1;
3166 maxcount = 1;
3167 } else {
3168 mincount = ARG1(scan);
3169 maxcount = ARG2(scan);
653099ff 3170 }
8aa23a47
YO
3171 next = regnext(scan);
3172 if (OP(scan) == CURLYX) {
3173 I32 lp = (data ? *(data->last_closep) : 0);
3174 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3175 }
8aa23a47
YO
3176 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3177 next_is_eval = (OP(scan) == EVAL);
3178 do_curly:
3179 if (flags & SCF_DO_SUBSTR) {
304ee84b 3180 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3181 pos_before = data->pos_min;
b45f050a 3182 }
8aa23a47
YO
3183 if (data) {
3184 fl = data->flags;
3185 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3186 if (is_inf)
3187 data->flags |= SF_IS_INF;
3188 }
3189 if (flags & SCF_DO_STCLASS) {
3190 cl_init(pRExC_state, &this_class);
3191 oclass = data->start_class;
3192 data->start_class = &this_class;
3193 f |= SCF_DO_STCLASS_AND;
3194 f &= ~SCF_DO_STCLASS_OR;
3195 }
3196 /* These are the cases when once a subexpression
3197 fails at a particular position, it cannot succeed
3198 even after backtracking at the enclosing scope.
3199
3200 XXXX what if minimal match and we are at the
3201 initial run of {n,m}? */
3202 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3203 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3204
8aa23a47
YO
3205 /* This will finish on WHILEM, setting scan, or on NULL: */
3206 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3207 last, data, stopparen, recursed, NULL,
3208 (mincount == 0
3209 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3210
8aa23a47
YO
3211 if (flags & SCF_DO_STCLASS)
3212 data->start_class = oclass;
3213 if (mincount == 0 || minnext == 0) {
3214 if (flags & SCF_DO_STCLASS_OR) {
3215 cl_or(pRExC_state, data->start_class, &this_class);
3216 }
3217 else if (flags & SCF_DO_STCLASS_AND) {
3218 /* Switch to OR mode: cache the old value of
3219 * data->start_class */
3220 INIT_AND_WITHP;
3221 StructCopy(data->start_class, and_withp,
3222 struct regnode_charclass_class);
3223 flags &= ~SCF_DO_STCLASS_AND;
3224 StructCopy(&this_class, data->start_class,
3225 struct regnode_charclass_class);
3226 flags |= SCF_DO_STCLASS_OR;
3227 data->start_class->flags |= ANYOF_EOS;
3228 }
3229 } else { /* Non-zero len */
3230 if (flags & SCF_DO_STCLASS_OR) {
3231 cl_or(pRExC_state, data->start_class, &this_class);
3232 cl_and(data->start_class, and_withp);
3233 }
3234 else if (flags & SCF_DO_STCLASS_AND)
3235 cl_and(data->start_class, &this_class);
3236 flags &= ~SCF_DO_STCLASS;
3237 }
3238 if (!scan) /* It was not CURLYX, but CURLY. */
3239 scan = next;
3240 if ( /* ? quantifier ok, except for (?{ ... }) */
3241 (next_is_eval || !(mincount == 0 && maxcount == 1))
3242 && (minnext == 0) && (deltanext == 0)
3243 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3244 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3245 {
668c081a
NC
3246 ckWARNreg(RExC_parse,
3247 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3248 }
3249
3250 min += minnext * mincount;
3251 is_inf_internal |= ((maxcount == REG_INFTY
3252 && (minnext + deltanext) > 0)
3253 || deltanext == I32_MAX);
3254 is_inf |= is_inf_internal;
3255 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3256
3257 /* Try powerful optimization CURLYX => CURLYN. */
3258 if ( OP(oscan) == CURLYX && data
3259 && data->flags & SF_IN_PAR
3260 && !(data->flags & SF_HAS_EVAL)
3261 && !deltanext && minnext == 1 ) {
3262 /* Try to optimize to CURLYN. */
3263 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3264 regnode * const nxt1 = nxt;
497b47a8 3265#ifdef DEBUGGING
8aa23a47 3266 regnode *nxt2;
497b47a8 3267#endif
c277df42 3268
8aa23a47
YO
3269 /* Skip open. */
3270 nxt = regnext(nxt);
e52fc539 3271 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3272 && !(PL_regkind[OP(nxt)] == EXACT
3273 && STR_LEN(nxt) == 1))
3274 goto nogo;
497b47a8 3275#ifdef DEBUGGING
8aa23a47 3276 nxt2 = nxt;
497b47a8 3277#endif
8aa23a47
YO
3278 nxt = regnext(nxt);
3279 if (OP(nxt) != CLOSE)
3280 goto nogo;
3281 if (RExC_open_parens) {
3282 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3283 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3284 }
3285 /* Now we know that nxt2 is the only contents: */
3286 oscan->flags = (U8)ARG(nxt);
3287 OP(oscan) = CURLYN;
3288 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3289
c277df42 3290#ifdef DEBUGGING
8aa23a47
YO
3291 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3292 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3293 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3294 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3295 OP(nxt + 1) = OPTIMIZED; /* was count. */
3296 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3297#endif
8aa23a47
YO
3298 }
3299 nogo:
3300
3301 /* Try optimization CURLYX => CURLYM. */
3302 if ( OP(oscan) == CURLYX && data
3303 && !(data->flags & SF_HAS_PAR)
3304 && !(data->flags & SF_HAS_EVAL)
3305 && !deltanext /* atom is fixed width */
3306 && minnext != 0 /* CURLYM can't handle zero width */
3307 ) {
3308 /* XXXX How to optimize if data == 0? */
3309 /* Optimize to a simpler form. */
3310 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3311 regnode *nxt2;
3312
3313 OP(oscan) = CURLYM;
3314 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3315 && (OP(nxt2) != WHILEM))
3316 nxt = nxt2;
3317 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3318 /* Need to optimize away parenths. */
3319 if (data->flags & SF_IN_PAR) {
3320 /* Set the parenth number. */
3321 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3322
3323 if (OP(nxt) != CLOSE)
3324 FAIL("Panic opt close");
3325 oscan->flags = (U8)ARG(nxt);
3326 if (RExC_open_parens) {
3327 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3328 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3329 }
8aa23a47
YO
3330 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3331 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3332
c277df42 3333#ifdef DEBUGGING
8aa23a47
YO
3334 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3335 OP(nxt + 1) = OPTIMIZED; /* was count. */
3336 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3337 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3338#endif
c277df42 3339#if 0
8aa23a47
YO
3340 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3341 regnode *nnxt = regnext(nxt1);
3342
3343 if (nnxt == nxt) {
3344 if (reg_off_by_arg[OP(nxt1)])
3345 ARG_SET(nxt1, nxt2 - nxt1);
3346 else if (nxt2 - nxt1 < U16_MAX)
3347 NEXT_OFF(nxt1) = nxt2 - nxt1;
3348 else
3349 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3350 }
8aa23a47 3351 nxt1 = nnxt;
c277df42 3352 }
5d1c421c 3353#endif
8aa23a47
YO
3354 /* Optimize again: */
3355 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3356 NULL, stopparen, recursed, NULL, 0,depth+1);
3357 }
3358 else
3359 oscan->flags = 0;
3360 }
3361 else if ((OP(oscan) == CURLYX)
3362 && (flags & SCF_WHILEM_VISITED_POS)
3363 /* See the comment on a similar expression above.
3364 However, this time it not a subexpression
3365 we care about, but the expression itself. */
3366 && (maxcount == REG_INFTY)
3367 && data && ++data->whilem_c < 16) {
3368 /* This stays as CURLYX, we can put the count/of pair. */
3369 /* Find WHILEM (as in regexec.c) */
3370 regnode *nxt = oscan + NEXT_OFF(oscan);
3371
3372 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3373 nxt += ARG(nxt);
3374 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3375 | (RExC_whilem_seen << 4)); /* On WHILEM */
3376 }
3377 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3378 pars++;
3379 if (flags & SCF_DO_SUBSTR) {
3380 SV *last_str = NULL;
3381 int counted = mincount != 0;
a0ed51b3 3382
8aa23a47
YO
3383 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3384#if defined(SPARC64_GCC_WORKAROUND)
3385 I32 b = 0;
3386 STRLEN l = 0;
3387 const char *s = NULL;
3388 I32 old = 0;
b515a41d 3389
8aa23a47
YO
3390 if (pos_before >= data->last_start_min)
3391 b = pos_before;
3392 else
3393 b = data->last_start_min;
b515a41d 3394
8aa23a47
YO
3395 l = 0;
3396 s = SvPV_const(data->last_found, l);
3397 old = b - data->last_start_min;
3398
3399#else
3400 I32 b = pos_before >= data->last_start_min
3401 ? pos_before : data->last_start_min;
3402 STRLEN l;
3403 const char * const s = SvPV_const(data->last_found, l);
3404 I32 old = b - data->last_start_min;
3405#endif
3406
3407 if (UTF)
3408 old = utf8_hop((U8*)s, old) - (U8*)s;
3409
3410 l -= old;
3411 /* Get the added string: */
740cce10 3412 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3413 if (deltanext == 0 && pos_before == b) {
3414 /* What was added is a constant string */
3415 if (mincount > 1) {
3416 SvGROW(last_str, (mincount * l) + 1);
3417 repeatcpy(SvPVX(last_str) + l,
3418 SvPVX_const(last_str), l, mincount - 1);
3419 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3420 /* Add additional parts. */
3421 SvCUR_set(data->last_found,
3422 SvCUR(data->last_found) - l);
3423 sv_catsv(data->last_found, last_str);
3424 {
3425 SV * sv = data->last_found;
3426 MAGIC *mg =
3427 SvUTF8(sv) && SvMAGICAL(sv) ?
3428 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3429 if (mg && mg->mg_len >= 0)
bd94e887 3430 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3431 }
8aa23a47 3432 data->last_end += l * (mincount - 1);
b515a41d 3433 }
8aa23a47
YO
3434 } else {
3435 /* start offset must point into the last copy */
3436 data->last_start_min += minnext * (mincount - 1);
3437 data->last_start_max += is_inf ? I32_MAX
3438 : (maxcount - 1) * (minnext + data->pos_delta);
3439 }
c277df42 3440 }
8aa23a47
YO
3441 /* It is counted once already... */
3442 data->pos_min += minnext * (mincount - counted);
3443 data->pos_delta += - counted * deltanext +
3444 (minnext + deltanext) * maxcount - minnext * mincount;
3445 if (mincount != maxcount) {
3446 /* Cannot extend fixed substrings found inside
3447 the group. */
304ee84b 3448 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3449 if (mincount && last_str) {
3450 SV * const sv = data->last_found;
3451 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3452 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3453
3454 if (mg)
3455 mg->mg_len = -1;
3456 sv_setsv(sv, last_str);
3457 data->last_end = data->pos_min;
3458 data->last_start_min =
3459 data->pos_min - CHR_SVLEN(last_str);
3460 data->last_start_max = is_inf
3461 ? I32_MAX
3462 : data->pos_min + data->pos_delta
3463 - CHR_SVLEN(last_str);
3464 }
3465 data->longest = &(data->longest_float);
3466 }
3467 SvREFCNT_dec(last_str);
c277df42 3468 }
8aa23a47
YO
3469 if (data && (fl & SF_HAS_EVAL))
3470 data->flags |= SF_HAS_EVAL;
3471 optimize_curly_tail:
3472 if (OP(oscan) != CURLYX) {
3473 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3474 && NEXT_OFF(next))
3475 NEXT_OFF(oscan) += NEXT_OFF(next);
3476 }
3477 continue;
3478 default: /* REF and CLUMP only? */
3479 if (flags & SCF_DO_SUBSTR) {
304ee84b 3480 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3481 data->longest = &(data->longest_float);
3482 }
3483 is_inf = is_inf_internal = 1;
3484 if (flags & SCF_DO_STCLASS_OR)
3485 cl_anything(pRExC_state, data->start_class);
3486 flags &= ~SCF_DO_STCLASS;
3487 break;
c277df42 3488 }
8aa23a47 3489 }
e1d1eefb
YO
3490 else if (OP(scan) == LNBREAK) {
3491 if (flags & SCF_DO_STCLASS) {
3492 int value = 0;
3493 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3494 if (flags & SCF_DO_STCLASS_AND) {
3495 for (value = 0; value < 256; value++)
e64b1bd1 3496 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3497 ANYOF_BITMAP_CLEAR(data->start_class, value);
3498 }
3499 else {
3500 for (value = 0; value < 256; value++)
e64b1bd1 3501 if (is_VERTWS_cp(value))
e1d1eefb
YO
3502 ANYOF_BITMAP_SET(data->start_class, value);
3503 }
3504 if (flags & SCF_DO_STCLASS_OR)
3505 cl_and(data->start_class, and_withp);
3506 flags &= ~SCF_DO_STCLASS;
3507 }
3508 min += 1;
f9a79580 3509 delta += 1;
e1d1eefb
YO
3510 if (flags & SCF_DO_SUBSTR) {
3511 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3512 data->pos_min += 1;
f9a79580 3513 data->pos_delta += 1;
e1d1eefb
YO
3514 data->longest = &(data->longest_float);
3515 }
3516
3517 }
f9a79580
RGS
3518 else if (OP(scan) == FOLDCHAR) {
3519 int d = ARG(scan)==0xDF ? 1 : 2;
3520 flags &= ~SCF_DO_STCLASS;
3521 min += 1;
3522 delta += d;
3523 if (flags & SCF_DO_SUBSTR) {
3524 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3525 data->pos_min += 1;
3526 data->pos_delta += d;
3527 data->longest = &(data->longest_float);
3528 }
3529 }
e52fc539 3530 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3531 int value = 0;
653099ff 3532
8aa23a47 3533 if (flags & SCF_DO_SUBSTR) {
304ee84b 3534 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3535 data->pos_min++;
3536 }
3537 min++;
3538 if (flags & SCF_DO_STCLASS) {
3539 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3540
8aa23a47
YO
3541 /* Some of the logic below assumes that switching
3542 locale on will only add false positives. */
3543 switch (PL_regkind[OP(scan)]) {
3544 case SANY:
3545 default:
3546 do_default:
3547 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3548 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3549 cl_anything(pRExC_state, data->start_class);
3550 break;
3551 case REG_ANY:
3552 if (OP(scan) == SANY)
3553 goto do_default;
3554 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3555 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3556 || (data->start_class->flags & ANYOF_CLASS));
3557 cl_anything(pRExC_state, data->start_class);
653099ff 3558 }
8aa23a47
YO
3559 if (flags & SCF_DO_STCLASS_AND || !value)
3560 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3561 break;
3562 case ANYOF:
3563 if (flags & SCF_DO_STCLASS_AND)
3564 cl_and(data->start_class,
3565 (struct regnode_charclass_class*)scan);
653099ff 3566 else
8aa23a47
YO
3567 cl_or(pRExC_state, data->start_class,
3568 (struct regnode_charclass_class*)scan);
3569 break;
3570 case ALNUM:
3571 if (flags & SCF_DO_STCLASS_AND) {
3572 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3574 for (value = 0; value < 256; value++)
3575 if (!isALNUM(value))
3576 ANYOF_BITMAP_CLEAR(data->start_class, value);
3577 }
653099ff 3578 }
8aa23a47
YO
3579 else {
3580 if (data->start_class->flags & ANYOF_LOCALE)
3581 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3582 else {
3583 for (value = 0; value < 256; value++)
3584 if (isALNUM(value))
3585 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3586 }
8aa23a47
YO
3587 }
3588 break;
3589 case ALNUML:
3590 if (flags & SCF_DO_STCLASS_AND) {
3591 if (data->start_class->flags & ANYOF_LOCALE)
3592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3593 }
3594 else {
3595 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3596 data->start_class->flags |= ANYOF_LOCALE;
3597 }
3598 break;
3599 case NALNUM:
3600 if (flags & SCF_DO_STCLASS_AND) {
3601 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3603 for (value = 0; value < 256; value++)
3604 if (isALNUM(value))
3605 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3606 }
3607 }
8aa23a47
YO
3608 else {
3609 if (data->start_class->flags & ANYOF_LOCALE)
3610 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3611 else {
3612 for (value = 0; value < 256; value++)
3613 if (!isALNUM(value))
3614 ANYOF_BITMAP_SET(data->start_class, value);
3615 }
653099ff 3616 }
8aa23a47
YO
3617 break;
3618 case NALNUML:
3619 if (flags & SCF_DO_STCLASS_AND) {
3620 if (data->start_class->flags & ANYOF_LOCALE)
3621 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3622 }
8aa23a47
YO
3623 else {
3624 data->start_class->flags |= ANYOF_LOCALE;
3625 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3626 }
3627 break;
3628 case SPACE:
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3632 for (value = 0; value < 256; value++)
3633 if (!isSPACE(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3635 }
3636 }
8aa23a47
YO
3637 else {
3638 if (data->start_class->flags & ANYOF_LOCALE)
3639 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3640 else {
3641 for (value = 0; value < 256; value++)
3642 if (isSPACE(value))
3643 ANYOF_BITMAP_SET(data->start_class, value);
3644 }
653099ff 3645 }
8aa23a47
YO
3646 break;
3647 case SPACEL:
3648 if (flags & SCF_DO_STCLASS_AND) {
3649 if (data->start_class->flags & ANYOF_LOCALE)
3650 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3651 }
3652 else {
3653 data->start_class->flags |= ANYOF_LOCALE;
3654 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3655 }
3656 break;
3657 case NSPACE:
3658 if (flags & SCF_DO_STCLASS_AND) {
3659 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3660 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3661 for (value = 0; value < 256; value++)
3662 if (isSPACE(value))
3663 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3664 }
8aa23a47
YO
3665 }
3666 else {
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3669 else {
3670 for (value = 0; value < 256; value++)
3671 if (!isSPACE(value))
3672 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3673 }
3674 }
8aa23a47
YO
3675 break;
3676 case NSPACEL:
3677 if (flags & SCF_DO_STCLASS_AND) {
3678 if (data->start_class->flags & ANYOF_LOCALE) {
3679 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3680 for (value = 0; value < 256; value++)
3681 if (!isSPACE(value))
3682 ANYOF_BITMAP_CLEAR(data->start_class, value);
3683 }
653099ff 3684 }
8aa23a47
YO
3685 else {
3686 data->start_class->flags |= ANYOF_LOCALE;
3687 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3688 }
3689 break;
3690 case DIGIT:
3691 if (flags & SCF_DO_STCLASS_AND) {
3692 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3693 for (value = 0; value < 256; value++)
3694 if (!isDIGIT(value))
3695 ANYOF_BITMAP_CLEAR(data->start_class, value);
3696 }
3697 else {
3698 if (data->start_class->flags & ANYOF_LOCALE)
3699 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3700 else {
3701 for (value = 0; value < 256; value++)
3702 if (isDIGIT(value))
3703 ANYOF_BITMAP_SET(data->start_class, value);
3704 }
3705 }
3706 break;
3707 case NDIGIT:
3708 if (flags & SCF_DO_STCLASS_AND) {
3709 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3710 for (value = 0; value < 256; value++)
3711 if (isDIGIT(value))
3712 ANYOF_BITMAP_CLEAR(data->start_class, value);
3713 }
3714 else {
3715 if (data->start_class->flags & ANYOF_LOCALE)
3716 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3717 else {
3718 for (value = 0; value < 256; value++)
3719 if (!isDIGIT(value))
3720 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3721 }
3722 }
8aa23a47 3723 break;
e1d1eefb
YO
3724 CASE_SYNST_FNC(VERTWS);
3725 CASE_SYNST_FNC(HORIZWS);
3726
8aa23a47
YO
3727 }
3728 if (flags & SCF_DO_STCLASS_OR)
3729 cl_and(data->start_class, and_withp);
3730 flags &= ~SCF_DO_STCLASS;
3731 }
3732 }
3733 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3734 data->flags |= (OP(scan) == MEOL
3735 ? SF_BEFORE_MEOL
3736 : SF_BEFORE_SEOL);
3737 }
3738 else if ( PL_regkind[OP(scan)] == BRANCHJ
3739 /* Lookbehind, or need to calculate parens/evals/stclass: */
3740 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3741 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3742 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3743 || OP(scan) == UNLESSM )
3744 {
3745 /* Negative Lookahead/lookbehind
3746 In this case we can't do fixed string optimisation.
3747 */
1de06328 3748
8aa23a47
YO
3749 I32 deltanext, minnext, fake = 0;
3750 regnode *nscan;
3751 struct regnode_charclass_class intrnl;
3752 int f = 0;
1de06328 3753
8aa23a47
YO
3754 data_fake.flags = 0;
3755 if (data) {
3756 data_fake.whilem_c = data->whilem_c;
3757 data_fake.last_closep = data->last_closep;
c277df42 3758 }
8aa23a47
YO
3759 else
3760 data_fake.last_closep = &fake;
58e23c8d 3761 data_fake.pos_delta = delta;
8aa23a47
YO
3762 if ( flags & SCF_DO_STCLASS && !scan->flags
3763 && OP(scan) == IFMATCH ) { /* Lookahead */
3764 cl_init(pRExC_state, &intrnl);
3765 data_fake.start_class = &intrnl;
3766 f |= SCF_DO_STCLASS_AND;
3767 }
3768 if (flags & SCF_WHILEM_VISITED_POS)
3769 f |= SCF_WHILEM_VISITED_POS;
3770 next = regnext(scan);
3771 nscan = NEXTOPER(NEXTOPER(scan));
3772 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3773 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3774 if (scan->flags) {
3775 if (deltanext) {
58e23c8d 3776 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3777 }
3778 else if (minnext > (I32)U8_MAX) {
58e23c8d 3779 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3780 }
3781 scan->flags = (U8)minnext;
3782 }
3783 if (data) {
3784 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3785 pars++;
3786 if (data_fake.flags & SF_HAS_EVAL)
3787 data->flags |= SF_HAS_EVAL;
3788 data->whilem_c = data_fake.whilem_c;
3789 }
3790 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3791 if (flags & SCF_DO_STCLASS_OR) {
3792 /* OR before, AND after: ideally we would recurse with
3793 * data_fake to get the AND applied by study of the
3794 * remainder of the pattern, and then derecurse;
3795 * *** HACK *** for now just treat as "no information".
3796 * See [perl #56690].
3797 */
3798 cl_init(pRExC_state, data->start_class);
3799 } else {
3800 /* AND before and after: combine and continue */
3801 const int was = (data->start_class->flags & ANYOF_EOS);
3802
3803 cl_and(data->start_class, &intrnl);
3804 if (was)
3805 data->start_class->flags |= ANYOF_EOS;
3806 }
8aa23a47 3807 }
cb434fcc 3808 }
8aa23a47
YO
3809#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3810 else {
3811 /* Positive Lookahead/lookbehind
3812 In this case we can do fixed string optimisation,
3813 but we must be careful about it. Note in the case of
3814 lookbehind the positions will be offset by the minimum
3815 length of the pattern, something we won't know about
3816 until after the recurse.
3817 */
3818 I32 deltanext, fake = 0;
3819 regnode *nscan;
3820 struct regnode_charclass_class intrnl;
3821 int f = 0;
3822 /* We use SAVEFREEPV so that when the full compile
3823 is finished perl will clean up the allocated
3824 minlens when its all done. This was we don't
3825 have to worry about freeing them when we know
3826 they wont be used, which would be a pain.
3827 */
3828 I32 *minnextp;
3829 Newx( minnextp, 1, I32 );
3830 SAVEFREEPV(minnextp);
3831
3832 if (data) {
3833 StructCopy(data, &data_fake, scan_data_t);
3834 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3835 f |= SCF_DO_SUBSTR;
3836 if (scan->flags)
304ee84b 3837 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3838 data_fake.last_found=newSVsv(data->last_found);
3839 }
3840 }
3841 else
3842 data_fake.last_closep = &fake;
3843 data_fake.flags = 0;
58e23c8d 3844 data_fake.pos_delta = delta;
8aa23a47
YO
3845 if (is_inf)
3846 data_fake.flags |= SF_IS_INF;
3847 if ( flags & SCF_DO_STCLASS && !scan->flags
3848 && OP(scan) == IFMATCH ) { /* Lookahead */
3849 cl_init(pRExC_state, &intrnl);
3850 data_fake.start_class = &intrnl;
3851 f |= SCF_DO_STCLASS_AND;
3852 }
3853 if (flags & SCF_WHILEM_VISITED_POS)
3854 f |= SCF_WHILEM_VISITED_POS;
3855 next = regnext(scan);
3856 nscan = NEXTOPER(NEXTOPER(scan));
3857
3858 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3859 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3860 if (scan->flags) {
3861 if (deltanext) {
58e23c8d 3862 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3863 }
3864 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3865 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3866 }
3867 scan->flags = (U8)*minnextp;
3868 }
3869
3870 *minnextp += min;
3871
3872 if (f & SCF_DO_STCLASS_AND) {
3873 const int was = (data->start_class->flags & ANYOF_EOS);
3874
3875 cl_and(data->start_class, &intrnl);
3876 if (was)
3877 data->start_class->flags |= ANYOF_EOS;
3878 }
3879 if (data) {
3880 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3881 pars++;
3882 if (data_fake.flags & SF_HAS_EVAL)
3883 data->flags |= SF_HAS_EVAL;
3884 data->whilem_c = data_fake.whilem_c;
3885 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3886 if (RExC_rx->minlen<*minnextp)
3887 RExC_rx->minlen=*minnextp;
304ee84b 3888 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3889 SvREFCNT_dec(data_fake.last_found);
3890
3891 if ( data_fake.minlen_fixed != minlenp )
3892 {
3893 data->offset_fixed= data_fake.offset_fixed;
3894 data->minlen_fixed= data_fake.minlen_fixed;
3895 data->lookbehind_fixed+= scan->flags;
3896 }
3897 if ( data_fake.minlen_float != minlenp )
3898 {
3899 data->minlen_float= data_fake.minlen_float;
3900 data->offset_float_min=data_fake.offset_float_min;
3901 data->offset_float_max=data_fake.offset_float_max;
3902 data->lookbehind_float+= scan->flags;
3903 }
3904 }
3905 }
3906
3907
40d049e4 3908 }
8aa23a47
YO
3909#endif
3910 }
3911 else if (OP(scan) == OPEN) {
3912 if (stopparen != (I32)ARG(scan))
3913 pars++;
3914 }
3915 else if (OP(scan) == CLOSE) {
3916 if (stopparen == (I32)ARG(scan)) {
3917 break;
3918 }
3919 if ((I32)ARG(scan) == is_par) {
3920 next = regnext(scan);
b515a41d 3921
8aa23a47
YO
3922 if ( next && (OP(next) != WHILEM) && next < last)
3923 is_par = 0; /* Disable optimization */
40d049e4 3924 }
8aa23a47
YO
3925 if (data)
3926 *(data->last_closep) = ARG(scan);
3927 }
3928 else if (OP(scan) == EVAL) {
c277df42
IZ
3929 if (data)
3930 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3931 }
3932 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3933 if (flags & SCF_DO_SUBSTR) {
304ee84b 3934 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3935 flags &= ~SCF_DO_SUBSTR;
40d049e4 3936 }
8aa23a47
YO
3937 if (data && OP(scan)==ACCEPT) {
3938 data->flags |= SCF_SEEN_ACCEPT;
3939 if (stopmin > min)
3940 stopmin = min;
e2e6a0f1 3941 }
8aa23a47
YO
3942 }
3943 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3944 {
0f5d15d6 3945 if (flags & SCF_DO_SUBSTR) {
304ee84b 3946 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3947 data->longest = &(data->longest_float);
3948 }
3949 is_inf = is_inf_internal = 1;
653099ff 3950 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3951 cl_anything(pRExC_state, data->start_class);
96776eda 3952 flags &= ~SCF_DO_STCLASS;
8aa23a47 3953 }
58e23c8d 3954 else if (OP(scan) == GPOS) {
bbe252da 3955 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3956 !(delta || is_inf || (data && data->pos_delta)))
3957 {
bbe252da
YO
3958 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3959 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3960 if (RExC_rx->gofs < (U32)min)
3961 RExC_rx->gofs = min;
3962 } else {
bbe252da 3963 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3964 RExC_rx->gofs = 0;
3965 }
3966 }
786e8c11 3967#ifdef TRIE_STUDY_OPT
40d049e4 3968#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3969 else if (PL_regkind[OP(scan)] == TRIE) {
3970 /* NOTE - There is similar code to this block above for handling
3971 BRANCH nodes on the initial study. If you change stuff here
3972 check there too. */
3973 regnode *trie_node= scan;
3974 regnode *tail= regnext(scan);
f8fc2ecf 3975 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3976 I32 max1 = 0, min1 = I32_MAX;
3977 struct regnode_charclass_class accum;
3978
3979 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3980 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3981 if (flags & SCF_DO_STCLASS)
3982 cl_init_zero(pRExC_state, &accum);
3983
3984 if (!trie->jump) {
3985 min1= trie->minlen;
3986 max1= trie->maxlen;
3987 } else {
3988 const regnode *nextbranch= NULL;
3989 U32 word;
3990
3991 for ( word=1 ; word <= trie->wordcount ; word++)
3992 {
3993 I32 deltanext=0, minnext=0, f = 0, fake;
3994 struct regnode_charclass_class this_class;
3995
3996 data_fake.flags = 0;
3997 if (data) {
3998 data_fake.whilem_c = data->whilem_c;
3999 data_fake.last_closep = data->last_closep;
4000 }
4001 else
4002 data_fake.last_closep = &fake;
58e23c8d 4003 data_fake.pos_delta = delta;
8aa23a47
YO
4004 if (flags & SCF_DO_STCLASS) {
4005 cl_init(pRExC_state, &this_class);
4006 data_fake.start_class = &this_class;
4007 f = SCF_DO_STCLASS_AND;
4008 }
4009 if (flags & SCF_WHILEM_VISITED_POS)
4010 f |= SCF_WHILEM_VISITED_POS;
4011
4012 if (trie->jump[word]) {
4013 if (!nextbranch)
4014 nextbranch = trie_node + trie->jump[0];
4015 scan= trie_node + trie->jump[word];
4016 /* We go from the jump point to the branch that follows
4017 it. Note this means we need the vestigal unused branches
4018 even though they arent otherwise used.
4019 */
4020 minnext = study_chunk(pRExC_state, &scan, minlenp,
4021 &deltanext, (regnode *)nextbranch, &data_fake,
4022 stopparen, recursed, NULL, f,depth+1);
4023 }
4024 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4025 nextbranch= regnext((regnode*)nextbranch);
4026
4027 if (min1 > (I32)(minnext + trie->minlen))
4028 min1 = minnext + trie->minlen;
4029 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4030 max1 = minnext + deltanext + trie->maxlen;
4031 if (deltanext == I32_MAX)
4032 is_inf = is_inf_internal = 1;
4033
4034 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4035 pars++;
4036 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4037 if ( stopmin > min + min1)
4038 stopmin = min + min1;
4039 flags &= ~SCF_DO_SUBSTR;
4040 if (data)
4041 data->flags |= SCF_SEEN_ACCEPT;
4042 }
4043 if (data) {
4044 if (data_fake.flags & SF_HAS_EVAL)
4045 data->flags |= SF_HAS_EVAL;
4046 data->whilem_c = data_fake.whilem_c;
4047 }
4048 if (flags & SCF_DO_STCLASS)
4049 cl_or(pRExC_state, &accum, &this_class);
4050 }
4051 }
4052 if (flags & SCF_DO_SUBSTR) {
4053 data->pos_min += min1;
4054 data->pos_delta += max1 - min1;
4055 if (max1 != min1 || is_inf)
4056 data->longest = &(data->longest_float);
4057 }
4058 min += min1;
4059 delta += max1 - min1;
4060 if (flags & SCF_DO_STCLASS_OR) {
4061 cl_or(pRExC_state, data->start_class, &accum);
4062 if (min1) {
4063 cl_and(data->start_class, and_withp);
4064 flags &= ~SCF_DO_STCLASS;
4065 }
4066 }
4067 else if (flags & SCF_DO_STCLASS_AND) {
4068 if (min1) {
4069 cl_and(data->start_class, &accum);
4070 flags &= ~SCF_DO_STCLASS;
4071 }
4072 else {
4073 /* Switch to OR mode: cache the old value of
4074 * data->start_class */
4075 INIT_AND_WITHP;
4076 StructCopy(data->start_class, and_withp,
4077 struct regnode_charclass_class);
4078 flags &= ~SCF_DO_STCLASS_AND;
4079 StructCopy(&accum, data->start_class,
4080 struct regnode_charclass_class);
4081 flags |= SCF_DO_STCLASS_OR;
4082 data->start_class->flags |= ANYOF_EOS;
4083 }
4084 }
4085 scan= tail;
4086 continue;
4087 }
786e8c11 4088#else
8aa23a47 4089 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4090 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4091 U8*bang=NULL;
4092
4093 min += trie->minlen;
4094 delta += (trie->maxlen - trie->minlen);
4095 flags &= ~SCF_DO_STCLASS; /* xxx */
4096 if (flags & SCF_DO_SUBSTR) {
304ee84b 4097 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4098 data->pos_min += trie->minlen;
4099 data->pos_delta += (trie->maxlen - trie->minlen);
4100 if (trie->maxlen != trie->minlen)
4101 data->longest = &(data->longest_float);
4102 }
4103 if (trie->jump) /* no more substrings -- for now /grr*/
4104 flags &= ~SCF_DO_SUBSTR;
b515a41d 4105 }
8aa23a47
YO
4106#endif /* old or new */
4107#endif /* TRIE_STUDY_OPT */
e1d1eefb 4108
8aa23a47
YO
4109 /* Else: zero-length, ignore. */
4110 scan = regnext(scan);
4111 }
4112 if (frame) {
4113 last = frame->last;
4114 scan = frame->next;
4115 stopparen = frame->stop;
4116 frame = frame->prev;
4117 goto fake_study_recurse;
c277df42
IZ
4118 }
4119
4120 finish:
8aa23a47 4121 assert(!frame);
304ee84b 4122 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4123
c277df42 4124 *scanp = scan;
aca2d497 4125 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4126 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4127 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4128 if (is_par > (I32)U8_MAX)
c277df42
IZ
4129 is_par = 0;
4130 if (is_par && pars==1 && data) {
4131 data->flags |= SF_IN_PAR;
4132 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4133 }
4134 else if (pars && data) {
c277df42
IZ
4135 data->flags |= SF_HAS_PAR;
4136 data->flags &= ~SF_IN_PAR;
4137 }
653099ff 4138 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4139 cl_and(data->start_class, and_withp);
786e8c11
YO
4140 if (flags & SCF_TRIE_RESTUDY)
4141 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4142
304ee84b 4143 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4144
e2e6a0f1 4145 return min < stopmin ? min : stopmin;
c277df42
IZ
4146}
4147
2eccd3b2
NC
4148STATIC U32
4149S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4150{
4a4e7719
NC
4151 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4152
7918f24d
NC
4153 PERL_ARGS_ASSERT_ADD_DATA;
4154
4a4e7719
NC
4155 Renewc(RExC_rxi->data,
4156 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4157 char, struct reg_data);
4158 if(count)
f8fc2ecf 4159 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4160 else
f8fc2ecf 4161 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4162 RExC_rxi->data->count = count + n;
4163 Copy(s, RExC_rxi->data->what + count, n, U8);
4164 return count;
c277df42
IZ
4165}
4166
f8149455 4167/*XXX: todo make this not included in a non debugging perl */
76234dfb 4168#ifndef PERL_IN_XSUB_RE
d88dccdf 4169void
864dbfa3 4170Perl_reginitcolors(pTHX)
d88dccdf 4171{
97aff369 4172 dVAR;
1df70142 4173 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4174 if (s) {
1df70142
AL
4175 char *t = savepv(s);
4176 int i = 0;
4177 PL_colors[0] = t;
d88dccdf 4178 while (++i < 6) {
1df70142
AL
4179 t = strchr(t, '\t');
4180 if (t) {
4181 *t = '\0';
4182 PL_colors[i] = ++t;
d88dccdf
IZ
4183 }
4184 else
1df70142 4185 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4186 }
4187 } else {
1df70142 4188 int i = 0;
b81d288d 4189 while (i < 6)
06b5626a 4190 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4191 }
4192 PL_colorset = 1;
4193}
76234dfb 4194#endif
8615cb43 4195
07be1b83 4196
786e8c11
YO
4197#ifdef TRIE_STUDY_OPT
4198#define CHECK_RESTUDY_GOTO \
4199 if ( \
4200 (data.flags & SCF_TRIE_RESTUDY) \
4201 && ! restudied++ \
4202 ) goto reStudy
4203#else
4204#define CHECK_RESTUDY_GOTO
4205#endif
f9f4320a 4206
a687059c 4207/*
e50aee73 4208 - pregcomp - compile a regular expression into internal code
a687059c
LW
4209 *
4210 * We can't allocate space until we know how big the compiled form will be,
4211 * but we can't compile it (and thus know how big it is) until we've got a
4212 * place to put the code. So we cheat: we compile it twice, once with code
4213 * generation turned off and size counting turned on, and once "for real".
4214 * This also means that we don't allocate space until we are sure that the
4215 * thing really will compile successfully, and we never have to move the
4216 * code and thus invalidate pointers into it. (Note that it has to be in
4217 * one piece because free() must be able to free it all.) [NB: not true in perl]
4218 *
4219 * Beware that the optimization-preparation code in here knows about some
4220 * of the structure of the compiled regexp. [I'll say.]
4221 */
b9b4dddf
YO
4222
4223
4224
f9f4320a 4225#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4226#define RE_ENGINE_PTR &PL_core_reg_engine
4227#else
f9f4320a
YO
4228extern const struct regexp_engine my_reg_engine;
4229#define RE_ENGINE_PTR &my_reg_engine
4230#endif
6d5c990f
RGS
4231
4232#ifndef PERL_IN_XSUB_RE
3ab4a224 4233REGEXP *
1593ad57 4234Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4235{
97aff369 4236 dVAR;
6d5c990f 4237 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4238
4239 PERL_ARGS_ASSERT_PREGCOMP;
4240
f9f4320a
YO
4241 /* Dispatch a request to compile a regexp to correct
4242 regexp engine. */
f9f4320a
YO
4243 if (table) {
4244 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4245 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4246 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4247 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4248 DEBUG_COMPILE_r({
8d8756e7 4249 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4250 SvIV(*ptr));
4251 });
3ab4a224 4252 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4253 }
b9b4dddf 4254 }
3ab4a224 4255 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4256}
6d5c990f 4257#endif
2a5d9b1d 4258
3ab4a224 4259REGEXP *
1593ad57 4260Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4261{
4262 dVAR;
288b8c02
NC
4263 REGEXP *rx;
4264 struct regexp *r;
f8fc2ecf 4265 register regexp_internal *ri;
3ab4a224 4266 STRLEN plen;
1593ad57 4267 char *exp = SvPV(pattern, plen);
3ab4a224 4268 char* xend = exp + plen;
c277df42 4269 regnode *scan;
a0d0e21e 4270 I32 flags;
a0d0e21e
LW
4271 I32 minlen = 0;
4272 I32 sawplus = 0;
4273 I32 sawopen = 0;
2c2d71f5 4274 scan_data_t data;
830247a4 4275 RExC_state_t RExC_state;
be8e71aa 4276 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4277#ifdef TRIE_STUDY_OPT
4278 int restudied= 0;
4279 RExC_state_t copyRExC_state;
4280#endif
2a5d9b1d 4281 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4282
4283 PERL_ARGS_ASSERT_RE_COMPILE;
4284
6d5c990f 4285 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4286
b9ad30b4 4287 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4288
a3621e74 4289 DEBUG_COMPILE_r({
ab3bbdeb
YO
4290 SV *dsv= sv_newmortal();
4291 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4292 dsv, exp, plen, 60);
ab3bbdeb
YO
4293 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4294 PL_colors[4],PL_colors[5],s);
a5961de5 4295 });
02daf0ab
YO
4296
4297redo_first_pass:
4298 RExC_precomp = exp;
c737faaf 4299 RExC_flags = pm_flags;
830247a4 4300 RExC_sawback = 0;
bbce6d69 4301
830247a4
IZ
4302 RExC_seen = 0;
4303 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4304 RExC_seen_evals = 0;
4305 RExC_extralen = 0;
c277df42 4306
bbce6d69 4307 /* First pass: determine size, legality. */
830247a4 4308 RExC_parse = exp;
fac92740 4309 RExC_start = exp;
830247a4
IZ
4310 RExC_end = xend;
4311 RExC_naughty = 0;
4312 RExC_npar = 1;
e2e6a0f1 4313 RExC_nestroot = 0;
830247a4
IZ
4314 RExC_size = 0L;
4315 RExC_emit = &PL_regdummy;
4316 RExC_whilem_seen = 0;
40d049e4
YO
4317 RExC_open_parens = NULL;
4318 RExC_close_parens = NULL;
4319 RExC_opend = NULL;
81714fb9 4320 RExC_paren_names = NULL;
1f1031fe
YO
4321#ifdef DEBUGGING
4322 RExC_paren_name_list = NULL;
4323#endif
40d049e4
YO
4324 RExC_recurse = NULL;
4325 RExC_recurse_count = 0;
81714fb9 4326
85ddcde9
JH
4327#if 0 /* REGC() is (currently) a NOP at the first pass.
4328 * Clever compilers notice this and complain. --jhi */
830247a4 4329 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4330#endif
3dab1dad
YO
4331 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4332 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4333 RExC_precomp = NULL;
a0d0e21e
LW
4334 return(NULL);
4335 }
02daf0ab 4336 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4337 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4338 codepoints outside of the byte range, such as via \x{100}. If we
4339 detect such a sequence we have to convert the entire pattern to utf8
4340 and then recompile, as our sizing calculation will have been based
4341 on 1 byte == 1 character, but we will need to use utf8 to encode
4342 at least some part of the pattern, and therefore must convert the whole
4343 thing.
4344 XXX: somehow figure out how to make this less expensive...
4345 -- dmq */
3ab4a224 4346 STRLEN len = plen;
02daf0ab
YO
4347 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4348 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4349 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4350 xend = exp + len;
4351 RExC_orig_utf8 = RExC_utf8;
4352 SAVEFREEPV(exp);
4353 goto redo_first_pass;
4354 }
07be1b83 4355 DEBUG_PARSE_r({
81714fb9
YO
4356 PerlIO_printf(Perl_debug_log,
4357 "Required size %"IVdf" nodes\n"
4358 "Starting second pass (creation)\n",
4359 (IV)RExC_size);
07be1b83
YO
4360 RExC_lastnum=0;
4361 RExC_lastparse=NULL;
4362 });
c277df42
IZ
4363 /* Small enough for pointer-storage convention?
4364 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4365 if (RExC_size >= 0x10000L && RExC_extralen)
4366 RExC_size += RExC_extralen;
c277df42 4367 else
830247a4
IZ
4368 RExC_extralen = 0;
4369 if (RExC_whilem_seen > 15)
4370 RExC_whilem_seen = 15;
a0d0e21e 4371
f9f4320a
YO
4372 /* Allocate space and zero-initialize. Note, the two step process
4373 of zeroing when in debug mode, thus anything assigned has to
4374 happen after that */
d2f13c59 4375 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4376 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4377 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4378 char, regexp_internal);
4379 if ( r == NULL || ri == NULL )
b45f050a 4380 FAIL("Regexp out of space");
0f79a09d
GS
4381#ifdef DEBUGGING
4382 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4383 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4384#else
f8fc2ecf
YO
4385 /* bulk initialize base fields with 0. */
4386 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4387#endif
58e23c8d
YO
4388
4389 /* non-zero initialization begins here */
f8fc2ecf 4390 RXi_SET( r, ri );
f9f4320a 4391 r->engine= RE_ENGINE_PTR;
c737faaf 4392 r->extflags = pm_flags;
bcdf7404 4393 {
f7819f85 4394 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4395 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4396 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4397 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4398 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4399 const char *fptr = STD_PAT_MODS; /*"msix"*/
4400 char *p;
9d17798d 4401 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4402 + (sizeof(STD_PAT_MODS) - 1)
4403 + (sizeof("(?:)") - 1);
4404
ad64d0ec 4405 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
9d17798d 4406 SvCUR_set(rx, wraplen);
f7c278bf 4407 SvPOK_on(rx);
8f6ae13c 4408 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4409 *p++='('; *p++='?';
f7819f85
A
4410 if (has_p)
4411 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4412 {
4413 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4414 char *colon = r + 1;
4415 char ch;
4416
4417 while((ch = *fptr++)) {
4418 if(reganch & 1)
4419 *p++ = ch;
4420 else
4421 *r-- = ch;
4422 reganch >>= 1;
4423 }
4424 if(has_minus) {
4425 *r = '-';
4426 p = colon;
4427 }
4428 }
4429
28d8d7f4 4430 *p++ = ':';
bb661a58 4431 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4432 assert ((RX_WRAPPED(rx) - p) < 16);
4433 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4434 p += plen;
bcdf7404 4435 if (has_runon)
28d8d7f4
YO
4436 *p++ = '\n';
4437 *p++ = ')';
4438 *p = 0;
bcdf7404
YO
4439 }
4440
bbe252da 4441 r->intflags = 0;
830247a4 4442 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4443
6bda09f9 4444 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4445 Newxz(RExC_open_parens, RExC_npar,regnode *);
4446 SAVEFREEPV(RExC_open_parens);
4447 Newxz(RExC_close_parens,RExC_npar,regnode *);
4448 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4449 }
4450
4451 /* Useful during FAIL. */
7122b237
YO
4452#ifdef RE_TRACK_PATTERN_OFFSETS
4453 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4454 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4455 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4456 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4457 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4458#endif
4459 SetProgLen(ri,RExC_size);
288b8c02 4460 RExC_rx_sv = rx;
830247a4 4461 RExC_rx = r;
f8fc2ecf 4462 RExC_rxi = ri;
bbce6d69 4463
4464 /* Second pass: emit code. */
c737faaf 4465 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4466 RExC_parse = exp;
4467 RExC_end = xend;
4468 RExC_naughty = 0;
4469 RExC_npar = 1;
f8fc2ecf
YO
4470 RExC_emit_start = ri->program;
4471 RExC_emit = ri->program;
3b57cd43
YO
4472 RExC_emit_bound = ri->program + RExC_size + 1;
4473
2cd61cdb 4474 /* Store the count of eval-groups for security checks: */
f8149455 4475 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4476 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4477 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4478 ReREFCNT_dec(rx);
a0d0e21e 4479 return(NULL);
80757612 4480 }
07be1b83
YO
4481 /* XXXX To minimize changes to RE engine we always allocate
4482 3-units-long substrs field. */
4483 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4484 if (RExC_recurse_count) {
4485 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4486 SAVEFREEPV(RExC_recurse);
4487 }
a0d0e21e 4488
07be1b83 4489reStudy:
1de06328 4490 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4491 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4492
07be1b83 4493#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4494 if (!restudied) {
4495 StructCopy(&zero_scan_data, &data, scan_data_t);
4496 copyRExC_state = RExC_state;
4497 } else {
5d458dd8 4498 U32 seen=RExC_seen;
07be1b83 4499 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4500
4501 RExC_state = copyRExC_state;
4502 if (seen & REG_TOP_LEVEL_BRANCHES)
4503 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4504 else
4505 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4506 if (data.last_found) {
07be1b83 4507 SvREFCNT_dec(data.longest_fixed);
07be1b83 4508 SvREFCNT_dec(data.longest_float);
07be1b83 4509 SvREFCNT_dec(data.last_found);
1de06328 4510 }
40d049e4 4511 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4512 }
40d049e4
YO
4513#else
4514 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4515#endif
fc8cd66c 4516
a0d0e21e 4517 /* Dig out information for optimizations. */
f7819f85 4518 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4519 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4520
a0ed51b3 4521 if (UTF)
8f6ae13c 4522 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4523 ri->regstclass = NULL;
830247a4 4524 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4525 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4526 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4527
1de06328
YO
4528 /* testing for BRANCH here tells us whether there is "must appear"
4529 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4530 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4531 I32 fake;
c5254dd6 4532 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4533 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4534 int stclass_flag;
07be1b83 4535 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4536 regnode *first= scan;
4537 regnode *first_next= regnext(first);
4538
639081d6
YO
4539 /*
4540 * Skip introductions and multiplicators >= 1
4541 * so that we can extract the 'meat' of the pattern that must
4542 * match in the large if() sequence following.
4543 * NOTE that EXACT is NOT covered here, as it is normally
4544 * picked up by the optimiser separately.
4545 *
4546 * This is unfortunate as the optimiser isnt handling lookahead
4547 * properly currently.
4548 *
4549 */
a0d0e21e 4550 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4551 /* An OR of *one* alternative - should not happen now. */
5339e136 4552 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4553 /* for now we can't handle lookbehind IFMATCH*/
4554 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4555 (OP(first) == PLUS) ||
4556 (OP(first) == MINMOD) ||
653099ff 4557 /* An {n,m} with n>0 */
5339e136
YO
4558 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4559 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4560 {
639081d6
YO
4561 /*
4562 * the only op that could be a regnode is PLUS, all the rest
4563 * will be regnode_1 or regnode_2.
4564 *
4565 */
a0d0e21e
LW
4566 if (OP(first) == PLUS)
4567 sawplus = 1;
4568 else
3dab1dad 4569 first += regarglen[OP(first)];
639081d6
YO
4570
4571 first = NEXTOPER(first);
5339e136 4572 first_next= regnext(first);
a687059c
LW
4573 }
4574
a0d0e21e
LW
4575 /* Starting-point info. */
4576 again:
786e8c11 4577 DEBUG_PEEP("first:",first,0);
07be1b83 4578 /* Ignore EXACT as we deal with it later. */
3dab1dad 4579 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4580 if (OP(first) == EXACT)
6f207bd3 4581 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4582 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4583 ri->regstclass = first;
b3c9acc1 4584 }
07be1b83 4585#ifdef TRIE_STCLASS
786e8c11 4586 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4587 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4588 {
786e8c11 4589 regnode *trie_op;
07be1b83 4590 /* this can happen only on restudy */
786e8c11 4591 if ( OP(first) == TRIE ) {
c944940b 4592 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4593 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4594 StructCopy(first,trieop,struct regnode_1);
4595 trie_op=(regnode *)trieop;
4596 } else {
c944940b 4597 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4598 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4599 StructCopy(first,trieop,struct regnode_charclass);
4600 trie_op=(regnode *)trieop;
4601 }
1de06328 4602 OP(trie_op)+=2;
786e8c11 4603 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4604 ri->regstclass = trie_op;
07be1b83
YO
4605 }
4606#endif
e52fc539 4607 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4608 ri->regstclass = first;
3dab1dad
YO
4609 else if (PL_regkind[OP(first)] == BOUND ||
4610 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4611 ri->regstclass = first;
3dab1dad 4612 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4613 r->extflags |= (OP(first) == MBOL
4614 ? RXf_ANCH_MBOL
cad2e5aa 4615 : (OP(first) == SBOL
bbe252da
YO
4616 ? RXf_ANCH_SBOL
4617 : RXf_ANCH_BOL));
a0d0e21e 4618 first = NEXTOPER(first);
774d564b 4619 goto again;
4620 }
4621 else if (OP(first) == GPOS) {
bbe252da 4622 r->extflags |= RXf_ANCH_GPOS;
774d564b 4623 first = NEXTOPER(first);
4624 goto again;
a0d0e21e 4625 }
cf2a2b69
YO
4626 else if ((!sawopen || !RExC_sawback) &&
4627 (OP(first) == STAR &&
3dab1dad 4628 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4629 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4630 {
4631 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4632 const int type =
4633 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4634 ? RXf_ANCH_MBOL
4635 : RXf_ANCH_SBOL;
4636 r->extflags |= type;
4637 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4638 first = NEXTOPER(first);
774d564b 4639 goto again;
a0d0e21e 4640 }
b81d288d 4641 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4642 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4643 /* x+ must match at the 1st pos of run of x's */
bbe252da 4644 r->intflags |= PREGf_SKIP;
a0d0e21e 4645
c277df42 4646 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4647#ifdef TRIE_STUDY_OPT
81714fb9 4648 DEBUG_PARSE_r(
be8e71aa
YO
4649 if (!restudied)
4650 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4651 (IV)(first - scan + 1))
4652 );
4653#else
81714fb9 4654 DEBUG_PARSE_r(
be8e71aa
YO
4655 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4656 (IV)(first - scan + 1))
4657 );
4658#endif
4659
4660
a0d0e21e
LW
4661 /*
4662 * If there's something expensive in the r.e., find the
4663 * longest literal string that must appear and make it the
4664 * regmust. Resolve ties in favor of later strings, since
4665 * the regstart check works with the beginning of the r.e.
4666 * and avoiding duplication strengthens checking. Not a
4667 * strong reason, but sufficient in the absence of others.
4668 * [Now we resolve ties in favor of the earlier string if
c277df42 4669 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4670 * earlier string may buy us something the later one won't.]
4671 */
de8c5301 4672
396482e1
GA
4673 data.longest_fixed = newSVpvs("");
4674 data.longest_float = newSVpvs("");
4675 data.last_found = newSVpvs("");
c277df42
IZ
4676 data.longest = &(data.longest_fixed);
4677 first = scan;
f8fc2ecf 4678 if (!ri->regstclass) {
830247a4 4679 cl_init(pRExC_state, &ch_class);
653099ff
GS
4680 data.start_class = &ch_class;
4681 stclass_flag = SCF_DO_STCLASS_AND;
4682 } else /* XXXX Check for BOUND? */
4683 stclass_flag = 0;
cb434fcc 4684 data.last_closep = &last_close;
de8c5301 4685
1de06328 4686 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4687 &data, -1, NULL, NULL,
4688 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4689
07be1b83 4690
786e8c11
YO
4691 CHECK_RESTUDY_GOTO;
4692
4693
830247a4 4694 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4695 && data.last_start_min == 0 && data.last_end > 0
830247a4 4696 && !RExC_seen_zerolen
2bf803e2 4697 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4698 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4699 r->extflags |= RXf_CHECK_ALL;
304ee84b 4700 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4701 SvREFCNT_dec(data.last_found);
4702
1de06328
YO
4703 /* Note that code very similar to this but for anchored string
4704 follows immediately below, changes may need to be made to both.
4705 Be careful.
4706 */
a0ed51b3 4707 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4708 if (longest_float_length
c277df42
IZ
4709 || (data.flags & SF_FL_BEFORE_EOL
4710 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4711 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4712 {
1182767e 4713 I32 t,ml;
cf93c79d 4714
1de06328 4715 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4716 && data.offset_fixed == data.offset_float_min
4717 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4718 goto remove_float; /* As in (a)+. */
4719
1de06328
YO
4720 /* copy the information about the longest float from the reg_scan_data
4721 over to the program. */
33b8afdf
JH
4722 if (SvUTF8(data.longest_float)) {
4723 r->float_utf8 = data.longest_float;
c445ea15 4724 r->float_substr = NULL;
33b8afdf
JH
4725 } else {
4726 r->float_substr = data.longest_float;
c445ea15 4727 r->float_utf8 = NULL;
33b8afdf 4728 }
1de06328
YO
4729 /* float_end_shift is how many chars that must be matched that
4730 follow this item. We calculate it ahead of time as once the
4731 lookbehind offset is added in we lose the ability to correctly
4732 calculate it.*/
4733 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4734 : (I32)longest_float_length;
1de06328
YO
4735 r->float_end_shift = ml - data.offset_float_min
4736 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4737 + data.lookbehind_float;
4738 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4739 r->float_max_offset = data.offset_float_max;
1182767e 4740 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4741 r->float_max_offset -= data.lookbehind_float;
4742
cf93c79d
IZ
4743 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4744 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4745 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4746 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4747 }
4748 else {
aca2d497 4749 remove_float:
c445ea15 4750 r->float_substr = r->float_utf8 = NULL;
c277df42 4751 SvREFCNT_dec(data.longest_float);
c5254dd6 4752 longest_float_length = 0;
a0d0e21e 4753 }
c277df42 4754
1de06328
YO
4755 /* Note that code very similar to this but for floating string
4756 is immediately above, changes may need to be made to both.
4757 Be careful.
4758 */
a0ed51b3 4759 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4760 if (longest_fixed_length
c277df42
IZ
4761 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4762 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4763 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4764 {
1182767e 4765 I32 t,ml;
cf93c79d 4766
1de06328
YO
4767 /* copy the information about the longest fixed
4768 from the reg_scan_data over to the program. */
33b8afdf
JH
4769 if (SvUTF8(data.longest_fixed)) {
4770 r->anchored_utf8 = data.longest_fixed;
c445ea15 4771 r->anchored_substr = NULL;
33b8afdf
JH
4772 } else {
4773 r->anchored_substr = data.longest_fixed;
c445ea15 4774 r->anchored_utf8 = NULL;
33b8afdf 4775 }
1de06328
YO
4776 /* fixed_end_shift is how many chars that must be matched that
4777 follow this item. We calculate it ahead of time as once the
4778 lookbehind offset is added in we lose the ability to correctly
4779 calculate it.*/
4780 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4781 : (I32)longest_fixed_length;
1de06328
YO
4782 r->anchored_end_shift = ml - data.offset_fixed
4783 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4784 + data.lookbehind_fixed;
4785 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4786
cf93c79d
IZ
4787 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4788 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4789 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4790 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4791 }
4792 else {
c445ea15 4793 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4794 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4795 longest_fixed_length = 0;
a0d0e21e 4796 }
f8fc2ecf
YO
4797 if (ri->regstclass
4798 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4799 ri->regstclass = NULL;
33b8afdf
JH
4800 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4801 && stclass_flag
653099ff 4802 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4803 && !cl_is_anything(data.start_class))
4804 {
2eccd3b2 4805 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4806
f8fc2ecf 4807 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4808 struct regnode_charclass_class);
4809 StructCopy(data.start_class,
f8fc2ecf 4810 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4811 struct regnode_charclass_class);
f8fc2ecf 4812 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4813 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4814 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4815 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4816 PerlIO_printf(Perl_debug_log,
a0288114 4817 "synthetic stclass \"%s\".\n",
3f7c398e 4818 SvPVX_const(sv));});
653099ff 4819 }
c277df42
IZ
4820
4821 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4822 if (longest_fixed_length > longest_float_length) {
1de06328 4823 r->check_end_shift = r->anchored_end_shift;
c277df42 4824 r->check_substr = r->anchored_substr;
33b8afdf 4825 r->check_utf8 = r->anchored_utf8;
c277df42 4826 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4827 if (r->extflags & RXf_ANCH_SINGLE)
4828 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4829 }
4830 else {
1de06328 4831 r->check_end_shift = r->float_end_shift;
c277df42 4832 r->check_substr = r->float_substr;
33b8afdf 4833 r->check_utf8 = r->float_utf8;
1de06328
YO
4834 r->check_offset_min = r->float_min_offset;
4835 r->check_offset_max = r->float_max_offset;
a0d0e21e 4836 }
30382c73
IZ
4837 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4838 This should be changed ASAP! */
bbe252da
YO
4839 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4840 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4841 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4842 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4843 }
1de06328
YO
4844 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4845 if ( (STRLEN)minlen < longest_float_length )
4846 minlen= longest_float_length;
4847 if ( (STRLEN)minlen < longest_fixed_length )
4848 minlen= longest_fixed_length;
4849 */
a0ed51b3
LW
4850 }
4851 else {
c277df42
IZ
4852 /* Several toplevels. Best we can is to set minlen. */
4853 I32 fake;
653099ff 4854 struct regnode_charclass_class ch_class;
cb434fcc 4855 I32 last_close = 0;
c277df42 4856
5d458dd8 4857 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4858
f8fc2ecf 4859 scan = ri->program + 1;
830247a4 4860 cl_init(pRExC_state, &ch_class);
653099ff 4861 data.start_class = &ch_class;
cb434fcc 4862 data.last_closep = &last_close;
07be1b83 4863
de8c5301 4864
1de06328 4865 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4866 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4867
786e8c11 4868 CHECK_RESTUDY_GOTO;
07be1b83 4869
33b8afdf 4870 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4871 = r->float_substr = r->float_utf8 = NULL;
653099ff 4872 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4873 && !cl_is_anything(data.start_class))
4874 {
2eccd3b2 4875 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4876
f8fc2ecf 4877 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4878 struct regnode_charclass_class);
4879 StructCopy(data.start_class,
f8fc2ecf 4880 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4881 struct regnode_charclass_class);
f8fc2ecf 4882 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4883 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4884 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4885 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4886 PerlIO_printf(Perl_debug_log,
a0288114 4887 "synthetic stclass \"%s\".\n",
3f7c398e 4888 SvPVX_const(sv));});
653099ff 4889 }
a0d0e21e
LW
4890 }
4891
1de06328
YO
4892 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4893 the "real" pattern. */
cf9788e3
RGS
4894 DEBUG_OPTIMISE_r({
4895 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4896 (IV)minlen, (IV)r->minlen);
cf9788e3 4897 });
de8c5301 4898 r->minlenret = minlen;
1de06328
YO
4899 if (r->minlen < minlen)
4900 r->minlen = minlen;
4901
b81d288d 4902 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4903 r->extflags |= RXf_GPOS_SEEN;
830247a4 4904 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4905 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4906 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4907 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4908 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4909 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4910 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4911 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4912 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4913 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4914 if (RExC_paren_names)
85fbaab2 4915 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4916 else
5daac39c 4917 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4918
7bd1e614 4919#ifdef STUPID_PATTERN_CHECKS
5509d87a 4920 if (RX_PRELEN(rx) == 0)
640f820d 4921 r->extflags |= RXf_NULL;
5509d87a 4922 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4923 /* XXX: this should happen BEFORE we compile */
4924 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4925 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4926 r->extflags |= RXf_WHITE;
5509d87a 4927 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4928 r->extflags |= RXf_START_ONLY;
f1b875a0 4929#else
5509d87a 4930 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4931 /* XXX: this should happen BEFORE we compile */
4932 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4933 else {
4934 regnode *first = ri->program + 1;
39aa8307
JH
4935 U8 fop = OP(first);
4936 U8 nop = OP(NEXTOPER(first));
7bd1e614 4937
640f820d
AB
4938 if (PL_regkind[fop] == NOTHING && nop == END)
4939 r->extflags |= RXf_NULL;
4940 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4941 r->extflags |= RXf_START_ONLY;
4942 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4943 r->extflags |= RXf_WHITE;
4944 }
f1b875a0 4945#endif
1f1031fe
YO
4946#ifdef DEBUGGING
4947 if (RExC_paren_names) {
af534a04 4948 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
4949 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4950 } else
1f1031fe 4951#endif
cde0cee5 4952 ri->name_list_idx = 0;
1f1031fe 4953
40d049e4
YO
4954 if (RExC_recurse_count) {
4955 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4956 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4957 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4958 }
4959 }
f0ab9afb 4960 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4961 /* assume we don't need to swap parens around before we match */
4962
be8e71aa
YO
4963 DEBUG_DUMP_r({
4964 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4965 regdump(r);
4966 });
7122b237
YO
4967#ifdef RE_TRACK_PATTERN_OFFSETS
4968 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4969 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4970 U32 i;
4971 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4972 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4973 for (i = 1; i <= len; i++) {
7122b237 4974 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4975 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4976 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4977 }
4978 PerlIO_printf(Perl_debug_log, "\n");
4979 });
7122b237 4980#endif
288b8c02 4981 return rx;
a687059c
LW
4982}
4983
f9f4320a 4984#undef RE_ENGINE_PTR
3dab1dad 4985
93b32b6d 4986
81714fb9 4987SV*
192b9cd1
AB
4988Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4989 const U32 flags)
4990{
7918f24d
NC
4991 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4992
192b9cd1
AB
4993 PERL_UNUSED_ARG(value);
4994
f1b875a0 4995 if (flags & RXapif_FETCH) {
192b9cd1 4996 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4997 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
f1f66076 4998 Perl_croak(aTHX_ "%s", PL_no_modify);
192b9cd1 4999 return NULL;
f1b875a0 5000 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5001 return reg_named_buff_exists(rx, key, flags)
5002 ? &PL_sv_yes
5003 : &PL_sv_no;
f1b875a0 5004 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5005 return reg_named_buff_all(rx, flags);
f1b875a0 5006 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5007 return reg_named_buff_scalar(rx, flags);
5008 } else {
5009 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5010 return NULL;
5011 }
5012}
5013
5014SV*
5015Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5016 const U32 flags)
5017{
7918f24d 5018 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5019 PERL_UNUSED_ARG(lastkey);
5020
f1b875a0 5021 if (flags & RXapif_FIRSTKEY)
192b9cd1 5022 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5023 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5024 return reg_named_buff_nextkey(rx, flags);
5025 else {
5026 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5027 return NULL;
5028 }
5029}
5030
5031SV*
288b8c02
NC
5032Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5033 const U32 flags)
81714fb9 5034{
44a2ac75
YO
5035 AV *retarray = NULL;
5036 SV *ret;
288b8c02 5037 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5038
5039 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5040
f1b875a0 5041 if (flags & RXapif_ALL)
44a2ac75 5042 retarray=newAV();
93b32b6d 5043
5daac39c
NC
5044 if (rx && RXp_PAREN_NAMES(rx)) {
5045 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5046 if (he_str) {
5047 IV i;
5048 SV* sv_dat=HeVAL(he_str);
5049 I32 *nums=(I32*)SvPVX(sv_dat);
5050 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5051 if ((I32)(rx->nparens) >= nums[i]
5052 && rx->offs[nums[i]].start != -1
5053 && rx->offs[nums[i]].end != -1)
93b32b6d 5054 {
49d7dfbc 5055 ret = newSVpvs("");
288b8c02 5056 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5057 if (!retarray)
5058 return ret;
5059 } else {
5060 ret = newSVsv(&PL_sv_undef);
5061 }
ec83ea38 5062 if (retarray)
93b32b6d 5063 av_push(retarray, ret);
81714fb9 5064 }
93b32b6d 5065 if (retarray)
ad64d0ec 5066 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5067 }
5068 }
5069 return NULL;
5070}
5071
5072bool
288b8c02 5073Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5074 const U32 flags)
5075{
288b8c02 5076 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5077
5078 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5079
5daac39c 5080 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5081 if (flags & RXapif_ALL) {
5daac39c 5082 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5083 } else {
288b8c02 5084 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5085 if (sv) {
5086 SvREFCNT_dec(sv);
192b9cd1
AB
5087 return TRUE;
5088 } else {
5089 return FALSE;
5090 }
5091 }
5092 } else {
5093 return FALSE;
5094 }
5095}
5096
5097SV*
288b8c02 5098Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5099{
288b8c02 5100 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5101
5102 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5103
5daac39c
NC
5104 if ( rx && RXp_PAREN_NAMES(rx) ) {
5105 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5106
288b8c02 5107 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5108 } else {
5109 return FALSE;
5110 }
192b9cd1
AB
5111}
5112
5113SV*
288b8c02 5114Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5115{
288b8c02 5116 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5117 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5118
5119 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5120
5daac39c
NC
5121 if (rx && RXp_PAREN_NAMES(rx)) {
5122 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5123 HE *temphe;
5124 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5125 IV i;
5126 IV parno = 0;
5127 SV* sv_dat = HeVAL(temphe);
5128 I32 *nums = (I32*)SvPVX(sv_dat);
5129 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5130 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5131 rx->offs[nums[i]].start != -1 &&
5132 rx->offs[nums[i]].end != -1)
5133 {
5134 parno = nums[i];
5135 break;
5136 }
5137 }
f1b875a0 5138 if (parno || flags & RXapif_ALL) {
a663657d 5139 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5140 }
81714fb9
YO
5141 }
5142 }
44a2ac75
YO
5143 return NULL;
5144}
5145
192b9cd1 5146SV*
288b8c02 5147Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5148{
5149 SV *ret;
5150 AV *av;
5151 I32 length;
288b8c02 5152 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5153
7918f24d
NC
5154 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5155
5daac39c 5156 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5157 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5158 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5159 } else if (flags & RXapif_ONE) {
288b8c02 5160 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5161 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5162 length = av_len(av);
ec83ea38 5163 SvREFCNT_dec(ret);
192b9cd1
AB
5164 return newSViv(length + 1);
5165 } else {
5166 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5167 return NULL;
5168 }
5169 }
5170 return &PL_sv_undef;
5171}
5172
5173SV*
288b8c02 5174Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5175{
288b8c02 5176 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5177 AV *av = newAV();
5178
7918f24d
NC
5179 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5180
5daac39c
NC
5181 if (rx && RXp_PAREN_NAMES(rx)) {
5182 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5183 HE *temphe;
5184 (void)hv_iterinit(hv);
5185 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5186 IV i;
5187 IV parno = 0;
5188 SV* sv_dat = HeVAL(temphe);
5189 I32 *nums = (I32*)SvPVX(sv_dat);
5190 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5191 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5192 rx->offs[nums[i]].start != -1 &&
5193 rx->offs[nums[i]].end != -1)
5194 {
5195 parno = nums[i];
5196 break;
5197 }
5198 }
f1b875a0 5199 if (parno || flags & RXapif_ALL) {
a663657d 5200 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5201 }
5202 }
5203 }
5204
ad64d0ec 5205 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5206}
5207
49d7dfbc 5208void
288b8c02
NC
5209Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5210 SV * const sv)
44a2ac75 5211{
288b8c02 5212 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5213 char *s = NULL;
a9d504c3 5214 I32 i = 0;
44a2ac75 5215 I32 s1, t1;
7918f24d
NC
5216
5217 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5218
cde0cee5
YO
5219 if (!rx->subbeg) {
5220 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5221 return;
cde0cee5
YO
5222 }
5223 else
f1b875a0 5224 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5225 /* $` */
f0ab9afb 5226 i = rx->offs[0].start;
cde0cee5 5227 s = rx->subbeg;
44a2ac75
YO
5228 }
5229 else
f1b875a0 5230 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5231 /* $' */
f0ab9afb
NC
5232 s = rx->subbeg + rx->offs[0].end;
5233 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5234 }
5235 else
5236 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5237 (s1 = rx->offs[paren].start) != -1 &&
5238 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5239 {
5240 /* $& $1 ... */
5241 i = t1 - s1;
5242 s = rx->subbeg + s1;
cde0cee5
YO
5243 } else {
5244 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5245 return;
cde0cee5
YO
5246 }
5247 assert(rx->sublen >= (s - rx->subbeg) + i );
5248 if (i >= 0) {
5249 const int oldtainted = PL_tainted;
5250 TAINT_NOT;
5251 sv_setpvn(sv, s, i);
5252 PL_tainted = oldtainted;
5253 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5254 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5255 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5256 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5257 {
5258 SvUTF8_on(sv);
5259 }
5260 else
5261 SvUTF8_off(sv);
5262 if (PL_tainting) {
07bc277f 5263 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5264 if (SvTYPE(sv) >= SVt_PVMG) {
5265 MAGIC* const mg = SvMAGIC(sv);
5266 MAGIC* mgt;
5267 PL_tainted = 1;
5268 SvMAGIC_set(sv, mg->mg_moremagic);
5269 SvTAINT(sv);
5270 if ((mgt = SvMAGIC(sv))) {
5271 mg->mg_moremagic = mgt;
5272 SvMAGIC_set(sv, mg);
44a2ac75 5273 }
cde0cee5
YO
5274 } else {
5275 PL_tainted = 1;
5276 SvTAINT(sv);
5277 }
5278 } else
5279 SvTAINTED_off(sv);
44a2ac75 5280 }
81714fb9 5281 } else {
44a2ac75 5282 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5283 return;
81714fb9
YO
5284 }
5285}
93b32b6d 5286
2fdbfb4d
AB
5287void
5288Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5289 SV const * const value)
5290{
7918f24d
NC
5291 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5292
2fdbfb4d
AB
5293 PERL_UNUSED_ARG(rx);
5294 PERL_UNUSED_ARG(paren);
5295 PERL_UNUSED_ARG(value);
5296
5297 if (!PL_localizing)
f1f66076 5298 Perl_croak(aTHX_ "%s", PL_no_modify);
2fdbfb4d
AB
5299}
5300
5301I32
288b8c02 5302Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5303 const I32 paren)
5304{
288b8c02 5305 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5306 I32 i;
5307 I32 s1, t1;
5308
7918f24d
NC
5309 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5310
2fdbfb4d
AB
5311 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5312 switch (paren) {
192b9cd1 5313 /* $` / ${^PREMATCH} */
f1b875a0 5314 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5315 if (rx->offs[0].start != -1) {
5316 i = rx->offs[0].start;
5317 if (i > 0) {
5318 s1 = 0;
5319 t1 = i;
5320 goto getlen;
5321 }
5322 }
5323 return 0;
192b9cd1 5324 /* $' / ${^POSTMATCH} */
f1b875a0 5325 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5326 if (rx->offs[0].end != -1) {
5327 i = rx->sublen - rx->offs[0].end;
5328 if (i > 0) {
5329 s1 = rx->offs[0].end;
5330 t1 = rx->sublen;
5331 goto getlen;
5332 }
5333 }
5334 return 0;
192b9cd1
AB
5335 /* $& / ${^MATCH}, $1, $2, ... */
5336 default:
2fdbfb4d
AB
5337 if (paren <= (I32)rx->nparens &&
5338 (s1 = rx->offs[paren].start) != -1 &&
5339 (t1 = rx->offs[paren].end) != -1)
5340 {
5341 i = t1 - s1;
5342 goto getlen;
5343 } else {
5344 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5345 report_uninit((const SV *)sv);
2fdbfb4d
AB
5346 return 0;
5347 }
5348 }
5349 getlen:
07bc277f 5350 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5351 const char * const s = rx->subbeg + s1;
5352 const U8 *ep;
5353 STRLEN el;
5354
5355 i = t1 - s1;
5356 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5357 i = el;
5358 }
5359 return i;
5360}
5361
fe578d7f 5362SV*
49d7dfbc 5363Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5364{
7918f24d 5365 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5366 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5367 if (0)
5368 return NULL;
5369 else
5370 return newSVpvs("Regexp");
fe578d7f 5371}
0a4db386 5372
894be9b7 5373/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5374 * If flags is REG_RSN_RETURN_NULL returns null.
5375 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5376 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5377 * to the parsed name as looked up in the RExC_paren_names hash.
5378 * If there is an error throws a vFAIL().. type exception.
894be9b7 5379 */
0a4db386
YO
5380
5381#define REG_RSN_RETURN_NULL 0
5382#define REG_RSN_RETURN_NAME 1
5383#define REG_RSN_RETURN_DATA 2
5384
894be9b7 5385STATIC SV*
7918f24d
NC
5386S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5387{
894be9b7 5388 char *name_start = RExC_parse;
1f1031fe 5389
7918f24d
NC
5390 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5391
1f1031fe
YO
5392 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5393 /* skip IDFIRST by using do...while */
5394 if (UTF)
5395 do {
5396 RExC_parse += UTF8SKIP(RExC_parse);
5397 } while (isALNUM_utf8((U8*)RExC_parse));
5398 else
5399 do {
5400 RExC_parse++;
5401 } while (isALNUM(*RExC_parse));
894be9b7 5402 }
1f1031fe 5403
0a4db386 5404 if ( flags ) {
59cd0e26
NC
5405 SV* sv_name
5406 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5407 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5408 if ( flags == REG_RSN_RETURN_NAME)
5409 return sv_name;
5410 else if (flags==REG_RSN_RETURN_DATA) {
5411 HE *he_str = NULL;
5412 SV *sv_dat = NULL;
5413 if ( ! sv_name ) /* should not happen*/
5414 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5415 if (RExC_paren_names)
5416 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5417 if ( he_str )
5418 sv_dat = HeVAL(he_str);
5419 if ( ! sv_dat )
5420 vFAIL("Reference to nonexistent named group");
5421 return sv_dat;
5422 }
5423 else {
5424 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5425 }
5426 /* NOT REACHED */
894be9b7 5427 }
0a4db386 5428 return NULL;
894be9b7
YO
5429}
5430
3dab1dad
YO
5431#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5432 int rem=(int)(RExC_end - RExC_parse); \
5433 int cut; \
5434 int num; \
5435 int iscut=0; \
5436 if (rem>10) { \
5437 rem=10; \
5438 iscut=1; \
5439 } \
5440 cut=10-rem; \
5441 if (RExC_lastparse!=RExC_parse) \
5442 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5443 rem, RExC_parse, \
5444 cut + 4, \
5445 iscut ? "..." : "<" \
5446 ); \
5447 else \
5448 PerlIO_printf(Perl_debug_log,"%16s",""); \
5449 \
5450 if (SIZE_ONLY) \
3b57cd43 5451 num = RExC_size + 1; \
3dab1dad
YO
5452 else \
5453 num=REG_NODE_NUM(RExC_emit); \
5454 if (RExC_lastnum!=num) \
0a4db386 5455 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5456 else \
0a4db386 5457 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5458 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5459 (int)((depth*2)), "", \
3dab1dad
YO
5460 (funcname) \
5461 ); \
5462 RExC_lastnum=num; \
5463 RExC_lastparse=RExC_parse; \
5464})
5465
07be1b83
YO
5466
5467
3dab1dad
YO
5468#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5469 DEBUG_PARSE_MSG((funcname)); \
5470 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5471})
6bda09f9
YO
5472#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5473 DEBUG_PARSE_MSG((funcname)); \
5474 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5475})
a687059c
LW
5476/*
5477 - reg - regular expression, i.e. main body or parenthesized thing
5478 *
5479 * Caller must absorb opening parenthesis.
5480 *
5481 * Combining parenthesis handling with the base level of regular expression
5482 * is a trifle forced, but the need to tie the tails of the branches to what
5483 * follows makes it hard to avoid.
5484 */
07be1b83
YO
5485#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5486#ifdef DEBUGGING
5487#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5488#else
5489#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5490#endif
3dab1dad 5491
76e3520e 5492STATIC regnode *
3dab1dad 5493S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5494 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5495{
27da23d5 5496 dVAR;
c277df42
IZ
5497 register regnode *ret; /* Will be the head of the group. */
5498 register regnode *br;
5499 register regnode *lastbr;
cbbf8932 5500 register regnode *ender = NULL;
a0d0e21e 5501 register I32 parno = 0;
cbbf8932 5502 I32 flags;
f7819f85 5503 U32 oregflags = RExC_flags;
6136c704
AL
5504 bool have_branch = 0;
5505 bool is_open = 0;
594d7033
YO
5506 I32 freeze_paren = 0;
5507 I32 after_freeze = 0;
9d1d55b5
JP
5508
5509 /* for (?g), (?gc), and (?o) warnings; warning
5510 about (?c) will warn about (?g) -- japhy */
5511
6136c704
AL
5512#define WASTED_O 0x01
5513#define WASTED_G 0x02
5514#define WASTED_C 0x04
5515#define WASTED_GC (0x02|0x04)
cbbf8932 5516 I32 wastedflags = 0x00;
9d1d55b5 5517
fac92740 5518 char * parse_start = RExC_parse; /* MJD */
a28509cc 5519 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5520
3dab1dad 5521 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5522
5523 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5524 DEBUG_PARSE("reg ");
5525
821b33a5 5526 *flagp = 0; /* Tentatively. */
a0d0e21e 5527
9d1d55b5 5528
a0d0e21e
LW
5529 /* Make an OPEN node, if parenthesized. */
5530 if (paren) {
e2e6a0f1
YO
5531 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5532 char *start_verb = RExC_parse;
5533 STRLEN verb_len = 0;
5534 char *start_arg = NULL;
5535 unsigned char op = 0;
5536 int argok = 1;
5537 int internal_argval = 0; /* internal_argval is only useful if !argok */
5538 while ( *RExC_parse && *RExC_parse != ')' ) {
5539 if ( *RExC_parse == ':' ) {
5540 start_arg = RExC_parse + 1;
5541 break;
5542 }
5543 RExC_parse++;
5544 }
5545 ++start_verb;
5546 verb_len = RExC_parse - start_verb;
5547 if ( start_arg ) {
5548 RExC_parse++;
5549 while ( *RExC_parse && *RExC_parse != ')' )
5550 RExC_parse++;
5551 if ( *RExC_parse != ')' )
5552 vFAIL("Unterminated verb pattern argument");
5553 if ( RExC_parse == start_arg )
5554 start_arg = NULL;
5555 } else {
5556 if ( *RExC_parse != ')' )
5557 vFAIL("Unterminated verb pattern");
5558 }
5d458dd8 5559
e2e6a0f1
YO
5560 switch ( *start_verb ) {
5561 case 'A': /* (*ACCEPT) */
568a785a 5562 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5563 op = ACCEPT;
5564 internal_argval = RExC_nestroot;
5565 }
5566 break;
5567 case 'C': /* (*COMMIT) */
568a785a 5568 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5569 op = COMMIT;
e2e6a0f1
YO
5570 break;
5571 case 'F': /* (*FAIL) */
568a785a 5572 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5573 op = OPFAIL;
5574 argok = 0;
5575 }
5576 break;
5d458dd8
YO
5577 case ':': /* (*:NAME) */
5578 case 'M': /* (*MARK:NAME) */
568a785a 5579 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5580 op = MARKPOINT;
5d458dd8
YO
5581 argok = -1;
5582 }
5583 break;
5584 case 'P': /* (*PRUNE) */
568a785a 5585 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5586 op = PRUNE;
e2e6a0f1 5587 break;
5d458dd8 5588 case 'S': /* (*SKIP) */
568a785a 5589 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5590 op = SKIP;
5591 break;
5592 case 'T': /* (*THEN) */
5593 /* [19:06] <TimToady> :: is then */
568a785a 5594 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5595 op = CUTGROUP;
5596 RExC_seen |= REG_SEEN_CUTGROUP;
5597 }
e2e6a0f1
YO
5598 break;
5599 }
5600 if ( ! op ) {
5601 RExC_parse++;
5602 vFAIL3("Unknown verb pattern '%.*s'",
5603 verb_len, start_verb);
5604 }
5605 if ( argok ) {
5606 if ( start_arg && internal_argval ) {
5607 vFAIL3("Verb pattern '%.*s' may not have an argument",
5608 verb_len, start_verb);
5609 } else if ( argok < 0 && !start_arg ) {
5610 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5611 verb_len, start_verb);
5612 } else {
5613 ret = reganode(pRExC_state, op, internal_argval);
5614 if ( ! internal_argval && ! SIZE_ONLY ) {
5615 if (start_arg) {
5616 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5617 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5618 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5619 ret->flags = 0;
5620 } else {
5621 ret->flags = 1;
5622 }
5623 }
5624 }
5625 if (!internal_argval)
5626 RExC_seen |= REG_SEEN_VERBARG;
5627 } else if ( start_arg ) {
5628 vFAIL3("Verb pattern '%.*s' may not have an argument",
5629 verb_len, start_verb);
5630 } else {
5631 ret = reg_node(pRExC_state, op);
5632 }
5633 nextchar(pRExC_state);
5634 return ret;
5635 } else
fac92740 5636 if (*RExC_parse == '?') { /* (?...) */
6136c704 5637 bool is_logical = 0;
a28509cc 5638 const char * const seqstart = RExC_parse;
ca9dfc88 5639
830247a4
IZ
5640 RExC_parse++;
5641 paren = *RExC_parse++;
c277df42 5642 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5643 switch (paren) {
894be9b7 5644
1f1031fe
YO
5645 case 'P': /* (?P...) variants for those used to PCRE/Python */
5646 paren = *RExC_parse++;
5647 if ( paren == '<') /* (?P<...>) named capture */
5648 goto named_capture;
5649 else if (paren == '>') { /* (?P>name) named recursion */
5650 goto named_recursion;
5651 }
5652 else if (paren == '=') { /* (?P=...) named backref */
5653 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5654 you change this make sure you change that */
5655 char* name_start = RExC_parse;
5656 U32 num = 0;
5657 SV *sv_dat = reg_scan_name(pRExC_state,
5658 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5659 if (RExC_parse == name_start || *RExC_parse != ')')
5660 vFAIL2("Sequence %.3s... not terminated",parse_start);
5661
5662 if (!SIZE_ONLY) {
5663 num = add_data( pRExC_state, 1, "S" );
5664 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5665 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5666 }
5667 RExC_sawback = 1;
5668 ret = reganode(pRExC_state,
5669 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5670 num);
5671 *flagp |= HASWIDTH;
5672
5673 Set_Node_Offset(ret, parse_start+1);
5674 Set_Node_Cur_Length(ret); /* MJD */
5675
5676 nextchar(pRExC_state);
5677 return ret;
5678 }
57b84237
YO
5679 RExC_parse++;
5680 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5681 /*NOTREACHED*/
5682 case '<': /* (?<...) */
b81d288d 5683 if (*RExC_parse == '!')
c277df42 5684 paren = ',';
0a4db386 5685 else if (*RExC_parse != '=')
1f1031fe 5686 named_capture:
0a4db386 5687 { /* (?<...>) */
81714fb9 5688 char *name_start;
894be9b7 5689 SV *svname;
81714fb9
YO
5690 paren= '>';
5691 case '\'': /* (?'...') */
5692 name_start= RExC_parse;
0a4db386
YO
5693 svname = reg_scan_name(pRExC_state,
5694 SIZE_ONLY ? /* reverse test from the others */
5695 REG_RSN_RETURN_NAME :
5696 REG_RSN_RETURN_NULL);
57b84237
YO
5697 if (RExC_parse == name_start) {
5698 RExC_parse++;
5699 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5700 /*NOTREACHED*/
5701 }
81714fb9
YO
5702 if (*RExC_parse != paren)
5703 vFAIL2("Sequence (?%c... not terminated",
5704 paren=='>' ? '<' : paren);
5705 if (SIZE_ONLY) {
e62cc96a
YO
5706 HE *he_str;
5707 SV *sv_dat = NULL;
894be9b7
YO
5708 if (!svname) /* shouldnt happen */
5709 Perl_croak(aTHX_
5710 "panic: reg_scan_name returned NULL");
81714fb9
YO
5711 if (!RExC_paren_names) {
5712 RExC_paren_names= newHV();
ad64d0ec 5713 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5714#ifdef DEBUGGING
5715 RExC_paren_name_list= newAV();
ad64d0ec 5716 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5717#endif
81714fb9
YO
5718 }
5719 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5720 if ( he_str )
81714fb9 5721 sv_dat = HeVAL(he_str);
e62cc96a 5722 if ( ! sv_dat ) {
81714fb9 5723 /* croak baby croak */
e62cc96a
YO
5724 Perl_croak(aTHX_
5725 "panic: paren_name hash element allocation failed");
5726 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5727 /* (?|...) can mean we have dupes so scan to check
5728 its already been stored. Maybe a flag indicating
5729 we are inside such a construct would be useful,
5730 but the arrays are likely to be quite small, so
5731 for now we punt -- dmq */
5732 IV count = SvIV(sv_dat);
5733 I32 *pv = (I32*)SvPVX(sv_dat);
5734 IV i;
5735 for ( i = 0 ; i < count ; i++ ) {
5736 if ( pv[i] == RExC_npar ) {
5737 count = 0;
5738 break;
5739 }
5740 }
5741 if ( count ) {
5742 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5743 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5744 pv[count] = RExC_npar;
3a92e6ae 5745 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5746 }
81714fb9
YO
5747 } else {
5748 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5749 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5750 SvIOK_on(sv_dat);
3ec35e0f 5751 SvIV_set(sv_dat, 1);
e62cc96a 5752 }
1f1031fe
YO
5753#ifdef DEBUGGING
5754 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5755 SvREFCNT_dec(svname);
5756#endif
e62cc96a 5757
81714fb9
YO
5758 /*sv_dump(sv_dat);*/
5759 }
5760 nextchar(pRExC_state);
5761 paren = 1;
5762 goto capturing_parens;
5763 }
5764 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5765 RExC_parse++;
fac92740 5766 case '=': /* (?=...) */
89c6a13e
AB
5767 RExC_seen_zerolen++;
5768 break;
fac92740 5769 case '!': /* (?!...) */
830247a4 5770 RExC_seen_zerolen++;
e2e6a0f1
YO
5771 if (*RExC_parse == ')') {
5772 ret=reg_node(pRExC_state, OPFAIL);
5773 nextchar(pRExC_state);
5774 return ret;
5775 }
594d7033
YO
5776 break;
5777 case '|': /* (?|...) */
5778 /* branch reset, behave like a (?:...) except that
5779 buffers in alternations share the same numbers */
5780 paren = ':';
5781 after_freeze = freeze_paren = RExC_npar;
5782 break;
fac92740
MJD
5783 case ':': /* (?:...) */
5784 case '>': /* (?>...) */
a0d0e21e 5785 break;
fac92740
MJD
5786 case '$': /* (?$...) */
5787 case '@': /* (?@...) */
8615cb43 5788 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5789 break;
fac92740 5790 case '#': /* (?#...) */
830247a4
IZ
5791 while (*RExC_parse && *RExC_parse != ')')
5792 RExC_parse++;
5793 if (*RExC_parse != ')')
c277df42 5794 FAIL("Sequence (?#... not terminated");
830247a4 5795 nextchar(pRExC_state);
a0d0e21e
LW
5796 *flagp = TRYAGAIN;
5797 return NULL;
894be9b7
YO
5798 case '0' : /* (?0) */
5799 case 'R' : /* (?R) */
5800 if (*RExC_parse != ')')
6bda09f9 5801 FAIL("Sequence (?R) not terminated");
1a147d38 5802 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5803 *flagp |= POSTPONED;
7f69552c
YO
5804 nextchar(pRExC_state);
5805 return ret;
5806 /*notreached*/
894be9b7
YO
5807 { /* named and numeric backreferences */
5808 I32 num;
894be9b7
YO
5809 case '&': /* (?&NAME) */
5810 parse_start = RExC_parse - 1;
1f1031fe 5811 named_recursion:
894be9b7 5812 {
0a4db386
YO
5813 SV *sv_dat = reg_scan_name(pRExC_state,
5814 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5815 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5816 }
5817 goto gen_recurse_regop;
5818 /* NOT REACHED */
542fa716
YO
5819 case '+':
5820 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5821 RExC_parse++;
5822 vFAIL("Illegal pattern");
5823 }
5824 goto parse_recursion;
5825 /* NOT REACHED*/
5826 case '-': /* (?-1) */
5827 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5828 RExC_parse--; /* rewind to let it be handled later */
5829 goto parse_flags;
5830 }
5831 /*FALLTHROUGH */
6bda09f9
YO
5832 case '1': case '2': case '3': case '4': /* (?1) */
5833 case '5': case '6': case '7': case '8': case '9':
5834 RExC_parse--;
542fa716 5835 parse_recursion:
894be9b7
YO
5836 num = atoi(RExC_parse);
5837 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5838 if (*RExC_parse == '-')
5839 RExC_parse++;
6bda09f9
YO
5840 while (isDIGIT(*RExC_parse))
5841 RExC_parse++;
5842 if (*RExC_parse!=')')
5843 vFAIL("Expecting close bracket");
894be9b7
YO
5844
5845 gen_recurse_regop:
542fa716
YO
5846 if ( paren == '-' ) {
5847 /*
5848 Diagram of capture buffer numbering.
5849 Top line is the normal capture buffer numbers
5850 Botton line is the negative indexing as from
5851 the X (the (?-2))
5852
5853 + 1 2 3 4 5 X 6 7
5854 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5855 - 5 4 3 2 1 X x x
5856
5857 */
5858 num = RExC_npar + num;
5859 if (num < 1) {
5860 RExC_parse++;
5861 vFAIL("Reference to nonexistent group");
5862 }
5863 } else if ( paren == '+' ) {
5864 num = RExC_npar + num - 1;
5865 }
5866
1a147d38 5867 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5868 if (!SIZE_ONLY) {
5869 if (num > (I32)RExC_rx->nparens) {
5870 RExC_parse++;
5871 vFAIL("Reference to nonexistent group");
5872 }
40d049e4 5873 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5874 RExC_emit++;
226de585 5875 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5876 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5877 } else {
6bda09f9 5878 RExC_size++;
6bda09f9 5879 }
0a4db386 5880 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5881 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5882 Set_Node_Offset(ret, parse_start); /* MJD */
5883
a3b492c3 5884 *flagp |= POSTPONED;
6bda09f9
YO
5885 nextchar(pRExC_state);
5886 return ret;
894be9b7
YO
5887 } /* named and numeric backreferences */
5888 /* NOT REACHED */
5889
fac92740 5890 case '?': /* (??...) */
6136c704 5891 is_logical = 1;
57b84237
YO
5892 if (*RExC_parse != '{') {
5893 RExC_parse++;
5894 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5895 /*NOTREACHED*/
5896 }
a3b492c3 5897 *flagp |= POSTPONED;
830247a4 5898 paren = *RExC_parse++;
0f5d15d6 5899 /* FALL THROUGH */
fac92740 5900 case '{': /* (?{...}) */
c277df42 5901 {
2eccd3b2
NC
5902 I32 count = 1;
5903 U32 n = 0;
c277df42 5904 char c;
830247a4 5905 char *s = RExC_parse;
c277df42 5906
830247a4
IZ
5907 RExC_seen_zerolen++;
5908 RExC_seen |= REG_SEEN_EVAL;
5909 while (count && (c = *RExC_parse)) {
6136c704
AL
5910 if (c == '\\') {
5911 if (RExC_parse[1])
5912 RExC_parse++;
5913 }
b81d288d 5914 else if (c == '{')
c277df42 5915 count++;
b81d288d 5916 else if (c == '}')
c277df42 5917 count--;
830247a4 5918 RExC_parse++;
c277df42 5919 }
6136c704 5920 if (*RExC_parse != ')') {
b81d288d 5921 RExC_parse = s;
b45f050a
JF
5922 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5923 }
c277df42 5924 if (!SIZE_ONLY) {
f3548bdc 5925 PAD *pad;
6136c704
AL
5926 OP_4tree *sop, *rop;
5927 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5928
569233ed
SB
5929 ENTER;
5930 Perl_save_re_context(aTHX);
f3548bdc 5931 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5932 sop->op_private |= OPpREFCOUNTED;
5933 /* re_dup will OpREFCNT_inc */
5934 OpREFCNT_set(sop, 1);
569233ed 5935 LEAVE;
c277df42 5936
830247a4 5937 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5938 RExC_rxi->data->data[n] = (void*)rop;
5939 RExC_rxi->data->data[n+1] = (void*)sop;
5940 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5941 SvREFCNT_dec(sv);
a0ed51b3 5942 }
e24b16f9 5943 else { /* First pass */
830247a4 5944 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5945 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5946 /* No compiled RE interpolated, has runtime
5947 components ===> unsafe. */
5948 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5949 if (PL_tainting && PL_tainted)
cc6b7395 5950 FAIL("Eval-group in insecure regular expression");
54df2634 5951#if PERL_VERSION > 8
923e4eb5 5952 if (IN_PERL_COMPILETIME)
b5c19bd7 5953 PL_cv_has_eval = 1;
54df2634 5954#endif
c277df42 5955 }
b5c19bd7 5956
830247a4 5957 nextchar(pRExC_state);
6136c704 5958 if (is_logical) {
830247a4 5959 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5960 if (!SIZE_ONLY)
5961 ret->flags = 2;
3dab1dad 5962 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5963 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5964 return ret;
5965 }
ccb2c380
MP
5966 ret = reganode(pRExC_state, EVAL, n);
5967 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5968 Set_Node_Offset(ret, parse_start);
5969 return ret;
c277df42 5970 }
fac92740 5971 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5972 {
0a4db386 5973 int is_define= 0;
fac92740 5974 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5975 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5976 || RExC_parse[1] == '<'
830247a4 5977 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5978 I32 flag;
5979
830247a4 5980 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5981 if (!SIZE_ONLY)
5982 ret->flags = 1;
3dab1dad 5983 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5984 goto insert_if;
b81d288d 5985 }
a0ed51b3 5986 }
0a4db386
YO
5987 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5988 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5989 {
5990 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5991 char *name_start= RExC_parse++;
2eccd3b2 5992 U32 num = 0;
0a4db386
YO
5993 SV *sv_dat=reg_scan_name(pRExC_state,
5994 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5995 if (RExC_parse == name_start || *RExC_parse != ch)
5996 vFAIL2("Sequence (?(%c... not terminated",
5997 (ch == '>' ? '<' : ch));
5998 RExC_parse++;
5999 if (!SIZE_ONLY) {
6000 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6001 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6002 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6003 }
6004 ret = reganode(pRExC_state,NGROUPP,num);
6005 goto insert_if_check_paren;
6006 }
6007 else if (RExC_parse[0] == 'D' &&
6008 RExC_parse[1] == 'E' &&
6009 RExC_parse[2] == 'F' &&
6010 RExC_parse[3] == 'I' &&
6011 RExC_parse[4] == 'N' &&
6012 RExC_parse[5] == 'E')
6013 {
6014 ret = reganode(pRExC_state,DEFINEP,0);
6015 RExC_parse +=6 ;
6016 is_define = 1;
6017 goto insert_if_check_paren;
6018 }
6019 else if (RExC_parse[0] == 'R') {
6020 RExC_parse++;
6021 parno = 0;
6022 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6023 parno = atoi(RExC_parse++);
6024 while (isDIGIT(*RExC_parse))
6025 RExC_parse++;
6026 } else if (RExC_parse[0] == '&') {
6027 SV *sv_dat;
6028 RExC_parse++;
6029 sv_dat = reg_scan_name(pRExC_state,
6030 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6031 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6032 }
1a147d38 6033 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6034 goto insert_if_check_paren;
6035 }
830247a4 6036 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6037 /* (?(1)...) */
6136c704 6038 char c;
830247a4 6039 parno = atoi(RExC_parse++);
c277df42 6040
830247a4
IZ
6041 while (isDIGIT(*RExC_parse))
6042 RExC_parse++;
fac92740 6043 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6044
0a4db386 6045 insert_if_check_paren:
830247a4 6046 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6047 vFAIL("Switch condition not recognized");
c277df42 6048 insert_if:
3dab1dad
YO
6049 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6050 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6051 if (br == NULL)
830247a4 6052 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6053 else
3dab1dad 6054 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6055 c = *nextchar(pRExC_state);
d1b80229
IZ
6056 if (flags&HASWIDTH)
6057 *flagp |= HASWIDTH;
c277df42 6058 if (c == '|') {
0a4db386
YO
6059 if (is_define)
6060 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6061 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6062 regbranch(pRExC_state, &flags, 1,depth+1);
6063 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6064 if (flags&HASWIDTH)
6065 *flagp |= HASWIDTH;
830247a4 6066 c = *nextchar(pRExC_state);
a0ed51b3
LW
6067 }
6068 else
c277df42
IZ
6069 lastbr = NULL;
6070 if (c != ')')
8615cb43 6071 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6072 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6073 REGTAIL(pRExC_state, br, ender);
c277df42 6074 if (lastbr) {
3dab1dad
YO
6075 REGTAIL(pRExC_state, lastbr, ender);
6076 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6077 }
6078 else
3dab1dad 6079 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6080 RExC_size++; /* XXX WHY do we need this?!!
6081 For large programs it seems to be required
6082 but I can't figure out why. -- dmq*/
c277df42 6083 return ret;
a0ed51b3
LW
6084 }
6085 else {
830247a4 6086 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6087 }
6088 }
1b1626e4 6089 case 0:
830247a4 6090 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6091 vFAIL("Sequence (? incomplete");
1b1626e4 6092 break;
a0d0e21e 6093 default:
cde0cee5
YO
6094 --RExC_parse;
6095 parse_flags: /* (?i) */
6096 {
6097 U32 posflags = 0, negflags = 0;
6098 U32 *flagsp = &posflags;
6099
6100 while (*RExC_parse) {
6101 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6102 /* (?g), (?gc) and (?o) are useless here
6103 and must be globally applied -- japhy */
cde0cee5
YO
6104 switch (*RExC_parse) {
6105 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
6106 case ONCE_PAT_MOD: /* 'o' */
6107 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6108 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6109 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6110 if (! (wastedflags & wflagbit) ) {
6111 wastedflags |= wflagbit;
6112 vWARN5(
6113 RExC_parse + 1,
6114 "Useless (%s%c) - %suse /%c modifier",
6115 flagsp == &negflags ? "?-" : "?",
6116 *RExC_parse,
6117 flagsp == &negflags ? "don't " : "",
6118 *RExC_parse
6119 );
6120 }
6121 }
cde0cee5
YO
6122 break;
6123
f7819f85 6124 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6125 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6126 if (! (wastedflags & WASTED_C) ) {
6127 wastedflags |= WASTED_GC;
9d1d55b5
JP
6128 vWARN3(
6129 RExC_parse + 1,
6130 "Useless (%sc) - %suse /gc modifier",
6131 flagsp == &negflags ? "?-" : "?",
6132 flagsp == &negflags ? "don't " : ""
6133 );
6134 }
6135 }
cde0cee5 6136 break;
f7819f85 6137 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6138 if (flagsp == &negflags) {
668c081a
NC
6139 if (SIZE_ONLY)
6140 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6141 } else {
6142 *flagsp |= RXf_PMf_KEEPCOPY;
6143 }
6144 break;
6145 case '-':
57b84237
YO
6146 if (flagsp == &negflags) {
6147 RExC_parse++;
6148 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6149 /*NOTREACHED*/
6150 }
cde0cee5
YO
6151 flagsp = &negflags;
6152 wastedflags = 0; /* reset so (?g-c) warns twice */
6153 break;
6154 case ':':
6155 paren = ':';
6156 /*FALLTHROUGH*/
6157 case ')':
6158 RExC_flags |= posflags;
6159 RExC_flags &= ~negflags;
f7819f85
A
6160 if (paren != ':') {
6161 oregflags |= posflags;
6162 oregflags &= ~negflags;
6163 }
cde0cee5
YO
6164 nextchar(pRExC_state);
6165 if (paren != ':') {
6166 *flagp = TRYAGAIN;
6167 return NULL;
6168 } else {
6169 ret = NULL;
6170 goto parse_rest;
6171 }
6172 /*NOTREACHED*/
6173 default:
cde0cee5
YO
6174 RExC_parse++;
6175 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6176 /*NOTREACHED*/
6177 }
830247a4 6178 ++RExC_parse;
48c036b1 6179 }
cde0cee5 6180 }} /* one for the default block, one for the switch */
a0d0e21e 6181 }
fac92740 6182 else { /* (...) */
81714fb9 6183 capturing_parens:
830247a4
IZ
6184 parno = RExC_npar;
6185 RExC_npar++;
e2e6a0f1 6186
830247a4 6187 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6188 if (!SIZE_ONLY ){
6189 if (!RExC_nestroot)
6190 RExC_nestroot = parno;
c009da3d
YO
6191 if (RExC_seen & REG_SEEN_RECURSE
6192 && !RExC_open_parens[parno-1])
6193 {
e2e6a0f1 6194 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6195 "Setting open paren #%"IVdf" to %d\n",
6196 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6197 RExC_open_parens[parno-1]= ret;
6198 }
6bda09f9 6199 }
fac92740
MJD
6200 Set_Node_Length(ret, 1); /* MJD */
6201 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6202 is_open = 1;
a0d0e21e 6203 }
a0ed51b3 6204 }
fac92740 6205 else /* ! paren */
a0d0e21e 6206 ret = NULL;
cde0cee5
YO
6207
6208 parse_rest:
a0d0e21e 6209 /* Pick up the branches, linking them together. */
fac92740 6210 parse_start = RExC_parse; /* MJD */
3dab1dad 6211 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6212
6213 if (freeze_paren) {
6214 if (RExC_npar > after_freeze)
6215 after_freeze = RExC_npar;
6216 RExC_npar = freeze_paren;
6217 }
6218
fac92740 6219 /* branch_len = (paren != 0); */
2af232bd 6220
a0d0e21e
LW
6221 if (br == NULL)
6222 return(NULL);
830247a4
IZ
6223 if (*RExC_parse == '|') {
6224 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6225 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6226 }
fac92740 6227 else { /* MJD */
6bda09f9 6228 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6229 Set_Node_Length(br, paren != 0);
6230 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6231 }
c277df42
IZ
6232 have_branch = 1;
6233 if (SIZE_ONLY)
830247a4 6234 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6235 }
6236 else if (paren == ':') {
c277df42
IZ
6237 *flagp |= flags&SIMPLE;
6238 }
6136c704 6239 if (is_open) { /* Starts with OPEN. */
3dab1dad 6240 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6241 }
6242 else if (paren != '?') /* Not Conditional */
a0d0e21e 6243 ret = br;
8ae10a67 6244 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6245 lastbr = br;
830247a4
IZ
6246 while (*RExC_parse == '|') {
6247 if (!SIZE_ONLY && RExC_extralen) {
6248 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6249 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6250 }
6251 if (SIZE_ONLY)
830247a4
IZ
6252 RExC_extralen += 2; /* Account for LONGJMP. */
6253 nextchar(pRExC_state);
594d7033
YO
6254 if (freeze_paren) {
6255 if (RExC_npar > after_freeze)
6256 after_freeze = RExC_npar;
6257 RExC_npar = freeze_paren;
6258 }
3dab1dad 6259 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6260
a687059c 6261 if (br == NULL)
a0d0e21e 6262 return(NULL);
3dab1dad 6263 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6264 lastbr = br;
8ae10a67 6265 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6266 }
6267
c277df42
IZ
6268 if (have_branch || paren != ':') {
6269 /* Make a closing node, and hook it on the end. */
6270 switch (paren) {
6271 case ':':
830247a4 6272 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6273 break;
6274 case 1:
830247a4 6275 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6276 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6277 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6278 "Setting close paren #%"IVdf" to %d\n",
6279 (IV)parno, REG_NODE_NUM(ender)));
6280 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6281 if (RExC_nestroot == parno)
6282 RExC_nestroot = 0;
40d049e4 6283 }
fac92740
MJD
6284 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6285 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6286 break;
6287 case '<':
c277df42
IZ
6288 case ',':
6289 case '=':
6290 case '!':
c277df42 6291 *flagp &= ~HASWIDTH;
821b33a5
IZ
6292 /* FALL THROUGH */
6293 case '>':
830247a4 6294 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6295 break;
6296 case 0:
830247a4 6297 ender = reg_node(pRExC_state, END);
40d049e4
YO
6298 if (!SIZE_ONLY) {
6299 assert(!RExC_opend); /* there can only be one! */
6300 RExC_opend = ender;
6301 }
c277df42
IZ
6302 break;
6303 }
eaf3ca90 6304 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6305
9674d46a 6306 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6307 if (depth==1)
6308 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6309
c277df42 6310 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6311 for (br = ret; br; br = regnext(br)) {
6312 const U8 op = PL_regkind[OP(br)];
6313 if (op == BRANCH) {
07be1b83 6314 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6315 }
6316 else if (op == BRANCHJ) {
07be1b83 6317 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6318 }
c277df42
IZ
6319 }
6320 }
a0d0e21e 6321 }
c277df42
IZ
6322
6323 {
e1ec3a88
AL
6324 const char *p;
6325 static const char parens[] = "=!<,>";
c277df42
IZ
6326
6327 if (paren && (p = strchr(parens, paren))) {
eb160463 6328 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6329 int flag = (p - parens) > 1;
6330
6331 if (paren == '>')
6332 node = SUSPEND, flag = 0;
6bda09f9 6333 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6334 Set_Node_Cur_Length(ret);
6335 Set_Node_Offset(ret, parse_start + 1);
c277df42 6336 ret->flags = flag;
07be1b83 6337 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6338 }
a0d0e21e
LW
6339 }
6340
6341 /* Check for proper termination. */
ce3e6498 6342 if (paren) {
e2509266 6343 RExC_flags = oregflags;
830247a4
IZ
6344 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6345 RExC_parse = oregcomp_parse;
380a0633 6346 vFAIL("Unmatched (");
ce3e6498 6347 }
a0ed51b3 6348 }
830247a4
IZ
6349 else if (!paren && RExC_parse < RExC_end) {
6350 if (*RExC_parse == ')') {
6351 RExC_parse++;
380a0633 6352 vFAIL("Unmatched )");
a0ed51b3
LW
6353 }
6354 else
b45f050a 6355 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6356 /* NOTREACHED */
6357 }
594d7033
YO
6358 if (after_freeze)
6359 RExC_npar = after_freeze;
a0d0e21e 6360 return(ret);
a687059c
LW
6361}
6362
6363/*
6364 - regbranch - one alternative of an | operator
6365 *
6366 * Implements the concatenation operator.
6367 */
76e3520e 6368STATIC regnode *
3dab1dad 6369S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6370{
97aff369 6371 dVAR;
c277df42
IZ
6372 register regnode *ret;
6373 register regnode *chain = NULL;
6374 register regnode *latest;
6375 I32 flags = 0, c = 0;
3dab1dad 6376 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6377
6378 PERL_ARGS_ASSERT_REGBRANCH;
6379
3dab1dad 6380 DEBUG_PARSE("brnc");
02daf0ab 6381
b81d288d 6382 if (first)
c277df42
IZ
6383 ret = NULL;
6384 else {
b81d288d 6385 if (!SIZE_ONLY && RExC_extralen)
830247a4 6386 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6387 else {
830247a4 6388 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6389 Set_Node_Length(ret, 1);
6390 }
c277df42
IZ
6391 }
6392
b81d288d 6393 if (!first && SIZE_ONLY)
830247a4 6394 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6395
c277df42 6396 *flagp = WORST; /* Tentatively. */
a0d0e21e 6397
830247a4
IZ
6398 RExC_parse--;
6399 nextchar(pRExC_state);
6400 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6401 flags &= ~TRYAGAIN;
3dab1dad 6402 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6403 if (latest == NULL) {
6404 if (flags & TRYAGAIN)
6405 continue;
6406 return(NULL);
a0ed51b3
LW
6407 }
6408 else if (ret == NULL)
c277df42 6409 ret = latest;
8ae10a67 6410 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6411 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6412 *flagp |= flags&SPSTART;
6413 else {
830247a4 6414 RExC_naughty++;
3dab1dad 6415 REGTAIL(pRExC_state, chain, latest);
a687059c 6416 }
a0d0e21e 6417 chain = latest;
c277df42
IZ
6418 c++;
6419 }
6420 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6421 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6422 if (ret == NULL)
6423 ret = chain;
6424 }
6425 if (c == 1) {
6426 *flagp |= flags&SIMPLE;
a0d0e21e 6427 }
a687059c 6428
d4c19fe8 6429 return ret;
a687059c
LW
6430}
6431
6432/*
6433 - regpiece - something followed by possible [*+?]
6434 *
6435 * Note that the branching code sequences used for ? and the general cases
6436 * of * and + are somewhat optimized: they use the same NOTHING node as
6437 * both the endmarker for their branch list and the body of the last branch.
6438 * It might seem that this node could be dispensed with entirely, but the
6439 * endmarker role is not redundant.
6440 */
76e3520e 6441STATIC regnode *
3dab1dad 6442S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6443{
97aff369 6444 dVAR;
c277df42 6445 register regnode *ret;
a0d0e21e
LW
6446 register char op;
6447 register char *next;
6448 I32 flags;
1df70142 6449 const char * const origparse = RExC_parse;
a0d0e21e 6450 I32 min;
c277df42 6451 I32 max = REG_INFTY;
fac92740 6452 char *parse_start;
10edeb5d 6453 const char *maxpos = NULL;
3dab1dad 6454 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6455
6456 PERL_ARGS_ASSERT_REGPIECE;
6457
3dab1dad 6458 DEBUG_PARSE("piec");
a0d0e21e 6459
3dab1dad 6460 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6461 if (ret == NULL) {
6462 if (flags & TRYAGAIN)
6463 *flagp |= TRYAGAIN;
6464 return(NULL);
6465 }
6466
830247a4 6467 op = *RExC_parse;
a0d0e21e 6468
830247a4 6469 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6470 maxpos = NULL;
fac92740 6471 parse_start = RExC_parse; /* MJD */
830247a4 6472 next = RExC_parse + 1;
a0d0e21e
LW
6473 while (isDIGIT(*next) || *next == ',') {
6474 if (*next == ',') {
6475 if (maxpos)
6476 break;
6477 else
6478 maxpos = next;
a687059c 6479 }
a0d0e21e
LW
6480 next++;
6481 }
6482 if (*next == '}') { /* got one */
6483 if (!maxpos)
6484 maxpos = next;
830247a4
IZ
6485 RExC_parse++;
6486 min = atoi(RExC_parse);
a0d0e21e
LW
6487 if (*maxpos == ',')
6488 maxpos++;
6489 else
830247a4 6490 maxpos = RExC_parse;
a0d0e21e
LW
6491 max = atoi(maxpos);
6492 if (!max && *maxpos != '0')
c277df42
IZ
6493 max = REG_INFTY; /* meaning "infinity" */
6494 else if (max >= REG_INFTY)
8615cb43 6495 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6496 RExC_parse = next;
6497 nextchar(pRExC_state);
a0d0e21e
LW
6498
6499 do_curly:
6500 if ((flags&SIMPLE)) {
830247a4 6501 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6502 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6503 Set_Node_Offset(ret, parse_start+1); /* MJD */
6504 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6505 }
6506 else {
3dab1dad 6507 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6508
6509 w->flags = 0;
3dab1dad 6510 REGTAIL(pRExC_state, ret, w);
830247a4 6511 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6512 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6513 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6514 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6515 }
6bda09f9 6516 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6517 /* MJD hk */
6518 Set_Node_Offset(ret, parse_start+1);
2af232bd 6519 Set_Node_Length(ret,
fac92740 6520 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6521
830247a4 6522 if (!SIZE_ONLY && RExC_extralen)
c277df42 6523 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6524 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6525 if (SIZE_ONLY)
830247a4
IZ
6526 RExC_whilem_seen++, RExC_extralen += 3;
6527 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6528 }
c277df42 6529 ret->flags = 0;
a0d0e21e
LW
6530
6531 if (min > 0)
821b33a5
IZ
6532 *flagp = WORST;
6533 if (max > 0)
6534 *flagp |= HASWIDTH;
8fa23287 6535 if (max < min)
8615cb43 6536 vFAIL("Can't do {n,m} with n > m");
c277df42 6537 if (!SIZE_ONLY) {
eb160463
GS
6538 ARG1_SET(ret, (U16)min);
6539 ARG2_SET(ret, (U16)max);
a687059c 6540 }
a687059c 6541
a0d0e21e 6542 goto nest_check;
a687059c 6543 }
a0d0e21e 6544 }
a687059c 6545
a0d0e21e
LW
6546 if (!ISMULT1(op)) {
6547 *flagp = flags;
a687059c 6548 return(ret);
a0d0e21e 6549 }
bb20fd44 6550
c277df42 6551#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6552
6553 /* if this is reinstated, don't forget to put this back into perldiag:
6554
6555 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6556
6557 (F) The part of the regexp subject to either the * or + quantifier
6558 could match an empty string. The {#} shows in the regular
6559 expression about where the problem was discovered.
6560
6561 */
6562
bb20fd44 6563 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6564 vFAIL("Regexp *+ operand could be empty");
b81d288d 6565#endif
bb20fd44 6566
fac92740 6567 parse_start = RExC_parse;
830247a4 6568 nextchar(pRExC_state);
a0d0e21e 6569
821b33a5 6570 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6571
6572 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6573 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6574 ret->flags = 0;
830247a4 6575 RExC_naughty += 4;
a0d0e21e
LW
6576 }
6577 else if (op == '*') {
6578 min = 0;
6579 goto do_curly;
a0ed51b3
LW
6580 }
6581 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6582 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6583 ret->flags = 0;
830247a4 6584 RExC_naughty += 3;
a0d0e21e
LW
6585 }
6586 else if (op == '+') {
6587 min = 1;
6588 goto do_curly;
a0ed51b3
LW
6589 }
6590 else if (op == '?') {
a0d0e21e
LW
6591 min = 0; max = 1;
6592 goto do_curly;
6593 }
6594 nest_check:
668c081a
NC
6595 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6596 ckWARN3reg(RExC_parse,
6597 "%.*s matches null string many times",
6598 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6599 origparse);
a0d0e21e
LW
6600 }
6601
b9b4dddf 6602 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6603 nextchar(pRExC_state);
6bda09f9 6604 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6605 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6606 }
b9b4dddf
YO
6607#ifndef REG_ALLOW_MINMOD_SUSPEND
6608 else
6609#endif
6610 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6611 regnode *ender;
6612 nextchar(pRExC_state);
6613 ender = reg_node(pRExC_state, SUCCEED);
6614 REGTAIL(pRExC_state, ret, ender);
6615 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6616 ret->flags = 0;
6617 ender = reg_node(pRExC_state, TAIL);
6618 REGTAIL(pRExC_state, ret, ender);
6619 /*ret= ender;*/
6620 }
6621
6622 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6623 RExC_parse++;
b45f050a
JF
6624 vFAIL("Nested quantifiers");
6625 }
a0d0e21e
LW
6626
6627 return(ret);
a687059c
LW
6628}
6629
fc8cd66c
YO
6630
6631/* reg_namedseq(pRExC_state,UVp)
6632
6633 This is expected to be called by a parser routine that has
afefe6bf 6634 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6635 expected to point at the first char following the N at the time
6636 of the call.
ff3f963a
KW
6637
6638 The \N may be inside (indicated by valuep not being NULL) or outside a
6639 character class.
6640
6641 \N may begin either a named sequence, or if outside a character class, mean
6642 to match a non-newline. For non single-quoted regexes, the tokenizer has
6643 attempted to decide which, and in the case of a named sequence converted it
6644 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6645 where c1... are the characters in the sequence. For single-quoted regexes,
6646 the tokenizer passes the \N sequence through unchanged; this code will not
6647 attempt to determine this nor expand those. The net effect is that if the
6648 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6649 signals that this \N occurrence means to match a non-newline.
6650
6651 Only the \N{U+...} form should occur in a character class, for the same
6652 reason that '.' inside a character class means to just match a period: it
6653 just doesn't make sense.
fc8cd66c
YO
6654
6655 If valuep is non-null then it is assumed that we are parsing inside
6656 of a charclass definition and the first codepoint in the resolved
6657 string is returned via *valuep and the routine will return NULL.
6658 In this mode if a multichar string is returned from the charnames
ff3f963a 6659 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6660 sequence will be examined. If the string returned is zero length
6661 then the value of *valuep is undefined and NON-NULL will
6662 be returned to indicate failure. (This will NOT be a valid pointer
6663 to a regnode.)
6664
ff3f963a
KW
6665 If valuep is null then it is assumed that we are parsing normal text and a
6666 new EXACT node is inserted into the program containing the resolved string,
6667 and a pointer to the new node is returned. But if the string is zero length
6668 a NOTHING node is emitted instead.
afefe6bf 6669
fc8cd66c 6670 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6671 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6672 */
6673STATIC regnode *
afefe6bf 6674S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6675{
c3c41406 6676 char * endbrace; /* '}' following the name */
fc8cd66c 6677 regnode *ret = NULL;
ff3f963a
KW
6678#ifdef DEBUGGING
6679 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6680#endif
c3c41406 6681 char* p;
ff3f963a
KW
6682
6683 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6684
6685 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6686
6687 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6688
6689 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6690 * modifier. The other meaning does not */
6691 p = (RExC_flags & RXf_PMf_EXTENDED)
6692 ? regwhite( pRExC_state, RExC_parse )
6693 : RExC_parse;
7918f24d 6694
ff3f963a 6695 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6696 * [^\n]. The former is assumed when it can't be the latter. */
6697 if (*p != '{' || regcurly(p)) {
6698 RExC_parse = p;
ff3f963a 6699 if (valuep) {
afefe6bf 6700 /* no bare \N in a charclass */
ff3f963a
KW
6701 vFAIL("\\N in a character class must be a named character: \\N{...}");
6702 }
afefe6bf
RGS
6703 nextchar(pRExC_state);
6704 ret = reg_node(pRExC_state, REG_ANY);
6705 *flagp |= HASWIDTH|SIMPLE;
6706 RExC_naughty++;
6707 RExC_parse--;
6708 Set_Node_Length(ret, 1); /* MJD */
6709 return ret;
fc8cd66c 6710 }
a4893424 6711
c3c41406
KW
6712 /* Here, we have decided it should be a named sequence */
6713
6714 /* The test above made sure that the next real character is a '{', but
6715 * under the /x modifier, it could be separated by space (or a comment and
6716 * \n) and this is not allowed (for consistency with \x{...} and the
6717 * tokenizer handling of \N{NAME}). */
6718 if (*RExC_parse != '{') {
6719 vFAIL("Missing braces on \\N{}");
6720 }
6721
ff3f963a 6722 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6723
6724 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6725 || ! (endbrace == RExC_parse /* nothing between the {} */
6726 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6727 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6728 {
6729 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6730 vFAIL("\\N{NAME} must be resolved by the lexer");
6731 }
6732
ff3f963a
KW
6733 if (endbrace == RExC_parse) { /* empty: \N{} */
6734 if (! valuep) {
6735 RExC_parse = endbrace + 1;
6736 return reg_node(pRExC_state,NOTHING);
a4893424 6737 }
fc8cd66c 6738
ff3f963a
KW
6739 if (SIZE_ONLY) {
6740 ckWARNreg(RExC_parse,
6741 "Ignoring zero length \\N{} in character class"
6742 );
6743 RExC_parse = endbrace + 1;
6744 }
6745 *valuep = 0;
6746 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6747 }
ff3f963a
KW
6748
6749 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6750 RExC_parse += 2; /* Skip past the 'U+' */
6751
6752 if (valuep) { /* In a bracketed char class */
6753 /* We only pay attention to the first char of
6754 multichar strings being returned. I kinda wonder
6755 if this makes sense as it does change the behaviour
6756 from earlier versions, OTOH that behaviour was broken
6757 as well. XXX Solution is to recharacterize as
6758 [rest-of-class]|multi1|multi2... */
6759
6760 STRLEN length_of_hex;
6761 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6762 | PERL_SCAN_DISALLOW_PREFIX
6763 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6764
37820adc
KW
6765 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6766 if (endchar < endbrace) {
ff3f963a
KW
6767 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6768 }
ff3f963a
KW
6769
6770 length_of_hex = (STRLEN)(endchar - RExC_parse);
6771 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6772
6773 /* The tokenizer should have guaranteed validity, but it's possible to
6774 * bypass it by using single quoting, so check */
c3c41406
KW
6775 if (length_of_hex == 0
6776 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6777 {
6778 RExC_parse += length_of_hex; /* Includes all the valid */
6779 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6780 ? UTF8SKIP(RExC_parse)
6781 : 1;
6782 /* Guard against malformed utf8 */
6783 if (RExC_parse >= endchar) RExC_parse = endchar;
6784 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6785 }
6786
6787 RExC_parse = endbrace + 1;
6788 if (endchar == endbrace) return NULL;
6789
6790 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6791 }
ff3f963a
KW
6792 else { /* Not a char class */
6793 char *s; /* String to put in generated EXACT node */
6794 STRLEN len = 0; /* Its current length */
6795 char *endchar; /* Points to '.' or '}' ending cur char in the input
6796 stream */
6797
6798 ret = reg_node(pRExC_state,
6799 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6800 s= STRING(ret);
6801
6802 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6803 * the input which is of the form now 'c1.c2.c3...}' until find the
6804 * ending brace or exeed length 255. The characters that exceed this
6805 * limit are dropped. The limit could be relaxed should it become
6806 * desirable by reparsing this as (?:\N{NAME}), so could generate
6807 * multiple EXACT nodes, as is done for just regular input. But this
6808 * is primarily a named character, and not intended to be a huge long
6809 * string, so 255 bytes should be good enough */
6810 while (1) {
c3c41406 6811 STRLEN length_of_hex;
ff3f963a
KW
6812 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6813 | PERL_SCAN_DISALLOW_PREFIX
6814 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6815 UV cp; /* Ord of current character */
6816
6817 /* Code points are separated by dots. If none, there is only one
6818 * code point, and is terminated by the brace */
37820adc 6819 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
6820
6821 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
6822 length_of_hex = (STRLEN)(endchar - RExC_parse);
6823 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6824 if ( length_of_hex == 0
6825 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 6826 {
c3c41406
KW
6827 RExC_parse += length_of_hex; /* Includes all the valid */
6828 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6829 ? UTF8SKIP(RExC_parse)
6830 : 1;
6831 /* Guard against malformed utf8 */
6832 if (RExC_parse >= endchar) RExC_parse = endchar;
6833 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6834 }
6835
6836 if (! FOLD) { /* Not folding, just append to the string */
6837 STRLEN unilen;
6838
6839 /* Quit before adding this character if would exceed limit */
6840 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 6841
ff3f963a
KW
6842 unilen = reguni(pRExC_state, cp, s);
6843 if (unilen > 0) {
6844 s += unilen;
6845 len += unilen;
6846 }
6847 } else { /* Folding, output the folded equivalent */
6848 STRLEN foldlen,numlen;
6849 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6850 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6851
6852 /* Quit before exceeding size limit */
6853 if (len + foldlen > U8_MAX) break;
6854
6855 for (foldbuf = tmpbuf;
6856 foldlen;
6857 foldlen -= numlen)
6858 {
6859 cp = utf8_to_uvchr(foldbuf, &numlen);
6860 if (numlen > 0) {
6861 const STRLEN unilen = reguni(pRExC_state, cp, s);
6862 s += unilen;
6863 len += unilen;
6864 /* In EBCDIC the numlen and unilen can differ. */
6865 foldbuf += numlen;
6866 if (numlen >= foldlen)
6867 break;
6868 }
6869 else
6870 break; /* "Can't happen." */
6871 }
6872 }
6873
6874 /* Point to the beginning of the next character in the sequence. */
6875 RExC_parse = endchar + 1;
6876
6877 /* Quit if no more characters */
6878 if (RExC_parse >= endbrace) break;
6879 }
6880
6881
6882 if (SIZE_ONLY) {
6883 if (RExC_parse < endbrace) {
6884 ckWARNreg(RExC_parse - 1,
6885 "Using just the first characters returned by \\N{}");
6886 }
6887
6888 RExC_size += STR_SZ(len);
6889 } else {
6890 STR_LEN(ret) = len;
6891 RExC_emit += STR_SZ(len);
6892 }
6893
6894 RExC_parse = endbrace + 1;
6895
6896 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6897 with malformed in t/re/pat_advanced.t */
6898 RExC_parse --;
6899 Set_Node_Cur_Length(ret); /* MJD */
6900 nextchar(pRExC_state);
6901 }
6902
6903 return ret;
fc8cd66c
YO
6904}
6905
6906
9e08bc66
TS
6907/*
6908 * reg_recode
6909 *
6910 * It returns the code point in utf8 for the value in *encp.
6911 * value: a code value in the source encoding
6912 * encp: a pointer to an Encode object
6913 *
6914 * If the result from Encode is not a single character,
6915 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6916 */
6917STATIC UV
6918S_reg_recode(pTHX_ const char value, SV **encp)
6919{
6920 STRLEN numlen = 1;
59cd0e26 6921 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6922 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6923 const STRLEN newlen = SvCUR(sv);
6924 UV uv = UNICODE_REPLACEMENT;
6925
7918f24d
NC
6926 PERL_ARGS_ASSERT_REG_RECODE;
6927
9e08bc66
TS
6928 if (newlen)
6929 uv = SvUTF8(sv)
6930 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6931 : *(U8*)s;
6932
6933 if (!newlen || numlen != newlen) {
6934 uv = UNICODE_REPLACEMENT;
c86f7df5 6935 *encp = NULL;
9e08bc66
TS
6936 }
6937 return uv;
6938}
6939
fc8cd66c 6940
a687059c
LW
6941/*
6942 - regatom - the lowest level
ee9b8eae
YO
6943
6944 Try to identify anything special at the start of the pattern. If there
6945 is, then handle it as required. This may involve generating a single regop,
6946 such as for an assertion; or it may involve recursing, such as to
6947 handle a () structure.
6948
6949 If the string doesn't start with something special then we gobble up
6950 as much literal text as we can.
6951
6952 Once we have been able to handle whatever type of thing started the
6953 sequence, we return.
6954
6955 Note: we have to be careful with escapes, as they can be both literal
6956 and special, and in the case of \10 and friends can either, depending
6957 on context. Specifically there are two seperate switches for handling
6958 escape sequences, with the one for handling literal escapes requiring
6959 a dummy entry for all of the special escapes that are actually handled
6960 by the other.
6961*/
6962
76e3520e 6963STATIC regnode *
3dab1dad 6964S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6965{
97aff369 6966 dVAR;
cbbf8932 6967 register regnode *ret = NULL;
a0d0e21e 6968 I32 flags;
45948336 6969 char *parse_start = RExC_parse;
3dab1dad
YO
6970 GET_RE_DEBUG_FLAGS_DECL;
6971 DEBUG_PARSE("atom");
a0d0e21e
LW
6972 *flagp = WORST; /* Tentatively. */
6973
7918f24d 6974 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 6975
a0d0e21e 6976tryagain:
f9a79580 6977 switch ((U8)*RExC_parse) {
a0d0e21e 6978 case '^':
830247a4
IZ
6979 RExC_seen_zerolen++;
6980 nextchar(pRExC_state);
bbe252da 6981 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6982 ret = reg_node(pRExC_state, MBOL);
bbe252da 6983 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6984 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6985 else
830247a4 6986 ret = reg_node(pRExC_state, BOL);
fac92740 6987 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6988 break;
6989 case '$':
830247a4 6990 nextchar(pRExC_state);
b81d288d 6991 if (*RExC_parse)
830247a4 6992 RExC_seen_zerolen++;
bbe252da 6993 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6994 ret = reg_node(pRExC_state, MEOL);
bbe252da 6995 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6996 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6997 else
830247a4 6998 ret = reg_node(pRExC_state, EOL);
fac92740 6999 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7000 break;
7001 case '.':
830247a4 7002 nextchar(pRExC_state);
bbe252da 7003 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7004 ret = reg_node(pRExC_state, SANY);
7005 else
7006 ret = reg_node(pRExC_state, REG_ANY);
7007 *flagp |= HASWIDTH|SIMPLE;
830247a4 7008 RExC_naughty++;
fac92740 7009 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7010 break;
7011 case '[':
b45f050a 7012 {
3dab1dad
YO
7013 char * const oregcomp_parse = ++RExC_parse;
7014 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7015 if (*RExC_parse != ']') {
7016 RExC_parse = oregcomp_parse;
b45f050a
JF
7017 vFAIL("Unmatched [");
7018 }
830247a4 7019 nextchar(pRExC_state);
a0d0e21e 7020 *flagp |= HASWIDTH|SIMPLE;
fac92740 7021 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7022 break;
b45f050a 7023 }
a0d0e21e 7024 case '(':
830247a4 7025 nextchar(pRExC_state);
3dab1dad 7026 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7027 if (ret == NULL) {
bf93d4cc 7028 if (flags & TRYAGAIN) {
830247a4 7029 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7030 /* Make parent create an empty node if needed. */
7031 *flagp |= TRYAGAIN;
7032 return(NULL);
7033 }
a0d0e21e 7034 goto tryagain;
bf93d4cc 7035 }
a0d0e21e
LW
7036 return(NULL);
7037 }
a3b492c3 7038 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7039 break;
7040 case '|':
7041 case ')':
7042 if (flags & TRYAGAIN) {
7043 *flagp |= TRYAGAIN;
7044 return NULL;
7045 }
b45f050a 7046 vFAIL("Internal urp");
a0d0e21e
LW
7047 /* Supposed to be caught earlier. */
7048 break;
85afd4ae 7049 case '{':
830247a4
IZ
7050 if (!regcurly(RExC_parse)) {
7051 RExC_parse++;
85afd4ae
CS
7052 goto defchar;
7053 }
7054 /* FALL THROUGH */
a0d0e21e
LW
7055 case '?':
7056 case '+':
7057 case '*':
830247a4 7058 RExC_parse++;
b45f050a 7059 vFAIL("Quantifier follows nothing");
a0d0e21e 7060 break;
f9a79580
RGS
7061 case 0xDF:
7062 case 0xC3:
7063 case 0xCE:
a0a388a1 7064 do_foldchar:
56d400ed 7065 if (!LOC && FOLD) {
e64b1bd1 7066 U32 len,cp;
7cf3a6a3 7067 len=0; /* silence a spurious compiler warning */
56d400ed 7068 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7069 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7070 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7071 ret = reganode(pRExC_state, FOLDCHAR, cp);
7072 Set_Node_Length(ret, 1); /* MJD */
7073 nextchar(pRExC_state); /* kill whitespace under /x */
7074 return ret;
7075 }
7076 }
7077 goto outer_default;
a0d0e21e 7078 case '\\':
ee9b8eae
YO
7079 /* Special Escapes
7080
7081 This switch handles escape sequences that resolve to some kind
7082 of special regop and not to literal text. Escape sequnces that
7083 resolve to literal text are handled below in the switch marked
7084 "Literal Escapes".
7085
7086 Every entry in this switch *must* have a corresponding entry
7087 in the literal escape switch. However, the opposite is not
7088 required, as the default for this switch is to jump to the
7089 literal text handling code.
7090 */
a0a388a1
YO
7091 switch ((U8)*++RExC_parse) {
7092 case 0xDF:
7093 case 0xC3:
7094 case 0xCE:
7095 goto do_foldchar;
ee9b8eae 7096 /* Special Escapes */
a0d0e21e 7097 case 'A':
830247a4
IZ
7098 RExC_seen_zerolen++;
7099 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7100 *flagp |= SIMPLE;
ee9b8eae 7101 goto finish_meta_pat;
a0d0e21e 7102 case 'G':
830247a4
IZ
7103 ret = reg_node(pRExC_state, GPOS);
7104 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7105 *flagp |= SIMPLE;
ee9b8eae
YO
7106 goto finish_meta_pat;
7107 case 'K':
7108 RExC_seen_zerolen++;
7109 ret = reg_node(pRExC_state, KEEPS);
7110 *flagp |= SIMPLE;
37923168
RGS
7111 /* XXX:dmq : disabling in-place substitution seems to
7112 * be necessary here to avoid cases of memory corruption, as
7113 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7114 */
7115 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7116 goto finish_meta_pat;
a0d0e21e 7117 case 'Z':
830247a4 7118 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7119 *flagp |= SIMPLE;
a1917ab9 7120 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7121 goto finish_meta_pat;
b85d18e9 7122 case 'z':
830247a4 7123 ret = reg_node(pRExC_state, EOS);
b85d18e9 7124 *flagp |= SIMPLE;
830247a4 7125 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7126 goto finish_meta_pat;
4a2d328f 7127 case 'C':
f33976b4
DB
7128 ret = reg_node(pRExC_state, CANY);
7129 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7130 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7131 goto finish_meta_pat;
a0ed51b3 7132 case 'X':
830247a4 7133 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7134 *flagp |= HASWIDTH;
ee9b8eae 7135 goto finish_meta_pat;
a0d0e21e 7136 case 'w':
eb160463 7137 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7138 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7139 goto finish_meta_pat;
a0d0e21e 7140 case 'W':
eb160463 7141 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7142 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7143 goto finish_meta_pat;
a0d0e21e 7144 case 'b':
830247a4
IZ
7145 RExC_seen_zerolen++;
7146 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7147 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7148 *flagp |= SIMPLE;
ee9b8eae 7149 goto finish_meta_pat;
a0d0e21e 7150 case 'B':
830247a4
IZ
7151 RExC_seen_zerolen++;
7152 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7153 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7154 *flagp |= SIMPLE;
ee9b8eae 7155 goto finish_meta_pat;
a0d0e21e 7156 case 's':
eb160463 7157 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7158 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7159 goto finish_meta_pat;
a0d0e21e 7160 case 'S':
eb160463 7161 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7162 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7163 goto finish_meta_pat;
a0d0e21e 7164 case 'd':
ffc61ed2 7165 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7166 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7167 goto finish_meta_pat;
a0d0e21e 7168 case 'D':
ffc61ed2 7169 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7170 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7171 goto finish_meta_pat;
e1d1eefb
YO
7172 case 'R':
7173 ret = reg_node(pRExC_state, LNBREAK);
7174 *flagp |= HASWIDTH|SIMPLE;
7175 goto finish_meta_pat;
7176 case 'h':
7177 ret = reg_node(pRExC_state, HORIZWS);
7178 *flagp |= HASWIDTH|SIMPLE;
7179 goto finish_meta_pat;
7180 case 'H':
7181 ret = reg_node(pRExC_state, NHORIZWS);
7182 *flagp |= HASWIDTH|SIMPLE;
7183 goto finish_meta_pat;
ee9b8eae 7184 case 'v':
e1d1eefb
YO
7185 ret = reg_node(pRExC_state, VERTWS);
7186 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7187 goto finish_meta_pat;
7188 case 'V':
e1d1eefb
YO
7189 ret = reg_node(pRExC_state, NVERTWS);
7190 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7191 finish_meta_pat:
830247a4 7192 nextchar(pRExC_state);
fac92740 7193 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7194 break;
a14b48bc
LW
7195 case 'p':
7196 case 'P':
3568d838 7197 {
3dab1dad 7198 char* const oldregxend = RExC_end;
d008bc60 7199#ifdef DEBUGGING
ccb2c380 7200 char* parse_start = RExC_parse - 2;
d008bc60 7201#endif
a14b48bc 7202
830247a4 7203 if (RExC_parse[1] == '{') {
3568d838 7204 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7205 RExC_end = strchr(RExC_parse, '}');
7206 if (!RExC_end) {
3dab1dad 7207 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7208 RExC_parse += 2;
7209 RExC_end = oldregxend;
0da60cf5 7210 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7211 }
830247a4 7212 RExC_end++;
a14b48bc 7213 }
af6f566e 7214 else {
830247a4 7215 RExC_end = RExC_parse + 2;
af6f566e
HS
7216 if (RExC_end > oldregxend)
7217 RExC_end = oldregxend;
7218 }
830247a4 7219 RExC_parse--;
a14b48bc 7220
3dab1dad 7221 ret = regclass(pRExC_state,depth+1);
a14b48bc 7222
830247a4
IZ
7223 RExC_end = oldregxend;
7224 RExC_parse--;
ccb2c380
MP
7225
7226 Set_Node_Offset(ret, parse_start + 2);
7227 Set_Node_Cur_Length(ret);
830247a4 7228 nextchar(pRExC_state);
a14b48bc
LW
7229 *flagp |= HASWIDTH|SIMPLE;
7230 }
7231 break;
fc8cd66c 7232 case 'N':
afefe6bf 7233 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7234 multicharacter. join_exact() will join them up later on.
7235 Also this makes sure that things like /\N{BLAH}+/ and
7236 \N{BLAH} being multi char Just Happen. dmq*/
7237 ++RExC_parse;
afefe6bf 7238 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7239 break;
0a4db386 7240 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7241 parse_named_seq:
81714fb9
YO
7242 {
7243 char ch= RExC_parse[1];
1f1031fe
YO
7244 if (ch != '<' && ch != '\'' && ch != '{') {
7245 RExC_parse++;
7246 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7247 } else {
1f1031fe
YO
7248 /* this pretty much dupes the code for (?P=...) in reg(), if
7249 you change this make sure you change that */
81714fb9 7250 char* name_start = (RExC_parse += 2);
2eccd3b2 7251 U32 num = 0;
0a4db386
YO
7252 SV *sv_dat = reg_scan_name(pRExC_state,
7253 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7254 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7255 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7256 vFAIL2("Sequence %.3s... not terminated",parse_start);
7257
7258 if (!SIZE_ONLY) {
7259 num = add_data( pRExC_state, 1, "S" );
7260 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7261 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7262 }
7263
81714fb9
YO
7264 RExC_sawback = 1;
7265 ret = reganode(pRExC_state,
7266 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7267 num);
7268 *flagp |= HASWIDTH;
1f1031fe 7269
81714fb9
YO
7270 /* override incorrect value set in reganode MJD */
7271 Set_Node_Offset(ret, parse_start+1);
7272 Set_Node_Cur_Length(ret); /* MJD */
7273 nextchar(pRExC_state);
1f1031fe 7274
81714fb9
YO
7275 }
7276 break;
1f1031fe 7277 }
2bf803e2 7278 case 'g':
a0d0e21e
LW
7279 case '1': case '2': case '3': case '4':
7280 case '5': case '6': case '7': case '8': case '9':
7281 {
c74340f9 7282 I32 num;
2bf803e2
YO
7283 bool isg = *RExC_parse == 'g';
7284 bool isrel = 0;
7285 bool hasbrace = 0;
7286 if (isg) {
c74340f9 7287 RExC_parse++;
2bf803e2
YO
7288 if (*RExC_parse == '{') {
7289 RExC_parse++;
7290 hasbrace = 1;
7291 }
7292 if (*RExC_parse == '-') {
7293 RExC_parse++;
7294 isrel = 1;
7295 }
1f1031fe
YO
7296 if (hasbrace && !isDIGIT(*RExC_parse)) {
7297 if (isrel) RExC_parse--;
7298 RExC_parse -= 2;
7299 goto parse_named_seq;
7300 } }
c74340f9 7301 num = atoi(RExC_parse);
b72d83b2
RGS
7302 if (isg && num == 0)
7303 vFAIL("Reference to invalid group 0");
c74340f9 7304 if (isrel) {
5624f11d 7305 num = RExC_npar - num;
c74340f9
YO
7306 if (num < 1)
7307 vFAIL("Reference to nonexistent or unclosed group");
7308 }
2bf803e2 7309 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7310 goto defchar;
7311 else {
3dab1dad 7312 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7313 while (isDIGIT(*RExC_parse))
7314 RExC_parse++;
1f1031fe
YO
7315 if (parse_start == RExC_parse - 1)
7316 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7317 if (hasbrace) {
7318 if (*RExC_parse != '}')
7319 vFAIL("Unterminated \\g{...} pattern");
7320 RExC_parse++;
7321 }
c74340f9
YO
7322 if (!SIZE_ONLY) {
7323 if (num > (I32)RExC_rx->nparens)
7324 vFAIL("Reference to nonexistent group");
c74340f9 7325 }
830247a4 7326 RExC_sawback = 1;
eb160463
GS
7327 ret = reganode(pRExC_state,
7328 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7329 num);
a0d0e21e 7330 *flagp |= HASWIDTH;
2af232bd 7331
fac92740 7332 /* override incorrect value set in reganode MJD */
2af232bd 7333 Set_Node_Offset(ret, parse_start+1);
fac92740 7334 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7335 RExC_parse--;
7336 nextchar(pRExC_state);
a0d0e21e
LW
7337 }
7338 }
7339 break;
7340 case '\0':
830247a4 7341 if (RExC_parse >= RExC_end)
b45f050a 7342 FAIL("Trailing \\");
a0d0e21e
LW
7343 /* FALL THROUGH */
7344 default:
a0288114 7345 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7346 back into the quick-grab loop below */
45948336 7347 parse_start--;
a0d0e21e
LW
7348 goto defchar;
7349 }
7350 break;
4633a7c4
LW
7351
7352 case '#':
bbe252da 7353 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7354 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7355 goto tryagain;
7356 }
7357 /* FALL THROUGH */
7358
f9a79580
RGS
7359 default:
7360 outer_default:{
ba210ebe 7361 register STRLEN len;
58ae7d3f 7362 register UV ender;
a0d0e21e 7363 register char *p;
3dab1dad 7364 char *s;
80aecb99 7365 STRLEN foldlen;
89ebb4a3 7366 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7367
7368 parse_start = RExC_parse - 1;
a0d0e21e 7369
830247a4 7370 RExC_parse++;
a0d0e21e
LW
7371
7372 defchar:
58ae7d3f 7373 ender = 0;
eb160463
GS
7374 ret = reg_node(pRExC_state,
7375 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7376 s = STRING(ret);
830247a4
IZ
7377 for (len = 0, p = RExC_parse - 1;
7378 len < 127 && p < RExC_end;
a0d0e21e
LW
7379 len++)
7380 {
3dab1dad 7381 char * const oldp = p;
5b5a24f7 7382
bbe252da 7383 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7384 p = regwhite( pRExC_state, p );
f9a79580
RGS
7385 switch ((U8)*p) {
7386 case 0xDF:
7387 case 0xC3:
7388 case 0xCE:
56d400ed 7389 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7390 goto normal_default;
a0d0e21e
LW
7391 case '^':
7392 case '$':
7393 case '.':
7394 case '[':
7395 case '(':
7396 case ')':
7397 case '|':
7398 goto loopdone;
7399 case '\\':
ee9b8eae
YO
7400 /* Literal Escapes Switch
7401
7402 This switch is meant to handle escape sequences that
7403 resolve to a literal character.
7404
7405 Every escape sequence that represents something
7406 else, like an assertion or a char class, is handled
7407 in the switch marked 'Special Escapes' above in this
7408 routine, but also has an entry here as anything that
7409 isn't explicitly mentioned here will be treated as
7410 an unescaped equivalent literal.
7411 */
7412
a0a388a1 7413 switch ((U8)*++p) {
ee9b8eae 7414 /* These are all the special escapes. */
a0a388a1
YO
7415 case 0xDF:
7416 case 0xC3:
7417 case 0xCE:
7418 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7419 goto normal_default;
ee9b8eae
YO
7420 case 'A': /* Start assertion */
7421 case 'b': case 'B': /* Word-boundary assertion*/
7422 case 'C': /* Single char !DANGEROUS! */
7423 case 'd': case 'D': /* digit class */
7424 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7425 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7426 case 'k': case 'K': /* named backref, keep marker */
7427 case 'N': /* named char sequence */
38a44b82 7428 case 'p': case 'P': /* Unicode property */
e1d1eefb 7429 case 'R': /* LNBREAK */
ee9b8eae 7430 case 's': case 'S': /* space class */
e1d1eefb 7431 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7432 case 'w': case 'W': /* word class */
7433 case 'X': /* eXtended Unicode "combining character sequence" */
7434 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7435 --p;
7436 goto loopdone;
ee9b8eae
YO
7437
7438 /* Anything after here is an escape that resolves to a
7439 literal. (Except digits, which may or may not)
7440 */
a0d0e21e
LW
7441 case 'n':
7442 ender = '\n';
7443 p++;
a687059c 7444 break;
a0d0e21e
LW
7445 case 'r':
7446 ender = '\r';
7447 p++;
a687059c 7448 break;
a0d0e21e
LW
7449 case 't':
7450 ender = '\t';
7451 p++;
a687059c 7452 break;
a0d0e21e
LW
7453 case 'f':
7454 ender = '\f';
7455 p++;
a687059c 7456 break;
a0d0e21e 7457 case 'e':
c7f1f016 7458 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7459 p++;
a687059c 7460 break;
a0d0e21e 7461 case 'a':
c7f1f016 7462 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7463 p++;
a687059c 7464 break;
a0d0e21e 7465 case 'x':
a0ed51b3 7466 if (*++p == '{') {
1df70142 7467 char* const e = strchr(p, '}');
b81d288d 7468
b45f050a 7469 if (!e) {
830247a4 7470 RExC_parse = p + 1;
b45f050a
JF
7471 vFAIL("Missing right brace on \\x{}");
7472 }
de5f0749 7473 else {
a4c04bdc
NC
7474 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7475 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7476 STRLEN numlen = e - p - 1;
53305cf1 7477 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7478 if (ender > 0xff)
7479 RExC_utf8 = 1;
a0ed51b3
LW
7480 p = e + 1;
7481 }
a0ed51b3
LW
7482 }
7483 else {
a4c04bdc 7484 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7485 STRLEN numlen = 2;
53305cf1 7486 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7487 p += numlen;
7488 }
9e08bc66
TS
7489 if (PL_encoding && ender < 0x100)
7490 goto recode_encoding;
a687059c 7491 break;
a0d0e21e
LW
7492 case 'c':
7493 p++;
f9d13529 7494 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7495 break;
a0d0e21e
LW
7496 case '0': case '1': case '2': case '3':case '4':
7497 case '5': case '6': case '7': case '8':case '9':
7498 if (*p == '0' ||
830247a4 7499 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7500 I32 flags = 0;
1df70142 7501 STRLEN numlen = 3;
53305cf1 7502 ender = grok_oct(p, &numlen, &flags, NULL);
609122bd
KW
7503
7504 /* An octal above 0xff is interpreted differently
7505 * depending on if the re is in utf8 or not. If it
7506 * is in utf8, the value will be itself, otherwise
7507 * it is interpreted as modulo 0x100. It has been
7508 * decided to discourage the use of octal above the
7509 * single-byte range. For now, warn only when
7510 * it ends up modulo */
7511 if (SIZE_ONLY && ender >= 0x100
668c081a
NC
7512 && ! UTF && ! PL_encoding) {
7513 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
609122bd 7514 }
a0d0e21e
LW
7515 p += numlen;
7516 }
7517 else {
7518 --p;
7519 goto loopdone;
a687059c 7520 }
9e08bc66
TS
7521 if (PL_encoding && ender < 0x100)
7522 goto recode_encoding;
7523 break;
7524 recode_encoding:
7525 {
7526 SV* enc = PL_encoding;
7527 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7528 if (!enc && SIZE_ONLY)
7529 ckWARNreg(p, "Invalid escape in the specified encoding");
9e08bc66
TS
7530 RExC_utf8 = 1;
7531 }
a687059c 7532 break;
a0d0e21e 7533 case '\0':
830247a4 7534 if (p >= RExC_end)
b45f050a 7535 FAIL("Trailing \\");
a687059c 7536 /* FALL THROUGH */
a0d0e21e 7537 default:
668c081a
NC
7538 if (!SIZE_ONLY&& isALPHA(*p))
7539 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7540 goto normal_default;
a0d0e21e
LW
7541 }
7542 break;
a687059c 7543 default:
a0ed51b3 7544 normal_default:
fd400ab9 7545 if (UTF8_IS_START(*p) && UTF) {
1df70142 7546 STRLEN numlen;
5e12f4fb 7547 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7548 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7549 p += numlen;
7550 }
7551 else
7552 ender = *p++;
a0d0e21e 7553 break;
a687059c 7554 }
bcdf7404
YO
7555 if ( RExC_flags & RXf_PMf_EXTENDED)
7556 p = regwhite( pRExC_state, p );
60a8b682
JH
7557 if (UTF && FOLD) {
7558 /* Prime the casefolded buffer. */
ac7e0132 7559 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7560 }
bcdf7404 7561 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7562 if (len)
7563 p = oldp;
16ea2a2e 7564 else if (UTF) {
80aecb99 7565 if (FOLD) {
60a8b682 7566 /* Emit all the Unicode characters. */
1df70142 7567 STRLEN numlen;
80aecb99
JH
7568 for (foldbuf = tmpbuf;
7569 foldlen;
7570 foldlen -= numlen) {
7571 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7572 if (numlen > 0) {
71207a34 7573 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7574 s += unilen;
7575 len += unilen;
7576 /* In EBCDIC the numlen
7577 * and unilen can differ. */
9dc45d57 7578 foldbuf += numlen;
47654450
JH
7579 if (numlen >= foldlen)
7580 break;
9dc45d57
JH
7581 }
7582 else
7583 break; /* "Can't happen." */
80aecb99
JH
7584 }
7585 }
7586 else {
71207a34 7587 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7588 if (unilen > 0) {
0ebc6274
JH
7589 s += unilen;
7590 len += unilen;
9dc45d57 7591 }
80aecb99 7592 }
a0ed51b3 7593 }
a0d0e21e
LW
7594 else {
7595 len++;
eb160463 7596 REGC((char)ender, s++);
a0d0e21e
LW
7597 }
7598 break;
a687059c 7599 }
16ea2a2e 7600 if (UTF) {
80aecb99 7601 if (FOLD) {
60a8b682 7602 /* Emit all the Unicode characters. */
1df70142 7603 STRLEN numlen;
80aecb99
JH
7604 for (foldbuf = tmpbuf;
7605 foldlen;
7606 foldlen -= numlen) {
7607 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7608 if (numlen > 0) {
71207a34 7609 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7610 len += unilen;
7611 s += unilen;
7612 /* In EBCDIC the numlen
7613 * and unilen can differ. */
9dc45d57 7614 foldbuf += numlen;
47654450
JH
7615 if (numlen >= foldlen)
7616 break;
9dc45d57
JH
7617 }
7618 else
7619 break;
80aecb99
JH
7620 }
7621 }
7622 else {
71207a34 7623 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7624 if (unilen > 0) {
0ebc6274
JH
7625 s += unilen;
7626 len += unilen;
9dc45d57 7627 }
80aecb99
JH
7628 }
7629 len--;
a0ed51b3
LW
7630 }
7631 else
eb160463 7632 REGC((char)ender, s++);
a0d0e21e
LW
7633 }
7634 loopdone:
830247a4 7635 RExC_parse = p - 1;
fac92740 7636 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7637 nextchar(pRExC_state);
793db0cb
JH
7638 {
7639 /* len is STRLEN which is unsigned, need to copy to signed */
7640 IV iv = len;
7641 if (iv < 0)
7642 vFAIL("Internal disaster");
7643 }
a0d0e21e
LW
7644 if (len > 0)
7645 *flagp |= HASWIDTH;
090f7165 7646 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7647 *flagp |= SIMPLE;
3dab1dad 7648
cd439c50 7649 if (SIZE_ONLY)
830247a4 7650 RExC_size += STR_SZ(len);
3dab1dad
YO
7651 else {
7652 STR_LEN(ret) = len;
830247a4 7653 RExC_emit += STR_SZ(len);
07be1b83 7654 }
3dab1dad 7655 }
a0d0e21e
LW
7656 break;
7657 }
a687059c 7658
a0d0e21e 7659 return(ret);
a687059c
LW
7660}
7661
873ef191 7662STATIC char *
bcdf7404 7663S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7664{
bcdf7404 7665 const char *e = RExC_end;
7918f24d
NC
7666
7667 PERL_ARGS_ASSERT_REGWHITE;
7668
5b5a24f7
CS
7669 while (p < e) {
7670 if (isSPACE(*p))
7671 ++p;
7672 else if (*p == '#') {
bcdf7404 7673 bool ended = 0;
5b5a24f7 7674 do {
bcdf7404
YO
7675 if (*p++ == '\n') {
7676 ended = 1;
7677 break;
7678 }
7679 } while (p < e);
7680 if (!ended)
7681 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7682 }
7683 else
7684 break;
7685 }
7686 return p;
7687}
7688
b8c5462f
JH
7689/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7690 Character classes ([:foo:]) can also be negated ([:^foo:]).
7691 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7692 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7693 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7694
7695#define POSIXCC_DONE(c) ((c) == ':')
7696#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7697#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7698
b8c5462f 7699STATIC I32
830247a4 7700S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7701{
97aff369 7702 dVAR;
936ed897 7703 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7704
7918f24d
NC
7705 PERL_ARGS_ASSERT_REGPPOSIXCC;
7706
830247a4 7707 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7708 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7709 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7710 const char c = UCHARAT(RExC_parse);
097eb12c 7711 char* const s = RExC_parse++;
b81d288d 7712
9a86a77b 7713 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7714 RExC_parse++;
7715 if (RExC_parse == RExC_end)
620e46c5 7716 /* Grandfather lone [:, [=, [. */
830247a4 7717 RExC_parse = s;
620e46c5 7718 else {
3dab1dad 7719 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7720 assert(*t == c);
7721
9a86a77b 7722 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7723 const char *posixcc = s + 1;
830247a4 7724 RExC_parse++; /* skip over the ending ] */
3dab1dad 7725
b8c5462f 7726 if (*s == ':') {
1df70142
AL
7727 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7728 const I32 skip = t - posixcc;
80916619
NC
7729
7730 /* Initially switch on the length of the name. */
7731 switch (skip) {
7732 case 4:
3dab1dad
YO
7733 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7734 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7735 break;
80916619
NC
7736 case 5:
7737 /* Names all of length 5. */
7738 /* alnum alpha ascii blank cntrl digit graph lower
7739 print punct space upper */
7740 /* Offset 4 gives the best switch position. */
7741 switch (posixcc[4]) {
7742 case 'a':
3dab1dad
YO
7743 if (memEQ(posixcc, "alph", 4)) /* alpha */
7744 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7745 break;
7746 case 'e':
3dab1dad
YO
7747 if (memEQ(posixcc, "spac", 4)) /* space */
7748 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7749 break;
7750 case 'h':
3dab1dad
YO
7751 if (memEQ(posixcc, "grap", 4)) /* graph */
7752 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7753 break;
7754 case 'i':
3dab1dad
YO
7755 if (memEQ(posixcc, "asci", 4)) /* ascii */
7756 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7757 break;
7758 case 'k':
3dab1dad
YO
7759 if (memEQ(posixcc, "blan", 4)) /* blank */
7760 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7761 break;
7762 case 'l':
3dab1dad
YO
7763 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7764 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7765 break;
7766 case 'm':
3dab1dad
YO
7767 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7768 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7769 break;
7770 case 'r':
3dab1dad
YO
7771 if (memEQ(posixcc, "lowe", 4)) /* lower */
7772 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7773 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7774 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7775 break;
7776 case 't':
3dab1dad
YO
7777 if (memEQ(posixcc, "digi", 4)) /* digit */
7778 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7779 else if (memEQ(posixcc, "prin", 4)) /* print */
7780 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7781 else if (memEQ(posixcc, "punc", 4)) /* punct */
7782 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7783 break;
b8c5462f
JH
7784 }
7785 break;
80916619 7786 case 6:
3dab1dad
YO
7787 if (memEQ(posixcc, "xdigit", 6))
7788 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7789 break;
7790 }
80916619
NC
7791
7792 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7793 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7794 t - s - 1, s + 1);
80916619
NC
7795 assert (posixcc[skip] == ':');
7796 assert (posixcc[skip+1] == ']');
b45f050a 7797 } else if (!SIZE_ONLY) {
b8c5462f 7798 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7799
830247a4 7800 /* adjust RExC_parse so the warning shows after
b45f050a 7801 the class closes */
9a86a77b 7802 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7803 RExC_parse++;
b45f050a
JF
7804 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7805 }
b8c5462f
JH
7806 } else {
7807 /* Maternal grandfather:
7808 * "[:" ending in ":" but not in ":]" */
830247a4 7809 RExC_parse = s;
767d463e 7810 }
620e46c5
JH
7811 }
7812 }
7813
b8c5462f
JH
7814 return namedclass;
7815}
7816
7817STATIC void
830247a4 7818S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7819{
97aff369 7820 dVAR;
7918f24d
NC
7821
7822 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7823
3dab1dad 7824 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7825 const char *s = RExC_parse;
7826 const char c = *s++;
b8c5462f 7827
3dab1dad 7828 while (isALNUM(*s))
b8c5462f
JH
7829 s++;
7830 if (*s && c == *s && s[1] == ']') {
668c081a
NC
7831 ckWARN3reg(s+2,
7832 "POSIX syntax [%c %c] belongs inside character classes",
7833 c, c);
b45f050a
JF
7834
7835 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7836 if (POSIXCC_NOTYET(c)) {
830247a4 7837 /* adjust RExC_parse so the error shows after
b45f050a 7838 the class closes */
9a86a77b 7839 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7840 NOOP;
b45f050a
JF
7841 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7842 }
b8c5462f
JH
7843 }
7844 }
620e46c5
JH
7845}
7846
7f6f358c 7847
89836f1f
YO
7848#define _C_C_T_(NAME,TEST,WORD) \
7849ANYOF_##NAME: \
7850 if (LOC) \
7851 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7852 else { \
7853 for (value = 0; value < 256; value++) \
7854 if (TEST) \
7855 ANYOF_BITMAP_SET(ret, value); \
7856 } \
7857 yesno = '+'; \
7858 what = WORD; \
7859 break; \
7860case ANYOF_N##NAME: \
7861 if (LOC) \
7862 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7863 else { \
7864 for (value = 0; value < 256; value++) \
7865 if (!TEST) \
7866 ANYOF_BITMAP_SET(ret, value); \
7867 } \
7868 yesno = '!'; \
7869 what = WORD; \
7870 break
7871
e1d1eefb
YO
7872#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7873ANYOF_##NAME: \
7874 for (value = 0; value < 256; value++) \
7875 if (TEST) \
7876 ANYOF_BITMAP_SET(ret, value); \
7877 yesno = '+'; \
7878 what = WORD; \
7879 break; \
7880case ANYOF_N##NAME: \
7881 for (value = 0; value < 256; value++) \
7882 if (!TEST) \
7883 ANYOF_BITMAP_SET(ret, value); \
7884 yesno = '!'; \
7885 what = WORD; \
7886 break
89836f1f 7887
da7fcca4
YO
7888/*
7889 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7890 so that it is possible to override the option here without having to
7891 rebuild the entire core. as we are required to do if we change regcomp.h
7892 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7893*/
7894#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7895#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7896#endif
7897
7898#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7899#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7900#else
7901#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7902#endif
7903
7f6f358c
YO
7904/*
7905 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7906 matches the pattern or if the pattern matches a single char only and
7907 that char is < 256 and we are case insensitive then we produce an
7908 EXACT node instead.
7f6f358c 7909*/
89836f1f 7910
76e3520e 7911STATIC regnode *
3dab1dad 7912S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7913{
97aff369 7914 dVAR;
9a86a77b 7915 register UV nextvalue;
3568d838 7916 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7917 register IV range = 0;
e1d1eefb 7918 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7919 register regnode *ret;
ba210ebe 7920 STRLEN numlen;
ffc61ed2 7921 IV namedclass;
cbbf8932 7922 char *rangebegin = NULL;
936ed897 7923 bool need_class = 0;
c445ea15 7924 SV *listsv = NULL;
ffc61ed2 7925 UV n;
9e55ce06 7926 bool optimize_invert = TRUE;
cbbf8932 7927 AV* unicode_alternate = NULL;
1b2d223b
JH
7928#ifdef EBCDIC
7929 UV literal_endpoint = 0;
7930#endif
7f6f358c 7931 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7932
3dab1dad 7933 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7934 case we need to change the emitted regop to an EXACT. */
07be1b83 7935 const char * orig_parse = RExC_parse;
72f13be8 7936 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7937
7938 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
7939#ifndef DEBUGGING
7940 PERL_UNUSED_ARG(depth);
7941#endif
72f13be8 7942
3dab1dad 7943 DEBUG_PARSE("clas");
7f6f358c
YO
7944
7945 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7946 ret = reganode(pRExC_state, ANYOF, 0);
7947
7948 if (!SIZE_ONLY)
7949 ANYOF_FLAGS(ret) = 0;
7950
9a86a77b 7951 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7952 RExC_naughty++;
7953 RExC_parse++;
7954 if (!SIZE_ONLY)
7955 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7956 }
a0d0e21e 7957
73060fc4 7958 if (SIZE_ONLY) {
830247a4 7959 RExC_size += ANYOF_SKIP;
73060fc4
JH
7960 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7961 }
936ed897 7962 else {
830247a4 7963 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7964 if (FOLD)
7965 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7966 if (LOC)
7967 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7968 ANYOF_BITMAP_ZERO(ret);
396482e1 7969 listsv = newSVpvs("# comment\n");
a0d0e21e 7970 }
b8c5462f 7971
9a86a77b
JH
7972 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7973
b938889d 7974 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7975 checkposixcc(pRExC_state);
b8c5462f 7976
f064b6ad
HS
7977 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7978 if (UCHARAT(RExC_parse) == ']')
7979 goto charclassloop;
ffc61ed2 7980
fc8cd66c 7981parseit:
9a86a77b 7982 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7983
7984 charclassloop:
7985
7986 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7987
73b437c8 7988 if (!range)
830247a4 7989 rangebegin = RExC_parse;
ffc61ed2 7990 if (UTF) {
5e12f4fb 7991 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7992 RExC_end - RExC_parse,
9f7f3913 7993 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7994 RExC_parse += numlen;
7995 }
7996 else
7997 value = UCHARAT(RExC_parse++);
7f6f358c 7998
9a86a77b
JH
7999 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8000 if (value == '[' && POSIXCC(nextvalue))
830247a4 8001 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8002 else if (value == '\\') {
ffc61ed2 8003 if (UTF) {
5e12f4fb 8004 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8005 RExC_end - RExC_parse,
9f7f3913 8006 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8007 RExC_parse += numlen;
8008 }
8009 else
8010 value = UCHARAT(RExC_parse++);
470c3474 8011 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8012 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8013 * be a problem later if we want switch on Unicode.
8014 * A similar issue a little bit later when switching on
8015 * namedclass. --jhi */
ffc61ed2 8016 switch ((I32)value) {
b8c5462f
JH
8017 case 'w': namedclass = ANYOF_ALNUM; break;
8018 case 'W': namedclass = ANYOF_NALNUM; break;
8019 case 's': namedclass = ANYOF_SPACE; break;
8020 case 'S': namedclass = ANYOF_NSPACE; break;
8021 case 'd': namedclass = ANYOF_DIGIT; break;
8022 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8023 case 'v': namedclass = ANYOF_VERTWS; break;
8024 case 'V': namedclass = ANYOF_NVERTWS; break;
8025 case 'h': namedclass = ANYOF_HORIZWS; break;
8026 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8027 case 'N': /* Handle \N{NAME} in class */
8028 {
8029 /* We only pay attention to the first char of
8030 multichar strings being returned. I kinda wonder
8031 if this makes sense as it does change the behaviour
8032 from earlier versions, OTOH that behaviour was broken
8033 as well. */
8034 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8035 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8036 goto parseit;
8037 }
8038 value= v;
8039 }
8040 break;
ffc61ed2
JH
8041 case 'p':
8042 case 'P':
3dab1dad
YO
8043 {
8044 char *e;
af6f566e 8045 if (RExC_parse >= RExC_end)
2a4859cd 8046 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8047 if (*RExC_parse == '{') {
1df70142 8048 const U8 c = (U8)value;
ffc61ed2
JH
8049 e = strchr(RExC_parse++, '}');
8050 if (!e)
0da60cf5 8051 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8052 while (isSPACE(UCHARAT(RExC_parse)))
8053 RExC_parse++;
8054 if (e == RExC_parse)
0da60cf5 8055 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8056 n = e - RExC_parse;
ab13f0c7
JH
8057 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8058 n--;
ffc61ed2
JH
8059 }
8060 else {
8061 e = RExC_parse;
8062 n = 1;
8063 }
8064 if (!SIZE_ONLY) {
ab13f0c7
JH
8065 if (UCHARAT(RExC_parse) == '^') {
8066 RExC_parse++;
8067 n--;
8068 value = value == 'p' ? 'P' : 'p'; /* toggle */
8069 while (isSPACE(UCHARAT(RExC_parse))) {
8070 RExC_parse++;
8071 n--;
8072 }
8073 }
097eb12c
AL
8074 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8075 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8076 }
8077 RExC_parse = e + 1;
8078 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8079 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8080 }
f81125e2 8081 break;
b8c5462f
JH
8082 case 'n': value = '\n'; break;
8083 case 'r': value = '\r'; break;
8084 case 't': value = '\t'; break;
8085 case 'f': value = '\f'; break;
8086 case 'b': value = '\b'; break;
c7f1f016
NIS
8087 case 'e': value = ASCII_TO_NATIVE('\033');break;
8088 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 8089 case 'x':
ffc61ed2 8090 if (*RExC_parse == '{') {
a4c04bdc
NC
8091 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8092 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8093 char * const e = strchr(RExC_parse++, '}');
b81d288d 8094 if (!e)
ffc61ed2 8095 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8096
8097 numlen = e - RExC_parse;
8098 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8099 RExC_parse = e + 1;
8100 }
8101 else {
a4c04bdc 8102 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8103 numlen = 2;
8104 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8105 RExC_parse += numlen;
8106 }
9e08bc66
TS
8107 if (PL_encoding && value < 0x100)
8108 goto recode_encoding;
b8c5462f
JH
8109 break;
8110 case 'c':
f9d13529 8111 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8112 break;
8113 case '0': case '1': case '2': case '3': case '4':
8114 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
8115 {
8116 I32 flags = 0;
8117 numlen = 3;
8118 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8119 RExC_parse += numlen;
8120 if (PL_encoding && value < 0x100)
8121 goto recode_encoding;
8122 break;
8123 }
8124 recode_encoding:
8125 {
8126 SV* enc = PL_encoding;
8127 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8128 if (!enc && SIZE_ONLY)
8129 ckWARNreg(RExC_parse,
8130 "Invalid escape in the specified encoding");
9e08bc66
TS
8131 break;
8132 }
1028017a 8133 default:
668c081a
NC
8134 if (!SIZE_ONLY && isALPHA(value))
8135 ckWARN2reg(RExC_parse,
8136 "Unrecognized escape \\%c in character class passed through",
8137 (int)value);
1028017a 8138 break;
b8c5462f 8139 }
ffc61ed2 8140 } /* end of \blah */
1b2d223b
JH
8141#ifdef EBCDIC
8142 else
8143 literal_endpoint++;
8144#endif
ffc61ed2
JH
8145
8146 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8147
8148 if (!SIZE_ONLY && !need_class)
936ed897 8149 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8150
936ed897 8151 need_class = 1;
ffc61ed2
JH
8152
8153 /* a bad range like a-\d, a-[:digit:] ? */
8154 if (range) {
73b437c8 8155 if (!SIZE_ONLY) {
668c081a
NC
8156 const int w =
8157 RExC_parse >= rangebegin ?
8158 RExC_parse - rangebegin : 0;
8159 ckWARN4reg(RExC_parse,
b45f050a 8160 "False [] range \"%*.*s\"",
097eb12c 8161 w, w, rangebegin);
668c081a 8162
3568d838
JH
8163 if (prevvalue < 256) {
8164 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8165 ANYOF_BITMAP_SET(ret, '-');
8166 }
8167 else {
8168 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8169 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8170 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8171 }
b8c5462f 8172 }
ffc61ed2
JH
8173
8174 range = 0; /* this was not a true range */
73b437c8 8175 }
ffc61ed2 8176
89836f1f
YO
8177
8178
73b437c8 8179 if (!SIZE_ONLY) {
c49a72a9
NC
8180 const char *what = NULL;
8181 char yesno = 0;
8182
3568d838
JH
8183 if (namedclass > OOB_NAMEDCLASS)
8184 optimize_invert = FALSE;
e2962f66
JH
8185 /* Possible truncation here but in some 64-bit environments
8186 * the compiler gets heartburn about switch on 64-bit values.
8187 * A similar issue a little earlier when switching on value.
98f323fa 8188 * --jhi */
e2962f66 8189 switch ((I32)namedclass) {
da7fcca4
YO
8190
8191 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8192 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8193 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8194 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8195 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8196 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8197 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8198 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8199 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8200 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8201#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8202 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8203 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8204#else
8205 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8206 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8207#endif
89836f1f 8208 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8209 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8210 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8211 case ANYOF_ASCII:
8212 if (LOC)
936ed897 8213 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8214 else {
c7f1f016 8215#ifndef EBCDIC
1ba5c669
JH
8216 for (value = 0; value < 128; value++)
8217 ANYOF_BITMAP_SET(ret, value);
8218#else /* EBCDIC */
ffbc6a93 8219 for (value = 0; value < 256; value++) {
3a3c4447
JH
8220 if (isASCII(value))
8221 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8222 }
1ba5c669 8223#endif /* EBCDIC */
73b437c8 8224 }
c49a72a9
NC
8225 yesno = '+';
8226 what = "ASCII";
73b437c8
JH
8227 break;
8228 case ANYOF_NASCII:
8229 if (LOC)
936ed897 8230 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8231 else {
c7f1f016 8232#ifndef EBCDIC
1ba5c669
JH
8233 for (value = 128; value < 256; value++)
8234 ANYOF_BITMAP_SET(ret, value);
8235#else /* EBCDIC */
ffbc6a93 8236 for (value = 0; value < 256; value++) {
3a3c4447
JH
8237 if (!isASCII(value))
8238 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8239 }
1ba5c669 8240#endif /* EBCDIC */
73b437c8 8241 }
c49a72a9
NC
8242 yesno = '!';
8243 what = "ASCII";
89836f1f 8244 break;
ffc61ed2
JH
8245 case ANYOF_DIGIT:
8246 if (LOC)
8247 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8248 else {
8249 /* consecutive digits assumed */
8250 for (value = '0'; value <= '9'; value++)
8251 ANYOF_BITMAP_SET(ret, value);
8252 }
c49a72a9 8253 yesno = '+';
da7fcca4 8254 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8255 break;
8256 case ANYOF_NDIGIT:
8257 if (LOC)
8258 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8259 else {
8260 /* consecutive digits assumed */
8261 for (value = 0; value < '0'; value++)
8262 ANYOF_BITMAP_SET(ret, value);
8263 for (value = '9' + 1; value < 256; value++)
8264 ANYOF_BITMAP_SET(ret, value);
8265 }
c49a72a9 8266 yesno = '!';
da7fcca4 8267 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8268 break;
f81125e2
JP
8269 case ANYOF_MAX:
8270 /* this is to handle \p and \P */
8271 break;
73b437c8 8272 default:
b45f050a 8273 vFAIL("Invalid [::] class");
73b437c8 8274 break;
b8c5462f 8275 }
c49a72a9
NC
8276 if (what) {
8277 /* Strings such as "+utf8::isWord\n" */
8278 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8279 }
b8c5462f 8280 if (LOC)
936ed897 8281 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8282 continue;
a0d0e21e 8283 }
ffc61ed2
JH
8284 } /* end of namedclass \blah */
8285
a0d0e21e 8286 if (range) {
eb160463 8287 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8288 const int w = RExC_parse - rangebegin;
8289 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8290 range = 0; /* not a valid range */
73b437c8 8291 }
a0d0e21e
LW
8292 }
8293 else {
3568d838 8294 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8295 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8296 RExC_parse[1] != ']') {
8297 RExC_parse++;
ffc61ed2
JH
8298
8299 /* a bad range like \w-, [:word:]- ? */
8300 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8301 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8302 const int w =
afd78fd5
JH
8303 RExC_parse >= rangebegin ?
8304 RExC_parse - rangebegin : 0;
830247a4 8305 vWARN4(RExC_parse,
b45f050a 8306 "False [] range \"%*.*s\"",
097eb12c 8307 w, w, rangebegin);
afd78fd5 8308 }
73b437c8 8309 if (!SIZE_ONLY)
936ed897 8310 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8311 } else
ffc61ed2
JH
8312 range = 1; /* yeah, it's a range! */
8313 continue; /* but do it the next time */
a0d0e21e 8314 }
a687059c 8315 }
ffc61ed2 8316
93733859 8317 /* now is the next time */
07be1b83 8318 /*stored += (value - prevvalue + 1);*/
ae5c130c 8319 if (!SIZE_ONLY) {
3568d838 8320 if (prevvalue < 256) {
1df70142 8321 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8322 IV i;
3568d838 8323#ifdef EBCDIC
1b2d223b
JH
8324 /* In EBCDIC [\x89-\x91] should include
8325 * the \x8e but [i-j] should not. */
8326 if (literal_endpoint == 2 &&
8327 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8328 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8329 {
3568d838
JH
8330 if (isLOWER(prevvalue)) {
8331 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8332 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8333 stored++;
ffc61ed2 8334 ANYOF_BITMAP_SET(ret, i);
2670d666 8335 }
ffc61ed2 8336 } else {
3568d838 8337 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8338 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8339 stored++;
ffc61ed2 8340 ANYOF_BITMAP_SET(ret, i);
2670d666 8341 }
ffc61ed2 8342 }
8ada0baa 8343 }
ffc61ed2 8344 else
8ada0baa 8345#endif
07be1b83
YO
8346 for (i = prevvalue; i <= ceilvalue; i++) {
8347 if (!ANYOF_BITMAP_TEST(ret,i)) {
8348 stored++;
8349 ANYOF_BITMAP_SET(ret, i);
8350 }
8351 }
3568d838 8352 }
a5961de5 8353 if (value > 255 || UTF) {
1df70142
AL
8354 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8355 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8356 stored+=2; /* can't optimize this class */
ffc61ed2 8357 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8358 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8359 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8360 prevnatvalue, natvalue);
8361 }
8362 else if (prevnatvalue == natvalue) {
8363 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8364 if (FOLD) {
89ebb4a3 8365 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8366 STRLEN foldlen;
1df70142 8367 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8368
e294cc5d
JH
8369#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8370 if (RExC_precomp[0] == ':' &&
8371 RExC_precomp[1] == '[' &&
8372 (f == 0xDF || f == 0x92)) {
8373 f = NATIVE_TO_UNI(f);
8374 }
8375#endif
c840d2a2
JH
8376 /* If folding and foldable and a single
8377 * character, insert also the folded version
8378 * to the charclass. */
9e55ce06 8379 if (f != value) {
e294cc5d
JH
8380#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8381 if ((RExC_precomp[0] == ':' &&
8382 RExC_precomp[1] == '[' &&
8383 (f == 0xA2 &&
8384 (value == 0xFB05 || value == 0xFB06))) ?
8385 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8386 foldlen == (STRLEN)UNISKIP(f) )
8387#else
eb160463 8388 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8389#endif
9e55ce06
JH
8390 Perl_sv_catpvf(aTHX_ listsv,
8391 "%04"UVxf"\n", f);
8392 else {
8393 /* Any multicharacter foldings
8394 * require the following transform:
8395 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8396 * where E folds into "pq" and F folds
8397 * into "rst", all other characters
8398 * fold to single characters. We save
8399 * away these multicharacter foldings,
8400 * to be later saved as part of the
8401 * additional "s" data. */
8402 SV *sv;
8403
8404 if (!unicode_alternate)
8405 unicode_alternate = newAV();
740cce10
NC
8406 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8407 TRUE);
9e55ce06
JH
8408 av_push(unicode_alternate, sv);
8409 }
8410 }
254ba52a 8411
60a8b682
JH
8412 /* If folding and the value is one of the Greek
8413 * sigmas insert a few more sigmas to make the
8414 * folding rules of the sigmas to work right.
8415 * Note that not all the possible combinations
8416 * are handled here: some of them are handled
9e55ce06
JH
8417 * by the standard folding rules, and some of
8418 * them (literal or EXACTF cases) are handled
8419 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8420 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8421 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8422 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8423 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8424 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8425 }
8426 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8427 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8428 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8429 }
8430 }
ffc61ed2 8431 }
1b2d223b
JH
8432#ifdef EBCDIC
8433 literal_endpoint = 0;
8434#endif
8ada0baa 8435 }
ffc61ed2
JH
8436
8437 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8438 }
ffc61ed2 8439
936ed897 8440 if (need_class) {
4f66b38d 8441 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8442 if (SIZE_ONLY)
830247a4 8443 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8444 else
830247a4 8445 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8446 }
ffc61ed2 8447
7f6f358c
YO
8448
8449 if (SIZE_ONLY)
8450 return ret;
8451 /****** !SIZE_ONLY AFTER HERE *********/
8452
3b57cd43 8453 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8454 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8455 ) {
8456 /* optimize single char class to an EXACT node
8457 but *only* when its not a UTF/high char */
07be1b83
YO
8458 const char * cur_parse= RExC_parse;
8459 RExC_emit = (regnode *)orig_emit;
8460 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8461 ret = reg_node(pRExC_state,
8462 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8463 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8464 *STRING(ret)= (char)value;
8465 STR_LEN(ret)= 1;
8466 RExC_emit += STR_SZ(1);
ef8d46e8 8467 SvREFCNT_dec(listsv);
7f6f358c
YO
8468 return ret;
8469 }
ae5c130c 8470 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8471 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8472 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8473 ) {
a0ed51b3 8474 for (value = 0; value < 256; ++value) {
936ed897 8475 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8476 UV fold = PL_fold[value];
ffc61ed2
JH
8477
8478 if (fold != value)
8479 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8480 }
8481 }
936ed897 8482 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8483 }
ffc61ed2 8484
ae5c130c 8485 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8486 if (optimize_invert &&
ffc61ed2
JH
8487 /* If the only flag is inversion. */
8488 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8489 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8490 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8491 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8492 }
7f6f358c 8493 {
097eb12c 8494 AV * const av = newAV();
ffc61ed2 8495 SV *rv;
9e55ce06 8496 /* The 0th element stores the character class description
6a0407ee 8497 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8498 * to initialize the appropriate swash (which gets stored in
8499 * the 1st element), and also useful for dumping the regnode.
8500 * The 2nd element stores the multicharacter foldings,
6a0407ee 8501 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8502 av_store(av, 0, listsv);
8503 av_store(av, 1, NULL);
ad64d0ec
NC
8504 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8505 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8506 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8507 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8508 ARG_SET(ret, n);
a0ed51b3 8509 }
a0ed51b3
LW
8510 return ret;
8511}
89836f1f
YO
8512#undef _C_C_T_
8513
a0ed51b3 8514
bcdf7404
YO
8515/* reg_skipcomment()
8516
8517 Absorbs an /x style # comments from the input stream.
8518 Returns true if there is more text remaining in the stream.
8519 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8520 terminates the pattern without including a newline.
8521
8522 Note its the callers responsibility to ensure that we are
8523 actually in /x mode
8524
8525*/
8526
8527STATIC bool
8528S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8529{
8530 bool ended = 0;
7918f24d
NC
8531
8532 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8533
bcdf7404
YO
8534 while (RExC_parse < RExC_end)
8535 if (*RExC_parse++ == '\n') {
8536 ended = 1;
8537 break;
8538 }
8539 if (!ended) {
8540 /* we ran off the end of the pattern without ending
8541 the comment, so we have to add an \n when wrapping */
8542 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8543 return 0;
8544 } else
8545 return 1;
8546}
8547
8548/* nextchar()
8549
8550 Advance that parse position, and optionall absorbs
8551 "whitespace" from the inputstream.
8552
8553 Without /x "whitespace" means (?#...) style comments only,
8554 with /x this means (?#...) and # comments and whitespace proper.
8555
8556 Returns the RExC_parse point from BEFORE the scan occurs.
8557
8558 This is the /x friendly way of saying RExC_parse++.
8559*/
8560
76e3520e 8561STATIC char*
830247a4 8562S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8563{
097eb12c 8564 char* const retval = RExC_parse++;
a0d0e21e 8565
7918f24d
NC
8566 PERL_ARGS_ASSERT_NEXTCHAR;
8567
4633a7c4 8568 for (;;) {
830247a4
IZ
8569 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8570 RExC_parse[2] == '#') {
e994fd66
AE
8571 while (*RExC_parse != ')') {
8572 if (RExC_parse == RExC_end)
8573 FAIL("Sequence (?#... not terminated");
830247a4 8574 RExC_parse++;
e994fd66 8575 }
830247a4 8576 RExC_parse++;
4633a7c4
LW
8577 continue;
8578 }
bbe252da 8579 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8580 if (isSPACE(*RExC_parse)) {
8581 RExC_parse++;
748a9306
LW
8582 continue;
8583 }
830247a4 8584 else if (*RExC_parse == '#') {
bcdf7404
YO
8585 if ( reg_skipcomment( pRExC_state ) )
8586 continue;
748a9306 8587 }
748a9306 8588 }
4633a7c4 8589 return retval;
a0d0e21e 8590 }
a687059c
LW
8591}
8592
8593/*
c277df42 8594- reg_node - emit a node
a0d0e21e 8595*/
76e3520e 8596STATIC regnode * /* Location. */
830247a4 8597S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8598{
97aff369 8599 dVAR;
c277df42 8600 register regnode *ptr;
504618e9 8601 regnode * const ret = RExC_emit;
07be1b83 8602 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8603
7918f24d
NC
8604 PERL_ARGS_ASSERT_REG_NODE;
8605
c277df42 8606 if (SIZE_ONLY) {
830247a4
IZ
8607 SIZE_ALIGN(RExC_size);
8608 RExC_size += 1;
a0d0e21e
LW
8609 return(ret);
8610 }
3b57cd43
YO
8611 if (RExC_emit >= RExC_emit_bound)
8612 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8613
c277df42 8614 NODE_ALIGN_FILL(ret);
a0d0e21e 8615 ptr = ret;
c277df42 8616 FILL_ADVANCE_NODE(ptr, op);
7122b237 8617#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8618 if (RExC_offsets) { /* MJD */
07be1b83 8619 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8620 "reg_node", __LINE__,
13d6edb4 8621 PL_reg_name[op],
07be1b83
YO
8622 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8623 ? "Overwriting end of array!\n" : "OK",
8624 (UV)(RExC_emit - RExC_emit_start),
8625 (UV)(RExC_parse - RExC_start),
8626 (UV)RExC_offsets[0]));
ccb2c380 8627 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8628 }
7122b237 8629#endif
830247a4 8630 RExC_emit = ptr;
a0d0e21e 8631 return(ret);
a687059c
LW
8632}
8633
8634/*
a0d0e21e
LW
8635- reganode - emit a node with an argument
8636*/
76e3520e 8637STATIC regnode * /* Location. */
830247a4 8638S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8639{
97aff369 8640 dVAR;
c277df42 8641 register regnode *ptr;
504618e9 8642 regnode * const ret = RExC_emit;
07be1b83 8643 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8644
7918f24d
NC
8645 PERL_ARGS_ASSERT_REGANODE;
8646
c277df42 8647 if (SIZE_ONLY) {
830247a4
IZ
8648 SIZE_ALIGN(RExC_size);
8649 RExC_size += 2;
6bda09f9
YO
8650 /*
8651 We can't do this:
8652
8653 assert(2==regarglen[op]+1);
8654
8655 Anything larger than this has to allocate the extra amount.
8656 If we changed this to be:
8657
8658 RExC_size += (1 + regarglen[op]);
8659
8660 then it wouldn't matter. Its not clear what side effect
8661 might come from that so its not done so far.
8662 -- dmq
8663 */
a0d0e21e
LW
8664 return(ret);
8665 }
3b57cd43
YO
8666 if (RExC_emit >= RExC_emit_bound)
8667 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8668
c277df42 8669 NODE_ALIGN_FILL(ret);
a0d0e21e 8670 ptr = ret;
c277df42 8671 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8672#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8673 if (RExC_offsets) { /* MJD */
07be1b83 8674 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8675 "reganode",
ccb2c380 8676 __LINE__,
13d6edb4 8677 PL_reg_name[op],
07be1b83 8678 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8679 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8680 (UV)(RExC_emit - RExC_emit_start),
8681 (UV)(RExC_parse - RExC_start),
8682 (UV)RExC_offsets[0]));
ccb2c380 8683 Set_Cur_Node_Offset;
fac92740 8684 }
7122b237 8685#endif
830247a4 8686 RExC_emit = ptr;
a0d0e21e 8687 return(ret);
fe14fcc3
LW
8688}
8689
8690/*
cd439c50 8691- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8692*/
71207a34
AL
8693STATIC STRLEN
8694S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8695{
97aff369 8696 dVAR;
7918f24d
NC
8697
8698 PERL_ARGS_ASSERT_REGUNI;
8699
71207a34 8700 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8701}
8702
8703/*
a0d0e21e
LW
8704- reginsert - insert an operator in front of already-emitted operand
8705*
8706* Means relocating the operand.
8707*/
76e3520e 8708STATIC void
6bda09f9 8709S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8710{
97aff369 8711 dVAR;
c277df42
IZ
8712 register regnode *src;
8713 register regnode *dst;
8714 register regnode *place;
504618e9 8715 const int offset = regarglen[(U8)op];
6bda09f9 8716 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8717 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8718
8719 PERL_ARGS_ASSERT_REGINSERT;
def51078 8720 PERL_UNUSED_ARG(depth);
22c35a8c 8721/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8722 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8723 if (SIZE_ONLY) {
6bda09f9 8724 RExC_size += size;
a0d0e21e
LW
8725 return;
8726 }
a687059c 8727
830247a4 8728 src = RExC_emit;
6bda09f9 8729 RExC_emit += size;
830247a4 8730 dst = RExC_emit;
40d049e4 8731 if (RExC_open_parens) {
6bda09f9 8732 int paren;
3b57cd43 8733 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8734 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8735 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8736 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8737 RExC_open_parens[paren] += size;
8738 } else {
3b57cd43 8739 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8740 }
8741 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8742 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8743 RExC_close_parens[paren] += size;
8744 } else {
3b57cd43 8745 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8746 }
8747 }
6bda09f9 8748 }
40d049e4 8749
fac92740 8750 while (src > opnd) {
c277df42 8751 StructCopy(--src, --dst, regnode);
7122b237 8752#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8753 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8754 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8755 "reg_insert",
ccb2c380 8756 __LINE__,
13d6edb4 8757 PL_reg_name[op],
07be1b83
YO
8758 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8759 ? "Overwriting end of array!\n" : "OK",
8760 (UV)(src - RExC_emit_start),
8761 (UV)(dst - RExC_emit_start),
8762 (UV)RExC_offsets[0]));
ccb2c380
MP
8763 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8764 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8765 }
7122b237 8766#endif
fac92740
MJD
8767 }
8768
a0d0e21e
LW
8769
8770 place = opnd; /* Op node, where operand used to be. */
7122b237 8771#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8772 if (RExC_offsets) { /* MJD */
07be1b83 8773 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8774 "reginsert",
ccb2c380 8775 __LINE__,
13d6edb4 8776 PL_reg_name[op],
07be1b83 8777 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8778 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8779 (UV)(place - RExC_emit_start),
8780 (UV)(RExC_parse - RExC_start),
786e8c11 8781 (UV)RExC_offsets[0]));
ccb2c380 8782 Set_Node_Offset(place, RExC_parse);
45948336 8783 Set_Node_Length(place, 1);
fac92740 8784 }
7122b237 8785#endif
c277df42
IZ
8786 src = NEXTOPER(place);
8787 FILL_ADVANCE_NODE(place, op);
8788 Zero(src, offset, regnode);
a687059c
LW
8789}
8790
8791/*
c277df42 8792- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8793- SEE ALSO: regtail_study
a0d0e21e 8794*/
097eb12c 8795/* TODO: All three parms should be const */
76e3520e 8796STATIC void
3dab1dad 8797S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8798{
97aff369 8799 dVAR;
c277df42 8800 register regnode *scan;
72f13be8 8801 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8802
8803 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8804#ifndef DEBUGGING
8805 PERL_UNUSED_ARG(depth);
8806#endif
a0d0e21e 8807
c277df42 8808 if (SIZE_ONLY)
a0d0e21e
LW
8809 return;
8810
8811 /* Find last node. */
8812 scan = p;
8813 for (;;) {
504618e9 8814 regnode * const temp = regnext(scan);
3dab1dad
YO
8815 DEBUG_PARSE_r({
8816 SV * const mysv=sv_newmortal();
8817 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8818 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8819 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8820 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8821 (temp == NULL ? "->" : ""),
13d6edb4 8822 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8823 );
3dab1dad
YO
8824 });
8825 if (temp == NULL)
8826 break;
8827 scan = temp;
8828 }
8829
8830 if (reg_off_by_arg[OP(scan)]) {
8831 ARG_SET(scan, val - scan);
8832 }
8833 else {
8834 NEXT_OFF(scan) = val - scan;
8835 }
8836}
8837
07be1b83 8838#ifdef DEBUGGING
3dab1dad
YO
8839/*
8840- regtail_study - set the next-pointer at the end of a node chain of p to val.
8841- Look for optimizable sequences at the same time.
8842- currently only looks for EXACT chains.
07be1b83
YO
8843
8844This is expermental code. The idea is to use this routine to perform
8845in place optimizations on branches and groups as they are constructed,
8846with the long term intention of removing optimization from study_chunk so
8847that it is purely analytical.
8848
8849Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8850to control which is which.
8851
3dab1dad
YO
8852*/
8853/* TODO: All four parms should be const */
07be1b83 8854
3dab1dad
YO
8855STATIC U8
8856S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8857{
8858 dVAR;
8859 register regnode *scan;
07be1b83
YO
8860 U8 exact = PSEUDO;
8861#ifdef EXPERIMENTAL_INPLACESCAN
8862 I32 min = 0;
8863#endif
3dab1dad
YO
8864 GET_RE_DEBUG_FLAGS_DECL;
8865
7918f24d
NC
8866 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8867
07be1b83 8868
3dab1dad
YO
8869 if (SIZE_ONLY)
8870 return exact;
8871
8872 /* Find last node. */
8873
8874 scan = p;
8875 for (;;) {
8876 regnode * const temp = regnext(scan);
07be1b83
YO
8877#ifdef EXPERIMENTAL_INPLACESCAN
8878 if (PL_regkind[OP(scan)] == EXACT)
8879 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8880 return EXACT;
8881#endif
3dab1dad
YO
8882 if ( exact ) {
8883 switch (OP(scan)) {
8884 case EXACT:
8885 case EXACTF:
8886 case EXACTFL:
8887 if( exact == PSEUDO )
8888 exact= OP(scan);
07be1b83
YO
8889 else if ( exact != OP(scan) )
8890 exact= 0;
3dab1dad
YO
8891 case NOTHING:
8892 break;
8893 default:
8894 exact= 0;
8895 }
8896 }
8897 DEBUG_PARSE_r({
8898 SV * const mysv=sv_newmortal();
8899 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8900 regprop(RExC_rx, mysv, scan);
eaf3ca90 8901 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8902 SvPV_nolen_const(mysv),
eaf3ca90 8903 REG_NODE_NUM(scan),
13d6edb4 8904 PL_reg_name[exact]);
3dab1dad 8905 });
a0d0e21e
LW
8906 if (temp == NULL)
8907 break;
8908 scan = temp;
8909 }
07be1b83
YO
8910 DEBUG_PARSE_r({
8911 SV * const mysv_val=sv_newmortal();
8912 DEBUG_PARSE_MSG("");
8913 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8914 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8915 SvPV_nolen_const(mysv_val),
8916 (IV)REG_NODE_NUM(val),
8917 (IV)(val - scan)
07be1b83
YO
8918 );
8919 });
c277df42
IZ
8920 if (reg_off_by_arg[OP(scan)]) {
8921 ARG_SET(scan, val - scan);
a0ed51b3
LW
8922 }
8923 else {
c277df42
IZ
8924 NEXT_OFF(scan) = val - scan;
8925 }
3dab1dad
YO
8926
8927 return exact;
a687059c 8928}
07be1b83 8929#endif
a687059c
LW
8930
8931/*
a687059c
LW
8932 - regcurly - a little FSA that accepts {\d+,?\d*}
8933 */
d3b0eb15 8934#ifndef PERL_IN_XSUB_RE
ff3f963a
KW
8935I32
8936Perl_regcurly(register const char *s)
a687059c 8937{
7918f24d
NC
8938 PERL_ARGS_ASSERT_REGCURLY;
8939
a687059c
LW
8940 if (*s++ != '{')
8941 return FALSE;
f0fcb552 8942 if (!isDIGIT(*s))
a687059c 8943 return FALSE;
f0fcb552 8944 while (isDIGIT(*s))
a687059c
LW
8945 s++;
8946 if (*s == ',')
8947 s++;
f0fcb552 8948 while (isDIGIT(*s))
a687059c
LW
8949 s++;
8950 if (*s != '}')
8951 return FALSE;
8952 return TRUE;
8953}
d3b0eb15 8954#endif
a687059c
LW
8955
8956/*
fd181c75 8957 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8958 */
f7819f85 8959#ifdef DEBUGGING
c33269f7 8960static void
7918f24d
NC
8961S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8962{
f7819f85
A
8963 int bit;
8964 int set=0;
7918f24d 8965
f7819f85
A
8966 for (bit=0; bit<32; bit++) {
8967 if (flags & (1<<bit)) {
8968 if (!set++ && lead)
8969 PerlIO_printf(Perl_debug_log, "%s",lead);
8970 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8971 }
8972 }
8973 if (lead) {
8974 if (set)
8975 PerlIO_printf(Perl_debug_log, "\n");
8976 else
8977 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8978 }
8979}
8980#endif
8981
a687059c 8982void
097eb12c 8983Perl_regdump(pTHX_ const regexp *r)
a687059c 8984{
35ff7856 8985#ifdef DEBUGGING
97aff369 8986 dVAR;
c445ea15 8987 SV * const sv = sv_newmortal();
ab3bbdeb 8988 SV *dsv= sv_newmortal();
f8fc2ecf 8989 RXi_GET_DECL(r,ri);
f7819f85 8990 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8991
7918f24d
NC
8992 PERL_ARGS_ASSERT_REGDUMP;
8993
f8fc2ecf 8994 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8995
8996 /* Header fields of interest. */
ab3bbdeb
YO
8997 if (r->anchored_substr) {
8998 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8999 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9000 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9001 "anchored %s%s at %"IVdf" ",
9002 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9003 (IV)r->anchored_offset);
ab3bbdeb
YO
9004 } else if (r->anchored_utf8) {
9005 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9006 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9007 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9008 "anchored utf8 %s%s at %"IVdf" ",
9009 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9010 (IV)r->anchored_offset);
ab3bbdeb
YO
9011 }
9012 if (r->float_substr) {
9013 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9014 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9015 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9016 "floating %s%s at %"IVdf"..%"UVuf" ",
9017 s, RE_SV_TAIL(r->float_substr),
7b0972df 9018 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9019 } else if (r->float_utf8) {
9020 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9021 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9022 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9023 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9024 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9025 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9026 }
33b8afdf 9027 if (r->check_substr || r->check_utf8)
b81d288d 9028 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9029 (const char *)
9030 (r->check_substr == r->float_substr
9031 && r->check_utf8 == r->float_utf8
9032 ? "(checking floating" : "(checking anchored"));
bbe252da 9033 if (r->extflags & RXf_NOSCAN)
c277df42 9034 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9035 if (r->extflags & RXf_CHECK_ALL)
c277df42 9036 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9037 if (r->check_substr || r->check_utf8)
c277df42
IZ
9038 PerlIO_printf(Perl_debug_log, ") ");
9039
f8fc2ecf
YO
9040 if (ri->regstclass) {
9041 regprop(r, sv, ri->regstclass);
1de06328 9042 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9043 }
bbe252da 9044 if (r->extflags & RXf_ANCH) {
774d564b 9045 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9046 if (r->extflags & RXf_ANCH_BOL)
774d564b 9047 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9048 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9049 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9050 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9051 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9052 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9053 PerlIO_printf(Perl_debug_log, "(GPOS)");
9054 PerlIO_putc(Perl_debug_log, ' ');
9055 }
bbe252da 9056 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9057 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9058 if (r->intflags & PREGf_SKIP)
760ac839 9059 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9060 if (r->intflags & PREGf_IMPLICIT)
760ac839 9061 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9062 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9063 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9064 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9065 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9066 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9067#else
7918f24d 9068 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9069 PERL_UNUSED_CONTEXT;
65e66c80 9070 PERL_UNUSED_ARG(r);
17c3b450 9071#endif /* DEBUGGING */
a687059c
LW
9072}
9073
9074/*
a0d0e21e
LW
9075- regprop - printable representation of opcode
9076*/
3339dfd8
YO
9077#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9078STMT_START { \
9079 if (do_sep) { \
9080 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9081 if (flags & ANYOF_INVERT) \
9082 /*make sure the invert info is in each */ \
9083 sv_catpvs(sv, "^"); \
9084 do_sep = 0; \
9085 } \
9086} STMT_END
9087
46fc3d4c 9088void
32fc9b6a 9089Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9090{
35ff7856 9091#ifdef DEBUGGING
97aff369 9092 dVAR;
9b155405 9093 register int k;
f8fc2ecf 9094 RXi_GET_DECL(prog,progi);
1de06328 9095 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9096
7918f24d 9097 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9098
76f68e9b 9099 sv_setpvs(sv, "");
8aa23a47 9100
03363afd 9101 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9102 /* It would be nice to FAIL() here, but this may be called from
9103 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9104 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9105 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9106
3dab1dad 9107 k = PL_regkind[OP(o)];
9b155405 9108
2a782b5b 9109 if (k == EXACT) {
f92a2122 9110 sv_catpvs(sv, " ");
ab3bbdeb
YO
9111 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9112 * is a crude hack but it may be the best for now since
9113 * we have no flag "this EXACTish node was UTF-8"
9114 * --jhi */
f92a2122
NC
9115 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9116 PERL_PV_ESCAPE_UNI_DETECT |
9117 PERL_PV_PRETTY_ELLIPSES |
9118 PERL_PV_PRETTY_LTGT |
9119 PERL_PV_PRETTY_NOCLEAR
9120 );
bb263b4e 9121 } else if (k == TRIE) {
3dab1dad 9122 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9123 * progi->data isn't available here */
1de06328 9124 const char op = OP(o);
647f639f 9125 const U32 n = ARG(o);
1de06328 9126 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9127 (reg_ac_data *)progi->data->data[n] :
1de06328 9128 NULL;
3251b653
NC
9129 const reg_trie_data * const trie
9130 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9131
13d6edb4 9132 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9133 DEBUG_TRIE_COMPILE_r(
9134 Perl_sv_catpvf(aTHX_ sv,
9135 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9136 (UV)trie->startstate,
1e2e3d02 9137 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9138 (UV)trie->wordcount,
9139 (UV)trie->minlen,
9140 (UV)trie->maxlen,
9141 (UV)TRIE_CHARCOUNT(trie),
9142 (UV)trie->uniquecharcount
9143 )
9144 );
9145 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9146 int i;
9147 int rangestart = -1;
f46cb337 9148 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9149 sv_catpvs(sv, "[");
1de06328
YO
9150 for (i = 0; i <= 256; i++) {
9151 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9152 if (rangestart == -1)
9153 rangestart = i;
9154 } else if (rangestart != -1) {
9155 if (i <= rangestart + 3)
9156 for (; rangestart < i; rangestart++)
9157 put_byte(sv, rangestart);
9158 else {
9159 put_byte(sv, rangestart);
9160 sv_catpvs(sv, "-");
9161 put_byte(sv, i - 1);
9162 }
9163 rangestart = -1;
9164 }
9165 }
f3a2811a 9166 sv_catpvs(sv, "]");
1de06328
YO
9167 }
9168
a3621e74 9169 } else if (k == CURLY) {
cb434fcc 9170 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9171 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9172 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9173 }
2c2d71f5
JH
9174 else if (k == WHILEM && o->flags) /* Ordinal/of */
9175 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9176 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9177 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9178 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9179 if ( k != REF || OP(o) < NREF) {
502c6561 9180 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9181 SV **name= av_fetch(list, ARG(o), 0 );
9182 if (name)
9183 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9184 }
9185 else {
502c6561 9186 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9187 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9188 I32 *nums=(I32*)SvPVX(sv_dat);
9189 SV **name= av_fetch(list, nums[0], 0 );
9190 I32 n;
9191 if (name) {
9192 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9193 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9194 (n ? "," : ""), (IV)nums[n]);
9195 }
9196 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9197 }
1f1031fe 9198 }
ee9b8eae 9199 }
1f1031fe 9200 } else if (k == GOSUB)
6bda09f9 9201 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9202 else if (k == VERB) {
9203 if (!o->flags)
9204 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9205 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9206 } else if (k == LOGICAL)
04ebc1ab 9207 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9208 else if (k == FOLDCHAR)
df44d732 9209 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9210 else if (k == ANYOF) {
9211 int i, rangestart = -1;
2d03de9c 9212 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9213 int do_sep = 0;
0bd48802
AL
9214
9215 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9216 static const char * const anyofs[] = {
653099ff
GS
9217 "\\w",
9218 "\\W",
9219 "\\s",
9220 "\\S",
9221 "\\d",
9222 "\\D",
9223 "[:alnum:]",
9224 "[:^alnum:]",
9225 "[:alpha:]",
9226 "[:^alpha:]",
9227 "[:ascii:]",
9228 "[:^ascii:]",
24d786f4
YO
9229 "[:cntrl:]",
9230 "[:^cntrl:]",
653099ff
GS
9231 "[:graph:]",
9232 "[:^graph:]",
9233 "[:lower:]",
9234 "[:^lower:]",
9235 "[:print:]",
9236 "[:^print:]",
9237 "[:punct:]",
9238 "[:^punct:]",
9239 "[:upper:]",
aaa51d5e 9240 "[:^upper:]",
653099ff 9241 "[:xdigit:]",
aaa51d5e
JF
9242 "[:^xdigit:]",
9243 "[:space:]",
9244 "[:^space:]",
9245 "[:blank:]",
9246 "[:^blank:]"
653099ff
GS
9247 };
9248
19860706 9249 if (flags & ANYOF_LOCALE)
396482e1 9250 sv_catpvs(sv, "{loc}");
19860706 9251 if (flags & ANYOF_FOLD)
396482e1 9252 sv_catpvs(sv, "{i}");
653099ff 9253 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9254 if (flags & ANYOF_INVERT)
396482e1 9255 sv_catpvs(sv, "^");
3339dfd8
YO
9256
9257 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9258 for (i = 0; i <= 256; i++) {
9259 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9260 if (rangestart == -1)
9261 rangestart = i;
9262 } else if (rangestart != -1) {
9263 if (i <= rangestart + 3)
9264 for (; rangestart < i; rangestart++)
653099ff 9265 put_byte(sv, rangestart);
ffc61ed2
JH
9266 else {
9267 put_byte(sv, rangestart);
396482e1 9268 sv_catpvs(sv, "-");
ffc61ed2 9269 put_byte(sv, i - 1);
653099ff 9270 }
24d786f4 9271 do_sep = 1;
ffc61ed2 9272 rangestart = -1;
653099ff 9273 }
847a199f 9274 }
3339dfd8
YO
9275
9276 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9277 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9278 if (o->flags & ANYOF_CLASS)
bb7a0f54 9279 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9280 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9281 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9282 do_sep = 1;
9283 }
9284
3339dfd8
YO
9285 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9286
9287 /* output information about the unicode matching */
ffc61ed2 9288 if (flags & ANYOF_UNICODE)
396482e1 9289 sv_catpvs(sv, "{unicode}");
1aa99e6b 9290 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9291 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9292
9293 {
9294 SV *lv;
32fc9b6a 9295 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9296
ffc61ed2
JH
9297 if (lv) {
9298 if (sw) {
89ebb4a3 9299 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9300
ffc61ed2 9301 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9302 uvchr_to_utf8(s, i);
ffc61ed2 9303
3568d838 9304 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9305 if (rangestart == -1)
9306 rangestart = i;
9307 } else if (rangestart != -1) {
ffc61ed2
JH
9308 if (i <= rangestart + 3)
9309 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9310 const U8 * const e = uvchr_to_utf8(s,rangestart);
9311 U8 *p;
9312 for(p = s; p < e; p++)
ffc61ed2
JH
9313 put_byte(sv, *p);
9314 }
9315 else {
2d03de9c
AL
9316 const U8 *e = uvchr_to_utf8(s,rangestart);
9317 U8 *p;
9318 for (p = s; p < e; p++)
ffc61ed2 9319 put_byte(sv, *p);
396482e1 9320 sv_catpvs(sv, "-");
2d03de9c
AL
9321 e = uvchr_to_utf8(s, i-1);
9322 for (p = s; p < e; p++)
1df70142 9323 put_byte(sv, *p);
ffc61ed2
JH
9324 }
9325 rangestart = -1;
9326 }
19860706 9327 }
ffc61ed2 9328
396482e1 9329 sv_catpvs(sv, "..."); /* et cetera */
19860706 9330 }
fde631ed 9331
ffc61ed2 9332 {
2e0de35c 9333 char *s = savesvpv(lv);
c445ea15 9334 char * const origs = s;
b81d288d 9335
3dab1dad
YO
9336 while (*s && *s != '\n')
9337 s++;
b81d288d 9338
ffc61ed2 9339 if (*s == '\n') {
2d03de9c 9340 const char * const t = ++s;
ffc61ed2
JH
9341
9342 while (*s) {
9343 if (*s == '\n')
9344 *s = ' ';
9345 s++;
9346 }
9347 if (s[-1] == ' ')
9348 s[-1] = 0;
9349
9350 sv_catpv(sv, t);
fde631ed 9351 }
b81d288d 9352
ffc61ed2 9353 Safefree(origs);
fde631ed
JH
9354 }
9355 }
653099ff 9356 }
ffc61ed2 9357
653099ff
GS
9358 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9359 }
9b155405 9360 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9361 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9362#else
96a5add6 9363 PERL_UNUSED_CONTEXT;
65e66c80
SP
9364 PERL_UNUSED_ARG(sv);
9365 PERL_UNUSED_ARG(o);
f9049ba1 9366 PERL_UNUSED_ARG(prog);
17c3b450 9367#endif /* DEBUGGING */
35ff7856 9368}
a687059c 9369
cad2e5aa 9370SV *
288b8c02 9371Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9372{ /* Assume that RE_INTUIT is set */
97aff369 9373 dVAR;
288b8c02 9374 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9375 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9376
9377 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9378 PERL_UNUSED_CONTEXT;
9379
a3621e74 9380 DEBUG_COMPILE_r(
cfd0369c 9381 {
2d03de9c 9382 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9383 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9384
9385 if (!PL_colorset) reginitcolors();
9386 PerlIO_printf(Perl_debug_log,
a0288114 9387 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9388 PL_colors[4],
9389 prog->check_substr ? "" : "utf8 ",
9390 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9391 s,
9392 PL_colors[1],
9393 (strlen(s) > 60 ? "..." : ""));
9394 } );
9395
33b8afdf 9396 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9397}
9398
84da74a7 9399/*
f8149455 9400 pregfree()
84da74a7 9401
f8149455
YO
9402 handles refcounting and freeing the perl core regexp structure. When
9403 it is necessary to actually free the structure the first thing it
9404 does is call the 'free' method of the regexp_engine associated to to
9405 the regexp, allowing the handling of the void *pprivate; member
9406 first. (This routine is not overridable by extensions, which is why
9407 the extensions free is called first.)
9408
9409 See regdupe and regdupe_internal if you change anything here.
84da74a7 9410*/
f8149455 9411#ifndef PERL_IN_XSUB_RE
2b69d0c2 9412void
84679df5 9413Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9414{
288b8c02
NC
9415 SvREFCNT_dec(r);
9416}
9417
9418void
9419Perl_pregfree2(pTHX_ REGEXP *rx)
9420{
27da23d5 9421 dVAR;
288b8c02 9422 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9423 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9424
7918f24d
NC
9425 PERL_ARGS_ASSERT_PREGFREE2;
9426
28d8d7f4
YO
9427 if (r->mother_re) {
9428 ReREFCNT_dec(r->mother_re);
9429 } else {
288b8c02 9430 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9431 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9432 }
9433 if (r->substrs) {
ef8d46e8
VP
9434 SvREFCNT_dec(r->anchored_substr);
9435 SvREFCNT_dec(r->anchored_utf8);
9436 SvREFCNT_dec(r->float_substr);
9437 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9438 Safefree(r->substrs);
9439 }
288b8c02 9440 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9441#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9442 SvREFCNT_dec(r->saved_copy);
ed252734 9443#endif
f0ab9afb 9444 Safefree(r->offs);
f8149455 9445}
28d8d7f4
YO
9446
9447/* reg_temp_copy()
9448
9449 This is a hacky workaround to the structural issue of match results
9450 being stored in the regexp structure which is in turn stored in
9451 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9452 could be PL_curpm in multiple contexts, and could require multiple
9453 result sets being associated with the pattern simultaneously, such
9454 as when doing a recursive match with (??{$qr})
9455
9456 The solution is to make a lightweight copy of the regexp structure
9457 when a qr// is returned from the code executed by (??{$qr}) this
9458 lightweight copy doesnt actually own any of its data except for
9459 the starp/end and the actual regexp structure itself.
9460
9461*/
9462
9463
84679df5 9464REGEXP *
f0826785 9465Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9466{
f0826785 9467 struct regexp *ret;
288b8c02 9468 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9469 register const I32 npar = r->nparens+1;
7918f24d
NC
9470
9471 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9472
f0826785
BM
9473 if (!ret_x)
9474 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9475 ret = (struct regexp *)SvANY(ret_x);
9476
288b8c02 9477 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9478 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9479 by pointing directly at the buffer, but flagging that the allocated
9480 space in the copy is zero. As we've just done a struct copy, it's now
9481 a case of zero-ing that, rather than copying the current length. */
9482 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9483 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9484 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9485 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9486 SvLEN_set(ret_x, 0);
b9ad13ac 9487 SvSTASH_set(ret_x, NULL);
703c388d 9488 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9489 Newx(ret->offs, npar, regexp_paren_pair);
9490 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9491 if (r->substrs) {
28d8d7f4 9492 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9493 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9494
9495 SvREFCNT_inc_void(ret->anchored_substr);
9496 SvREFCNT_inc_void(ret->anchored_utf8);
9497 SvREFCNT_inc_void(ret->float_substr);
9498 SvREFCNT_inc_void(ret->float_utf8);
9499
9500 /* check_substr and check_utf8, if non-NULL, point to either their
9501 anchored or float namesakes, and don't hold a second reference. */
486913e4 9502 }
288b8c02 9503 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9504#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9505 ret->saved_copy = NULL;
28d8d7f4 9506#endif
288b8c02 9507 ret->mother_re = rx;
28d8d7f4 9508
288b8c02 9509 return ret_x;
28d8d7f4 9510}
f8149455
YO
9511#endif
9512
9513/* regfree_internal()
9514
9515 Free the private data in a regexp. This is overloadable by
9516 extensions. Perl takes care of the regexp structure in pregfree(),
9517 this covers the *pprivate pointer which technically perldoesnt
9518 know about, however of course we have to handle the
9519 regexp_internal structure when no extension is in use.
9520
9521 Note this is called before freeing anything in the regexp
9522 structure.
9523 */
9524
9525void
288b8c02 9526Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9527{
9528 dVAR;
288b8c02 9529 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9530 RXi_GET_DECL(r,ri);
9531 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9532
9533 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9534
f8149455
YO
9535 DEBUG_COMPILE_r({
9536 if (!PL_colorset)
9537 reginitcolors();
9538 {
9539 SV *dsv= sv_newmortal();
3c8556c3 9540 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9541 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9542 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9543 PL_colors[4],PL_colors[5],s);
9544 }
9545 });
7122b237
YO
9546#ifdef RE_TRACK_PATTERN_OFFSETS
9547 if (ri->u.offsets)
9548 Safefree(ri->u.offsets); /* 20010421 MJD */
9549#endif
f8fc2ecf
YO
9550 if (ri->data) {
9551 int n = ri->data->count;
f3548bdc
DM
9552 PAD* new_comppad = NULL;
9553 PAD* old_comppad;
4026c95a 9554 PADOFFSET refcnt;
dfad63ad 9555
c277df42 9556 while (--n >= 0) {
261faec3 9557 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9558 switch (ri->data->what[n]) {
af534a04 9559 case 'a':
c277df42 9560 case 's':
81714fb9 9561 case 'S':
55eed653 9562 case 'u':
ad64d0ec 9563 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9564 break;
653099ff 9565 case 'f':
f8fc2ecf 9566 Safefree(ri->data->data[n]);
653099ff 9567 break;
dfad63ad 9568 case 'p':
502c6561 9569 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9570 break;
c277df42 9571 case 'o':
dfad63ad 9572 if (new_comppad == NULL)
cea2e8a9 9573 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9574 PAD_SAVE_LOCAL(old_comppad,
9575 /* Watch out for global destruction's random ordering. */
c445ea15 9576 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9577 );
b34c0dd4 9578 OP_REFCNT_LOCK;
f8fc2ecf 9579 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9580 OP_REFCNT_UNLOCK;
9581 if (!refcnt)
f8fc2ecf 9582 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9583
f3548bdc 9584 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9585 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9586 new_comppad = NULL;
c277df42
IZ
9587 break;
9588 case 'n':
9e55ce06 9589 break;
07be1b83 9590 case 'T':
be8e71aa
YO
9591 { /* Aho Corasick add-on structure for a trie node.
9592 Used in stclass optimization only */
07be1b83 9593 U32 refcount;
f8fc2ecf 9594 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9595 OP_REFCNT_LOCK;
9596 refcount = --aho->refcount;
9597 OP_REFCNT_UNLOCK;
9598 if ( !refcount ) {
446bd890
NC
9599 PerlMemShared_free(aho->states);
9600 PerlMemShared_free(aho->fail);
446bd890
NC
9601 /* do this last!!!! */
9602 PerlMemShared_free(ri->data->data[n]);
9603 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9604 }
9605 }
9606 break;
a3621e74 9607 case 't':
07be1b83 9608 {
be8e71aa 9609 /* trie structure. */
07be1b83 9610 U32 refcount;
f8fc2ecf 9611 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9612 OP_REFCNT_LOCK;
9613 refcount = --trie->refcount;
9614 OP_REFCNT_UNLOCK;
9615 if ( !refcount ) {
446bd890 9616 PerlMemShared_free(trie->charmap);
446bd890
NC
9617 PerlMemShared_free(trie->states);
9618 PerlMemShared_free(trie->trans);
07be1b83 9619 if (trie->bitmap)
446bd890 9620 PerlMemShared_free(trie->bitmap);
786e8c11 9621 if (trie->jump)
446bd890 9622 PerlMemShared_free(trie->jump);
2e64971a 9623 PerlMemShared_free(trie->wordinfo);
446bd890
NC
9624 /* do this last!!!! */
9625 PerlMemShared_free(ri->data->data[n]);
a3621e74 9626 }
07be1b83
YO
9627 }
9628 break;
c277df42 9629 default:
f8fc2ecf 9630 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9631 }
9632 }
f8fc2ecf
YO
9633 Safefree(ri->data->what);
9634 Safefree(ri->data);
a0d0e21e 9635 }
28d8d7f4 9636
f8fc2ecf 9637 Safefree(ri);
a687059c 9638}
c277df42 9639
a09252eb
NC
9640#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9641#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
9642#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9643
9644/*
32cd70f6 9645 re_dup - duplicate a regexp.
84da74a7 9646
8233f606
DM
9647 This routine is expected to clone a given regexp structure. It is only
9648 compiled under USE_ITHREADS.
32cd70f6 9649
f8149455
YO
9650 After all of the core data stored in struct regexp is duplicated
9651 the regexp_engine.dupe method is used to copy any private data
9652 stored in the *pprivate pointer. This allows extensions to handle
9653 any duplication it needs to do.
9654
9655 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9656*/
a3c0e9ca 9657#if defined(USE_ITHREADS)
f8149455 9658#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9659void
9660Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9661{
84da74a7 9662 dVAR;
a86a1ca7 9663 I32 npar;
288b8c02
NC
9664 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9665 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9666
7918f24d
NC
9667 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9668
84da74a7 9669 npar = r->nparens+1;
f0ab9afb
NC
9670 Newx(ret->offs, npar, regexp_paren_pair);
9671 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9672 if(ret->swap) {
28d8d7f4 9673 /* no need to copy these */
f0ab9afb 9674 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9675 }
84da74a7 9676
6057429f 9677 if (ret->substrs) {
32cd70f6
NC
9678 /* Do it this way to avoid reading from *r after the StructCopy().
9679 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9680 cache, it doesn't matter. */
66b1de87
NC
9681 const bool anchored = r->check_substr
9682 ? r->check_substr == r->anchored_substr
9683 : r->check_utf8 == r->anchored_utf8;
785a26d5 9684 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9685 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9686
32cd70f6
NC
9687 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9688 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9689 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9690 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9691
32cd70f6
NC
9692 /* check_substr and check_utf8, if non-NULL, point to either their
9693 anchored or float namesakes, and don't hold a second reference. */
9694
9695 if (ret->check_substr) {
9696 if (anchored) {
9697 assert(r->check_utf8 == r->anchored_utf8);
9698 ret->check_substr = ret->anchored_substr;
9699 ret->check_utf8 = ret->anchored_utf8;
9700 } else {
9701 assert(r->check_substr == r->float_substr);
9702 assert(r->check_utf8 == r->float_utf8);
9703 ret->check_substr = ret->float_substr;
9704 ret->check_utf8 = ret->float_utf8;
9705 }
66b1de87
NC
9706 } else if (ret->check_utf8) {
9707 if (anchored) {
9708 ret->check_utf8 = ret->anchored_utf8;
9709 } else {
9710 ret->check_utf8 = ret->float_utf8;
9711 }
32cd70f6 9712 }
6057429f 9713 }
f8149455 9714
5daac39c 9715 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9716
6057429f 9717 if (ret->pprivate)
288b8c02 9718 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9719
288b8c02 9720 if (RX_MATCH_COPIED(dstr))
6057429f 9721 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9722 else
9723 ret->subbeg = NULL;
9724#ifdef PERL_OLD_COPY_ON_WRITE
9725 ret->saved_copy = NULL;
9726#endif
6057429f 9727
c2123ae3
NC
9728 if (ret->mother_re) {
9729 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9730 /* Our storage points directly to our mother regexp, but that's
9731 1: a buffer in a different thread
9732 2: something we no longer hold a reference on
9733 so we need to copy it locally. */
9734 /* Note we need to sue SvCUR() on our mother_re, because it, in
9735 turn, may well be pointing to its own mother_re. */
9736 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9737 SvCUR(ret->mother_re)+1));
9738 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9739 }
9740 ret->mother_re = NULL;
9741 }
6057429f 9742 ret->gofs = 0;
f8149455
YO
9743}
9744#endif /* PERL_IN_XSUB_RE */
9745
9746/*
9747 regdupe_internal()
9748
9749 This is the internal complement to regdupe() which is used to copy
9750 the structure pointed to by the *pprivate pointer in the regexp.
9751 This is the core version of the extension overridable cloning hook.
9752 The regexp structure being duplicated will be copied by perl prior
9753 to this and will be provided as the regexp *r argument, however
9754 with the /old/ structures pprivate pointer value. Thus this routine
9755 may override any copying normally done by perl.
9756
9757 It returns a pointer to the new regexp_internal structure.
9758*/
9759
9760void *
288b8c02 9761Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9762{
9763 dVAR;
288b8c02 9764 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9765 regexp_internal *reti;
9766 int len, npar;
9767 RXi_GET_DECL(r,ri);
7918f24d
NC
9768
9769 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9770
9771 npar = r->nparens+1;
7122b237 9772 len = ProgLen(ri);
f8149455 9773
45cf4570 9774 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9775 Copy(ri->program, reti->program, len+1, regnode);
9776
f8149455 9777
f8fc2ecf 9778 reti->regstclass = NULL;
bcdf7404 9779
f8fc2ecf 9780 if (ri->data) {
84da74a7 9781 struct reg_data *d;
f8fc2ecf 9782 const int count = ri->data->count;
84da74a7
YO
9783 int i;
9784
9785 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9786 char, struct reg_data);
9787 Newx(d->what, count, U8);
9788
9789 d->count = count;
9790 for (i = 0; i < count; i++) {
f8fc2ecf 9791 d->what[i] = ri->data->what[i];
84da74a7 9792 switch (d->what[i]) {
af534a04 9793 /* legal options are one of: sSfpontTua
84da74a7 9794 see also regcomp.h and pregfree() */
af534a04 9795 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 9796 case 's':
81714fb9 9797 case 'S':
0536c0a7 9798 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9799 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9800 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9801 break;
84da74a7
YO
9802 case 'f':
9803 /* This is cheating. */
9804 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9805 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9806 struct regnode_charclass_class);
f8fc2ecf 9807 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9808 break;
9809 case 'o':
bbe252da
YO
9810 /* Compiled op trees are readonly and in shared memory,
9811 and can thus be shared without duplication. */
84da74a7 9812 OP_REFCNT_LOCK;
f8fc2ecf 9813 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9814 OP_REFCNT_UNLOCK;
9815 break;
23eab42c
NC
9816 case 'T':
9817 /* Trie stclasses are readonly and can thus be shared
9818 * without duplication. We free the stclass in pregfree
9819 * when the corresponding reg_ac_data struct is freed.
9820 */
9821 reti->regstclass= ri->regstclass;
9822 /* Fall through */
84da74a7 9823 case 't':
84da74a7 9824 OP_REFCNT_LOCK;
0536c0a7 9825 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9826 OP_REFCNT_UNLOCK;
0536c0a7
NC
9827 /* Fall through */
9828 case 'n':
9829 d->data[i] = ri->data->data[i];
84da74a7 9830 break;
84da74a7 9831 default:
f8fc2ecf 9832 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9833 }
9834 }
9835
f8fc2ecf 9836 reti->data = d;
84da74a7
YO
9837 }
9838 else
f8fc2ecf 9839 reti->data = NULL;
84da74a7 9840
cde0cee5
YO
9841 reti->name_list_idx = ri->name_list_idx;
9842
7122b237
YO
9843#ifdef RE_TRACK_PATTERN_OFFSETS
9844 if (ri->u.offsets) {
9845 Newx(reti->u.offsets, 2*len+1, U32);
9846 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9847 }
9848#else
9849 SetProgLen(reti,len);
9850#endif
9851
f8149455 9852 return (void*)reti;
84da74a7 9853}
f8149455
YO
9854
9855#endif /* USE_ITHREADS */
84da74a7 9856
f8149455 9857#ifndef PERL_IN_XSUB_RE
bcdf7404 9858
c277df42
IZ
9859/*
9860 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9861 */
9862regnode *
864dbfa3 9863Perl_regnext(pTHX_ register regnode *p)
c277df42 9864{
97aff369 9865 dVAR;
c277df42
IZ
9866 register I32 offset;
9867
f8fc2ecf 9868 if (!p)
c277df42
IZ
9869 return(NULL);
9870
35db910f
KW
9871 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
9872 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
9873 }
9874
c277df42
IZ
9875 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9876 if (offset == 0)
9877 return(NULL);
9878
c277df42 9879 return(p+offset);
c277df42 9880}
76234dfb 9881#endif
c277df42 9882
01f988be 9883STATIC void
cea2e8a9 9884S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9885{
9886 va_list args;
9887 STRLEN l1 = strlen(pat1);
9888 STRLEN l2 = strlen(pat2);
9889 char buf[512];
06bf62c7 9890 SV *msv;
73d840c0 9891 const char *message;
c277df42 9892
7918f24d
NC
9893 PERL_ARGS_ASSERT_RE_CROAK2;
9894
c277df42
IZ
9895 if (l1 > 510)
9896 l1 = 510;
9897 if (l1 + l2 > 510)
9898 l2 = 510 - l1;
9899 Copy(pat1, buf, l1 , char);
9900 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9901 buf[l1 + l2] = '\n';
9902 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9903#ifdef I_STDARG
9904 /* ANSI variant takes additional second argument */
c277df42 9905 va_start(args, pat2);
8736538c
AS
9906#else
9907 va_start(args);
9908#endif
5a844595 9909 msv = vmess(buf, &args);
c277df42 9910 va_end(args);
cfd0369c 9911 message = SvPV_const(msv,l1);
c277df42
IZ
9912 if (l1 > 512)
9913 l1 = 512;
9914 Copy(message, buf, l1 , char);
197cf9b9 9915 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9916 Perl_croak(aTHX_ "%s", buf);
c277df42 9917}
a0ed51b3
LW
9918
9919/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9920
76234dfb 9921#ifndef PERL_IN_XSUB_RE
a0ed51b3 9922void
864dbfa3 9923Perl_save_re_context(pTHX)
b81d288d 9924{
97aff369 9925 dVAR;
1ade1aa1
NC
9926
9927 struct re_save_state *state;
9928
9929 SAVEVPTR(PL_curcop);
9930 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9931
9932 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9933 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 9934 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 9935
46ab3289 9936 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9937
a0ed51b3 9938 PL_reg_start_tmp = 0;
a0ed51b3 9939 PL_reg_start_tmpl = 0;
c445ea15 9940 PL_reg_oldsaved = NULL;
a5db57d6 9941 PL_reg_oldsavedlen = 0;
a5db57d6 9942 PL_reg_maxiter = 0;
a5db57d6 9943 PL_reg_leftiter = 0;
c445ea15 9944 PL_reg_poscache = NULL;
a5db57d6 9945 PL_reg_poscache_size = 0;
1ade1aa1
NC
9946#ifdef PERL_OLD_COPY_ON_WRITE
9947 PL_nrs = NULL;
9948#endif
ada6e8a9 9949
c445ea15
AL
9950 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9951 if (PL_curpm) {
9952 const REGEXP * const rx = PM_GETRE(PL_curpm);
9953 if (rx) {
1df70142 9954 U32 i;
07bc277f 9955 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9956 char digits[TYPE_CHARS(long)];
d9fad198 9957 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9958 GV *const *const gvp
9959 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9960
b37c2d43
AL
9961 if (gvp) {
9962 GV * const gv = *gvp;
9963 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9964 save_scalar(gv);
49f27e4b 9965 }
ada6e8a9
AMS
9966 }
9967 }
9968 }
a0ed51b3 9969}
76234dfb 9970#endif
51371543 9971
51371543 9972static void
acfe0abc 9973clear_re(pTHX_ void *r)
51371543 9974{
97aff369 9975 dVAR;
84679df5 9976 ReREFCNT_dec((REGEXP *)r);
51371543 9977}
ffbc6a93 9978
a28509cc
AL
9979#ifdef DEBUGGING
9980
9981STATIC void
9982S_put_byte(pTHX_ SV *sv, int c)
9983{
7918f24d
NC
9984 PERL_ARGS_ASSERT_PUT_BYTE;
9985
7fddd944
NC
9986 /* Our definition of isPRINT() ignores locales, so only bytes that are
9987 not part of UTF-8 are considered printable. I assume that the same
9988 holds for UTF-EBCDIC.
9989 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9990 which Wikipedia says:
9991
9992 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9993 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9994 identical, to the ASCII delete (DEL) or rubout control character.
9995 ) So the old condition can be simplified to !isPRINT(c) */
9996 if (!isPRINT(c))
a28509cc 9997 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9998 else {
88c9ea1e 9999 const char string = c;
5e7aa789
NC
10000 if (c == '-' || c == ']' || c == '\\' || c == '^')
10001 sv_catpvs(sv, "\\");
10002 sv_catpvn(sv, &string, 1);
10003 }
a28509cc
AL
10004}
10005
786e8c11 10006
3dab1dad
YO
10007#define CLEAR_OPTSTART \
10008 if (optstart) STMT_START { \
70685ca0 10009 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10010 optstart=NULL; \
10011 } STMT_END
10012
786e8c11 10013#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10014
b5a2f8d8
NC
10015STATIC const regnode *
10016S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10017 const regnode *last, const regnode *plast,
10018 SV* sv, I32 indent, U32 depth)
a28509cc 10019{
97aff369 10020 dVAR;
786e8c11 10021 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10022 register const regnode *next;
3dab1dad 10023 const regnode *optstart= NULL;
1f1031fe 10024
f8fc2ecf 10025 RXi_GET_DECL(r,ri);
3dab1dad 10026 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10027
10028 PERL_ARGS_ASSERT_DUMPUNTIL;
10029
786e8c11
YO
10030#ifdef DEBUG_DUMPUNTIL
10031 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10032 last ? last-start : 0,plast ? plast-start : 0);
10033#endif
10034
10035 if (plast && plast < last)
10036 last= plast;
10037
10038 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10039 /* While that wasn't END last time... */
a28509cc
AL
10040 NODE_ALIGN(node);
10041 op = OP(node);
de734bd5 10042 if (op == CLOSE || op == WHILEM)
786e8c11 10043 indent--;
b5a2f8d8 10044 next = regnext((regnode *)node);
1f1031fe 10045
a28509cc 10046 /* Where, what. */
8e11feef 10047 if (OP(node) == OPTIMIZED) {
e68ec53f 10048 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10049 optstart = node;
3dab1dad 10050 else
8e11feef 10051 goto after_print;
3dab1dad
YO
10052 } else
10053 CLEAR_OPTSTART;
1f1031fe 10054
32fc9b6a 10055 regprop(r, sv, node);
a28509cc 10056 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10057 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10058
10059 if (OP(node) != OPTIMIZED) {
10060 if (next == NULL) /* Next ptr. */
10061 PerlIO_printf(Perl_debug_log, " (0)");
10062 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10063 PerlIO_printf(Perl_debug_log, " (FAIL)");
10064 else
10065 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10066 (void)PerlIO_putc(Perl_debug_log, '\n');
10067 }
10068
a28509cc
AL
10069 after_print:
10070 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10071 assert(next);
10072 {
10073 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10074 ? regnext((regnode *)next)
10075 : next);
be8e71aa
YO
10076 if (last && nnode > last)
10077 nnode = last;
786e8c11 10078 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10079 }
a28509cc
AL
10080 }
10081 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10082 assert(next);
786e8c11 10083 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10084 }
10085 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10086 const regnode *this_trie = node;
1de06328 10087 const char op = OP(node);
647f639f 10088 const U32 n = ARG(node);
1de06328 10089 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10090 (reg_ac_data *)ri->data->data[n] :
1de06328 10091 NULL;
3251b653
NC
10092 const reg_trie_data * const trie =
10093 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10094#ifdef DEBUGGING
502c6561 10095 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10096#endif
786e8c11 10097 const regnode *nextbranch= NULL;
a28509cc 10098 I32 word_idx;
76f68e9b 10099 sv_setpvs(sv, "");
786e8c11 10100 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10101 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10102
10103 PerlIO_printf(Perl_debug_log, "%*s%s ",
10104 (int)(2*(indent+3)), "",
10105 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10106 PL_colors[0], PL_colors[1],
10107 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10108 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10109 PERL_PV_PRETTY_LTGT
786e8c11
YO
10110 )
10111 : "???"
10112 );
10113 if (trie->jump) {
40d049e4 10114 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10115 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10116 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10117 if (dist) {
10118 if (!nextbranch)
24b23f37 10119 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10120 DUMPUNTIL(this_trie + dist, nextbranch);
10121 }
786e8c11
YO
10122 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10123 nextbranch= regnext((regnode *)nextbranch);
10124 } else {
10125 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10126 }
786e8c11
YO
10127 }
10128 if (last && next > last)
10129 node= last;
10130 else
10131 node= next;
a28509cc 10132 }
786e8c11
YO
10133 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10134 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10135 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10136 }
10137 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10138 assert(next);
786e8c11 10139 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10140 }
10141 else if ( op == PLUS || op == STAR) {
786e8c11 10142 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10143 }
10144 else if (op == ANYOF) {
10145 /* arglen 1 + class block */
10146 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10147 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10148 node = NEXTOPER(node);
10149 }
10150 else if (PL_regkind[(U8)op] == EXACT) {
10151 /* Literal string, where present. */
10152 node += NODE_SZ_STR(node) - 1;
10153 node = NEXTOPER(node);
10154 }
10155 else {
10156 node = NEXTOPER(node);
10157 node += regarglen[(U8)op];
10158 }
10159 if (op == CURLYX || op == OPEN)
786e8c11 10160 indent++;
a28509cc 10161 }
3dab1dad 10162 CLEAR_OPTSTART;
786e8c11 10163#ifdef DEBUG_DUMPUNTIL
70685ca0 10164 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10165#endif
1de06328 10166 return node;
a28509cc
AL
10167}
10168
10169#endif /* DEBUGGING */
10170
241d1a3b
NC
10171/*
10172 * Local variables:
10173 * c-indentation-style: bsd
10174 * c-basic-offset: 4
10175 * indent-tabs-mode: t
10176 * End:
10177 *
37442d52
RGS
10178 * ex: set ts=8 sts=4 sw=4 noet:
10179 */