This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
do subname() is deprecated, so this test from perl 3 needs updating.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
6bda09f9 135 HV *charnames; /* cache of named sequences */
81714fb9 136 HV *paren_names; /* Paren names */
1f1031fe 137
40d049e4
YO
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
3dab1dad 144#ifdef DEBUGGING
be8e71aa 145 const char *lastparse;
3dab1dad 146 I32 lastnum;
1f1031fe 147 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
148#define RExC_lastparse (pRExC_state->lastparse)
149#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 150#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 151#endif
830247a4
IZ
152} RExC_state_t;
153
e2509266 154#define RExC_flags (pRExC_state->flags)
830247a4 155#define RExC_precomp (pRExC_state->precomp)
288b8c02 156#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 157#define RExC_rx (pRExC_state->rx)
f8fc2ecf 158#define RExC_rxi (pRExC_state->rxi)
fac92740 159#define RExC_start (pRExC_state->start)
830247a4
IZ
160#define RExC_end (pRExC_state->end)
161#define RExC_parse (pRExC_state->parse)
162#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
163#ifdef RE_TRACK_PATTERN_OFFSETS
164#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165#endif
830247a4 166#define RExC_emit (pRExC_state->emit)
fac92740 167#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 168#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
169#define RExC_naughty (pRExC_state->naughty)
170#define RExC_sawback (pRExC_state->sawback)
171#define RExC_seen (pRExC_state->seen)
172#define RExC_size (pRExC_state->size)
173#define RExC_npar (pRExC_state->npar)
e2e6a0f1 174#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
175#define RExC_extralen (pRExC_state->extralen)
176#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 178#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 179#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 180#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3
YO
200#define HASWIDTH 0x01 /* Known to match non-null strings. */
201#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202#define SPSTART 0x04 /* Starts with * or +. */
203#define TRYAGAIN 0x08 /* Weeded out a declaration. */
204#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 205
3dab1dad
YO
206#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
207
07be1b83
YO
208/* whether trie related optimizations are enabled */
209#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210#define TRIE_STUDY_OPT
786e8c11 211#define FULL_TRIE_STUDY
07be1b83
YO
212#define TRIE_STCLASS
213#endif
1de06328
YO
214
215
40d049e4
YO
216
217#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218#define PBITVAL(paren) (1 << ((paren) & 7))
219#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222
223
1de06328
YO
224/* About scan_data_t.
225
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
232 in the pattern:
233
234 /FOO[xX]A.*B[xX]BAR/
235
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
239
240 The strings can be composites, for instance
241
242 /(f)(o)(o)/
243
244 will result in a composite fixed substring 'foo'.
245
246 For each string some basic information is maintained:
247
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
254 found.
255
256 - max_offset
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
260
261 - minlenp
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
265 involved. Consider
266
267 /(?=FOO).*F/
268
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
279
280 - lookbehind
281
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
292 associated string.
293
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
296
297*/
2c2d71f5
JH
298
299typedef struct scan_data_t {
1de06328
YO
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
2c2d71f5
JH
302 I32 pos_min;
303 I32 pos_delta;
304 SV *last_found;
1de06328 305 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
306 I32 last_start_min;
307 I32 last_start_max;
1de06328
YO
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
318 I32 flags;
319 I32 whilem_c;
cb434fcc 320 I32 *last_closep;
653099ff 321 struct regnode_charclass_class *start_class;
2c2d71f5
JH
322} scan_data_t;
323
a687059c 324/*
e50aee73 325 * Forward declarations for pregcomp()'s friends.
a687059c 326 */
a0d0e21e 327
27da23d5 328static const scan_data_t zero_scan_data =
1de06328 329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
330
331#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
332#define SF_BEFORE_SEOL 0x0001
333#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
334#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336
09b7f37c
CB
337#ifdef NO_UNARY_PLUS
338# define SF_FIX_SHIFT_EOL (0+2)
339# define SF_FL_SHIFT_EOL (0+4)
340#else
341# define SF_FIX_SHIFT_EOL (+2)
342# define SF_FL_SHIFT_EOL (+4)
343#endif
c277df42
IZ
344
345#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
347
348#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
350#define SF_IS_INF 0x0040
351#define SF_HAS_PAR 0x0080
352#define SF_IN_PAR 0x0100
353#define SF_HAS_EVAL 0x0200
354#define SCF_DO_SUBSTR 0x0400
653099ff
GS
355#define SCF_DO_STCLASS_AND 0x0800
356#define SCF_DO_STCLASS_OR 0x1000
357#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 358#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 359
786e8c11 360#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 361#define SCF_SEEN_ACCEPT 0x8000
07be1b83 362
eb160463 363#define UTF (RExC_utf8 != 0)
bbe252da
YO
364#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 366
ffc61ed2 367#define OOB_UNICODE 12345678
93733859 368#define OOB_NAMEDCLASS -1
b8c5462f 369
a0ed51b3
LW
370#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372
8615cb43 373
b45f050a
JF
374/* length of regex to show in messages that don't mark a position within */
375#define RegexLengthToShowInErrorMessages 127
376
377/*
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
381 */
7253e4e3
RK
382#define MARKER1 "<-- HERE" /* marker as it appears in the description */
383#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 384
7253e4e3 385#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
386
387/*
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 * "...".
391 */
58e23c8d 392#define _FAIL(code) STMT_START { \
bfed75c6 393 const char *ellipses = ""; \
ccb2c380
MP
394 IV len = RExC_end - RExC_precomp; \
395 \
396 if (!SIZE_ONLY) \
288b8c02 397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
401 ellipses = "..."; \
402 } \
58e23c8d 403 code; \
ccb2c380 404} STMT_END
8615cb43 405
58e23c8d
YO
406#define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
409
410#define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
413
b45f050a 414/*
b45f050a
JF
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
416 */
ccb2c380 417#define Simple_vFAIL(m) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
425 */
ccb2c380
MP
426#define vFAIL(m) STMT_START { \
427 if (!SIZE_ONLY) \
288b8c02 428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
429 Simple_vFAIL(m); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Like Simple_vFAIL(), but accepts two arguments.
434 */
ccb2c380 435#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
443 */
ccb2c380
MP
444#define vFAIL2(m,a1) STMT_START { \
445 if (!SIZE_ONLY) \
288b8c02 446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
447 Simple_vFAIL2(m, a1); \
448} STMT_END
b45f050a
JF
449
450
451/*
452 * Like Simple_vFAIL(), but accepts three arguments.
453 */
ccb2c380 454#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a
JF
459
460/*
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
462 */
ccb2c380
MP
463#define vFAIL3(m,a1,a2) STMT_START { \
464 if (!SIZE_ONLY) \
288b8c02 465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
466 Simple_vFAIL3(m, a1, a2); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Like Simple_vFAIL(), but accepts four arguments.
471 */
ccb2c380 472#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 473 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
b45f050a 477
668c081a 478#define ckWARNreg(loc,m) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
480 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
482} STMT_END
483
668c081a 484#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
668c081a 486 Perl_ck_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
487 m REPORT_LOCATION, \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
489} STMT_END
490
668c081a 491#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
668c081a 493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
494 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
496
497#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 498 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
499 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
501} STMT_END
502
668c081a
NC
503#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
507} STMT_END
508
ccb2c380 509#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 510 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
511 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
512 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
513} STMT_END
514
668c081a
NC
515#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
516 const IV offset = loc - RExC_precomp; \
517 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
518 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
519} STMT_END
520
ccb2c380 521#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 522 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
523 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
525} STMT_END
9d1d55b5 526
8615cb43 527
cd439c50 528/* Allow for side effects in s */
ccb2c380
MP
529#define REGC(c,s) STMT_START { \
530 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
531} STMT_END
cd439c50 532
fac92740
MJD
533/* Macros for recording node offsets. 20001227 mjd@plover.com
534 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
535 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
536 * Element 0 holds the number n.
07be1b83 537 * Position is 1 indexed.
fac92740 538 */
7122b237
YO
539#ifndef RE_TRACK_PATTERN_OFFSETS
540#define Set_Node_Offset_To_R(node,byte)
541#define Set_Node_Offset(node,byte)
542#define Set_Cur_Node_Offset
543#define Set_Node_Length_To_R(node,len)
544#define Set_Node_Length(node,len)
545#define Set_Node_Cur_Length(node)
546#define Node_Offset(n)
547#define Node_Length(n)
548#define Set_Node_Offset_Length(node,offset,len)
549#define ProgLen(ri) ri->u.proglen
550#define SetProgLen(ri,x) ri->u.proglen = x
551#else
552#define ProgLen(ri) ri->u.offsets[0]
553#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
554#define Set_Node_Offset_To_R(node,byte) STMT_START { \
555 if (! SIZE_ONLY) { \
556 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 557 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 558 if((node) < 0) { \
551405c4 559 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
560 } else { \
561 RExC_offsets[2*(node)-1] = (byte); \
562 } \
563 } \
564} STMT_END
565
566#define Set_Node_Offset(node,byte) \
567 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
568#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
569
570#define Set_Node_Length_To_R(node,len) STMT_START { \
571 if (! SIZE_ONLY) { \
572 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 573 __LINE__, (int)(node), (int)(len))); \
ccb2c380 574 if((node) < 0) { \
551405c4 575 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
576 } else { \
577 RExC_offsets[2*(node)] = (len); \
578 } \
579 } \
580} STMT_END
581
582#define Set_Node_Length(node,len) \
583 Set_Node_Length_To_R((node)-RExC_emit_start, len)
584#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
585#define Set_Node_Cur_Length(node) \
586 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
587
588/* Get offsets and lengths */
589#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
590#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
591
07be1b83
YO
592#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
593 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
594 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
595} STMT_END
7122b237 596#endif
07be1b83
YO
597
598#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
599#define EXPERIMENTAL_INPLACESCAN
7122b237 600#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 601
304ee84b
YO
602#define DEBUG_STUDYDATA(str,data,depth) \
603DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 604 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
605 "%*s" str "Pos:%"IVdf"/%"IVdf \
606 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
607 (int)(depth)*2, "", \
608 (IV)((data)->pos_min), \
609 (IV)((data)->pos_delta), \
304ee84b 610 (UV)((data)->flags), \
1de06328 611 (IV)((data)->whilem_c), \
304ee84b
YO
612 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
613 is_inf ? "INF " : "" \
1de06328
YO
614 ); \
615 if ((data)->last_found) \
616 PerlIO_printf(Perl_debug_log, \
617 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
618 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
619 SvPVX_const((data)->last_found), \
620 (IV)((data)->last_end), \
621 (IV)((data)->last_start_min), \
622 (IV)((data)->last_start_max), \
623 ((data)->longest && \
624 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
625 SvPVX_const((data)->longest_fixed), \
626 (IV)((data)->offset_fixed), \
627 ((data)->longest && \
628 (data)->longest==&((data)->longest_float)) ? "*" : "", \
629 SvPVX_const((data)->longest_float), \
630 (IV)((data)->offset_float_min), \
631 (IV)((data)->offset_float_max) \
632 ); \
633 PerlIO_printf(Perl_debug_log,"\n"); \
634});
635
acfe0abc 636static void clear_re(pTHX_ void *r);
4327152a 637
653099ff 638/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 639 Update the longest found anchored substring and the longest found
653099ff
GS
640 floating substrings if needed. */
641
4327152a 642STATIC void
304ee84b 643S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 644{
e1ec3a88
AL
645 const STRLEN l = CHR_SVLEN(data->last_found);
646 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 647 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 648
7918f24d
NC
649 PERL_ARGS_ASSERT_SCAN_COMMIT;
650
c277df42 651 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 652 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
653 if (*data->longest == data->longest_fixed) {
654 data->offset_fixed = l ? data->last_start_min : data->pos_min;
655 if (data->flags & SF_BEFORE_EOL)
b81d288d 656 data->flags
c277df42
IZ
657 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
658 else
659 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
660 data->minlen_fixed=minlenp;
661 data->lookbehind_fixed=0;
a0ed51b3 662 }
304ee84b 663 else { /* *data->longest == data->longest_float */
c277df42 664 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
665 data->offset_float_max = (l
666 ? data->last_start_max
c277df42 667 : data->pos_min + data->pos_delta);
304ee84b 668 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 669 data->offset_float_max = I32_MAX;
c277df42 670 if (data->flags & SF_BEFORE_EOL)
b81d288d 671 data->flags
c277df42
IZ
672 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
673 else
674 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
675 data->minlen_float=minlenp;
676 data->lookbehind_float=0;
c277df42
IZ
677 }
678 }
679 SvCUR_set(data->last_found, 0);
0eda9292 680 {
a28509cc 681 SV * const sv = data->last_found;
097eb12c
AL
682 if (SvUTF8(sv) && SvMAGICAL(sv)) {
683 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
684 if (mg)
685 mg->mg_len = 0;
686 }
0eda9292 687 }
c277df42
IZ
688 data->last_end = -1;
689 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 690 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
691}
692
653099ff
GS
693/* Can match anything (initialization) */
694STATIC void
097eb12c 695S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 696{
7918f24d
NC
697 PERL_ARGS_ASSERT_CL_ANYTHING;
698
653099ff 699 ANYOF_CLASS_ZERO(cl);
f8bef550 700 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 701 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
702 if (LOC)
703 cl->flags |= ANYOF_LOCALE;
704}
705
706/* Can match anything (initialization) */
707STATIC int
5f66b61c 708S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
709{
710 int value;
711
7918f24d
NC
712 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
713
aaa51d5e 714 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
715 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
716 return 1;
1aa99e6b
IH
717 if (!(cl->flags & ANYOF_UNICODE_ALL))
718 return 0;
10edeb5d 719 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 720 return 0;
653099ff
GS
721 return 1;
722}
723
724/* Can match anything (initialization) */
725STATIC void
097eb12c 726S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 727{
7918f24d
NC
728 PERL_ARGS_ASSERT_CL_INIT;
729
8ecf7187 730 Zero(cl, 1, struct regnode_charclass_class);
653099ff 731 cl->type = ANYOF;
830247a4 732 cl_anything(pRExC_state, cl);
653099ff
GS
733}
734
735STATIC void
097eb12c 736S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT_ZERO;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743 if (LOC)
744 cl->flags |= ANYOF_LOCALE;
745}
746
747/* 'And' a given class with another one. Can create false positives */
748/* We assume that cl is not inverted */
749STATIC void
5f66b61c 750S_cl_and(struct regnode_charclass_class *cl,
a28509cc 751 const struct regnode_charclass_class *and_with)
653099ff 752{
7918f24d 753 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
754
755 assert(and_with->type == ANYOF);
653099ff
GS
756 if (!(and_with->flags & ANYOF_CLASS)
757 && !(cl->flags & ANYOF_CLASS)
758 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
759 && !(and_with->flags & ANYOF_FOLD)
760 && !(cl->flags & ANYOF_FOLD)) {
761 int i;
762
763 if (and_with->flags & ANYOF_INVERT)
764 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
765 cl->bitmap[i] &= ~and_with->bitmap[i];
766 else
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] &= and_with->bitmap[i];
769 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
770 if (!(and_with->flags & ANYOF_EOS))
771 cl->flags &= ~ANYOF_EOS;
1aa99e6b 772
14ebb1a2
JH
773 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
774 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
775 cl->flags &= ~ANYOF_UNICODE_ALL;
776 cl->flags |= ANYOF_UNICODE;
777 ARG_SET(cl, ARG(and_with));
778 }
14ebb1a2
JH
779 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
780 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 781 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
782 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
783 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 784 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
785}
786
787/* 'OR' a given class with another one. Can create false positives */
788/* We assume that cl is not inverted */
789STATIC void
097eb12c 790S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 791{
7918f24d
NC
792 PERL_ARGS_ASSERT_CL_OR;
793
653099ff
GS
794 if (or_with->flags & ANYOF_INVERT) {
795 /* We do not use
796 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
797 * <= (B1 | !B2) | (CL1 | !CL2)
798 * which is wasteful if CL2 is small, but we ignore CL2:
799 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
800 * XXXX Can we handle case-fold? Unclear:
801 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
802 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
803 */
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(or_with->flags & ANYOF_FOLD)
806 && !(cl->flags & ANYOF_FOLD) ) {
807 int i;
808
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] |= ~or_with->bitmap[i];
811 } /* XXXX: logic is complicated otherwise */
812 else {
830247a4 813 cl_anything(pRExC_state, cl);
653099ff
GS
814 }
815 } else {
816 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 818 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
819 || (cl->flags & ANYOF_FOLD)) ) {
820 int i;
821
822 /* OR char bitmap and class bitmap separately */
823 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
824 cl->bitmap[i] |= or_with->bitmap[i];
825 if (or_with->flags & ANYOF_CLASS) {
826 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
827 cl->classflags[i] |= or_with->classflags[i];
828 cl->flags |= ANYOF_CLASS;
829 }
830 }
831 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 832 cl_anything(pRExC_state, cl);
653099ff
GS
833 }
834 }
835 if (or_with->flags & ANYOF_EOS)
836 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
837
838 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
839 ARG(cl) != ARG(or_with)) {
840 cl->flags |= ANYOF_UNICODE_ALL;
841 cl->flags &= ~ANYOF_UNICODE;
842 }
843 if (or_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags |= ANYOF_UNICODE_ALL;
845 cl->flags &= ~ANYOF_UNICODE;
846 }
653099ff
GS
847}
848
a3621e74
YO
849#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
850#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
851#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
852#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
853
3dab1dad
YO
854
855#ifdef DEBUGGING
07be1b83 856/*
2b8b4781
NC
857 dump_trie(trie,widecharmap,revcharmap)
858 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
859 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
860
861 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
862 The _interim_ variants are used for debugging the interim
863 tables that are used to generate the final compressed
864 representation which is what dump_trie expects.
865
3dab1dad
YO
866 Part of the reason for their existance is to provide a form
867 of documentation as to how the different representations function.
07be1b83
YO
868
869*/
3dab1dad
YO
870
871/*
3dab1dad
YO
872 Dumps the final compressed table form of the trie to Perl_debug_log.
873 Used for debugging make_trie().
874*/
875
876STATIC void
2b8b4781
NC
877S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
878 AV *revcharmap, U32 depth)
3dab1dad
YO
879{
880 U32 state;
ab3bbdeb 881 SV *sv=sv_newmortal();
55eed653 882 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
883 GET_RE_DEBUG_FLAGS_DECL;
884
7918f24d 885 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 886
3dab1dad
YO
887 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
888 (int)depth * 2 + 2,"",
889 "Match","Base","Ofs" );
890
891 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 892 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 893 if ( tmp ) {
ab3bbdeb
YO
894 PerlIO_printf( Perl_debug_log, "%*s",
895 colwidth,
ddc5bc0f 896 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
897 PL_colors[0], PL_colors[1],
898 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
899 PERL_PV_ESCAPE_FIRSTCHAR
900 )
901 );
3dab1dad
YO
902 }
903 }
904 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
905 (int)depth * 2 + 2,"");
906
907 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 908 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
909 PerlIO_printf( Perl_debug_log, "\n");
910
1e2e3d02 911 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 912 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
913
914 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
915
916 if ( trie->states[ state ].wordnum ) {
917 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
918 } else {
919 PerlIO_printf( Perl_debug_log, "%6s", "" );
920 }
921
922 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
923
924 if ( base ) {
925 U32 ofs = 0;
926
927 while( ( base + ofs < trie->uniquecharcount ) ||
928 ( base + ofs - trie->uniquecharcount < trie->lasttrans
929 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
930 ofs++;
931
932 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
933
934 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
935 if ( ( base + ofs >= trie->uniquecharcount ) &&
936 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
937 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
938 {
ab3bbdeb
YO
939 PerlIO_printf( Perl_debug_log, "%*"UVXf,
940 colwidth,
3dab1dad
YO
941 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
942 } else {
ab3bbdeb 943 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
944 }
945 }
946
947 PerlIO_printf( Perl_debug_log, "]");
948
949 }
950 PerlIO_printf( Perl_debug_log, "\n" );
951 }
952}
953/*
3dab1dad
YO
954 Dumps a fully constructed but uncompressed trie in list form.
955 List tries normally only are used for construction when the number of
956 possible chars (trie->uniquecharcount) is very high.
957 Used for debugging make_trie().
958*/
959STATIC void
55eed653 960S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
962 U32 depth)
3dab1dad
YO
963{
964 U32 state;
ab3bbdeb 965 SV *sv=sv_newmortal();
55eed653 966 int colwidth= widecharmap ? 6 : 4;
3dab1dad 967 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
968
969 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
970
3dab1dad 971 /* print out the table precompression. */
ab3bbdeb
YO
972 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
973 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
974 "------:-----+-----------------\n" );
3dab1dad
YO
975
976 for( state=1 ; state < next_alloc ; state ++ ) {
977 U16 charid;
978
ab3bbdeb 979 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
980 (int)depth * 2 + 2,"", (UV)state );
981 if ( ! trie->states[ state ].wordnum ) {
982 PerlIO_printf( Perl_debug_log, "%5s| ","");
983 } else {
984 PerlIO_printf( Perl_debug_log, "W%4x| ",
985 trie->states[ state ].wordnum
986 );
987 }
988 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 989 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
990 if ( tmp ) {
991 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
992 colwidth,
ddc5bc0f 993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
997 ) ,
1e2e3d02
YO
998 TRIE_LIST_ITEM(state,charid).forid,
999 (UV)TRIE_LIST_ITEM(state,charid).newstate
1000 );
1001 if (!(charid % 10))
664e119d
RGS
1002 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1003 (int)((depth * 2) + 14), "");
1e2e3d02 1004 }
ab3bbdeb
YO
1005 }
1006 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1007 }
1008}
1009
1010/*
3dab1dad
YO
1011 Dumps a fully constructed but uncompressed trie in table form.
1012 This is the normal DFA style state transition table, with a few
1013 twists to facilitate compression later.
1014 Used for debugging make_trie().
1015*/
1016STATIC void
55eed653 1017S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1018 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1019 U32 depth)
3dab1dad
YO
1020{
1021 U32 state;
1022 U16 charid;
ab3bbdeb 1023 SV *sv=sv_newmortal();
55eed653 1024 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1025 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1026
1027 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1028
1029 /*
1030 print out the table precompression so that we can do a visual check
1031 that they are identical.
1032 */
1033
1034 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1035
1036 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1037 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1038 if ( tmp ) {
ab3bbdeb
YO
1039 PerlIO_printf( Perl_debug_log, "%*s",
1040 colwidth,
ddc5bc0f 1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1045 )
1046 );
3dab1dad
YO
1047 }
1048 }
1049
1050 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1051
1052 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1053 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1054 }
1055
1056 PerlIO_printf( Perl_debug_log, "\n" );
1057
1058 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1059
1060 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1061 (int)depth * 2 + 2,"",
1062 (UV)TRIE_NODENUM( state ) );
1063
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1065 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1066 if (v)
1067 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1068 else
1069 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1070 }
1071 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1073 } else {
1074 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1075 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1076 }
1077 }
07be1b83 1078}
3dab1dad
YO
1079
1080#endif
1081
786e8c11
YO
1082/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1083 startbranch: the first branch in the whole branch sequence
1084 first : start branch of sequence of branch-exact nodes.
1085 May be the same as startbranch
1086 last : Thing following the last branch.
1087 May be the same as tail.
1088 tail : item following the branch sequence
1089 count : words in the sequence
1090 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1091 depth : indent depth
3dab1dad 1092
786e8c11 1093Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1094
786e8c11
YO
1095A trie is an N'ary tree where the branches are determined by digital
1096decomposition of the key. IE, at the root node you look up the 1st character and
1097follow that branch repeat until you find the end of the branches. Nodes can be
1098marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1099
786e8c11 1100 /he|she|his|hers/
72f13be8 1101
786e8c11
YO
1102would convert into the following structure. Numbers represent states, letters
1103following numbers represent valid transitions on the letter from that state, if
1104the number is in square brackets it represents an accepting state, otherwise it
1105will be in parenthesis.
07be1b83 1106
786e8c11
YO
1107 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1108 | |
1109 | (2)
1110 | |
1111 (1) +-i->(6)-+-s->[7]
1112 |
1113 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1114
786e8c11
YO
1115 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1116
1117This shows that when matching against the string 'hers' we will begin at state 1
1118read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1119then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1120is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1121single traverse. We store a mapping from accepting to state to which word was
1122matched, and then when we have multiple possibilities we try to complete the
1123rest of the regex in the order in which they occured in the alternation.
1124
1125The only prior NFA like behaviour that would be changed by the TRIE support is
1126the silent ignoring of duplicate alternations which are of the form:
1127
1128 / (DUPE|DUPE) X? (?{ ... }) Y /x
1129
1130Thus EVAL blocks follwing a trie may be called a different number of times with
1131and without the optimisation. With the optimisations dupes will be silently
1132ignored. This inconsistant behaviour of EVAL type nodes is well established as
1133the following demonstrates:
1134
1135 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1136
1137which prints out 'word' three times, but
1138
1139 'words'=~/(word|word|word)(?{ print $1 })S/
1140
1141which doesnt print it out at all. This is due to other optimisations kicking in.
1142
1143Example of what happens on a structural level:
1144
1145The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1146
1147 1: CURLYM[1] {1,32767}(18)
1148 5: BRANCH(8)
1149 6: EXACT <ac>(16)
1150 8: BRANCH(11)
1151 9: EXACT <ad>(16)
1152 11: BRANCH(14)
1153 12: EXACT <ab>(16)
1154 16: SUCCEED(0)
1155 17: NOTHING(18)
1156 18: END(0)
1157
1158This would be optimizable with startbranch=5, first=5, last=16, tail=16
1159and should turn into:
1160
1161 1: CURLYM[1] {1,32767}(18)
1162 5: TRIE(16)
1163 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1164 <ac>
1165 <ad>
1166 <ab>
1167 16: SUCCEED(0)
1168 17: NOTHING(18)
1169 18: END(0)
1170
1171Cases where tail != last would be like /(?foo|bar)baz/:
1172
1173 1: BRANCH(4)
1174 2: EXACT <foo>(8)
1175 4: BRANCH(7)
1176 5: EXACT <bar>(8)
1177 7: TAIL(8)
1178 8: EXACT <baz>(10)
1179 10: END(0)
1180
1181which would be optimizable with startbranch=1, first=1, last=7, tail=8
1182and would end up looking like:
1183
1184 1: TRIE(8)
1185 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1186 <foo>
1187 <bar>
1188 7: TAIL(8)
1189 8: EXACT <baz>(10)
1190 10: END(0)
1191
1192 d = uvuni_to_utf8_flags(d, uv, 0);
1193
1194is the recommended Unicode-aware way of saying
1195
1196 *(d++) = uv;
1197*/
1198
1e2e3d02 1199#define TRIE_STORE_REVCHAR \
786e8c11 1200 STMT_START { \
73031816
NC
1201 if (UTF) { \
1202 SV *zlopp = newSV(2); \
88c9ea1e
CB
1203 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1204 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1205 SvCUR_set(zlopp, kapow - flrbbbbb); \
1206 SvPOK_on(zlopp); \
1207 SvUTF8_on(zlopp); \
1208 av_push(revcharmap, zlopp); \
1209 } else { \
6bdeddd2 1210 char ooooff = (char)uvc; \
73031816
NC
1211 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1212 } \
1213 } STMT_END
786e8c11
YO
1214
1215#define TRIE_READ_CHAR STMT_START { \
1216 wordlen++; \
1217 if ( UTF ) { \
1218 if ( folder ) { \
1219 if ( foldlen > 0 ) { \
1220 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1221 foldlen -= len; \
1222 scan += len; \
1223 len = 0; \
1224 } else { \
1225 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1226 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1227 foldlen -= UNISKIP( uvc ); \
1228 scan = foldbuf + UNISKIP( uvc ); \
1229 } \
1230 } else { \
1231 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1232 } \
1233 } else { \
1234 uvc = (U32)*uc; \
1235 len = 1; \
1236 } \
1237} STMT_END
1238
1239
1240
1241#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1242 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1243 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1244 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1245 } \
1246 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1247 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1248 TRIE_LIST_CUR( state )++; \
1249} STMT_END
07be1b83 1250
786e8c11
YO
1251#define TRIE_LIST_NEW(state) STMT_START { \
1252 Newxz( trie->states[ state ].trans.list, \
1253 4, reg_trie_trans_le ); \
1254 TRIE_LIST_CUR( state ) = 1; \
1255 TRIE_LIST_LEN( state ) = 4; \
1256} STMT_END
07be1b83 1257
786e8c11
YO
1258#define TRIE_HANDLE_WORD(state) STMT_START { \
1259 U16 dupe= trie->states[ state ].wordnum; \
1260 regnode * const noper_next = regnext( noper ); \
1261 \
1262 if (trie->wordlen) \
1263 trie->wordlen[ curword ] = wordlen; \
1264 DEBUG_r({ \
1265 /* store the word for dumping */ \
1266 SV* tmp; \
1267 if (OP(noper) != NOTHING) \
740cce10 1268 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1269 else \
740cce10 1270 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1271 av_push( trie_words, tmp ); \
786e8c11
YO
1272 }); \
1273 \
1274 curword++; \
1275 \
1276 if ( noper_next < tail ) { \
1277 if (!trie->jump) \
c944940b 1278 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1279 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1280 if (!jumper) \
1281 jumper = noper_next; \
1282 if (!nextbranch) \
1283 nextbranch= regnext(cur); \
1284 } \
1285 \
1286 if ( dupe ) { \
1287 /* So it's a dupe. This means we need to maintain a */\
1288 /* linked-list from the first to the next. */\
1289 /* we only allocate the nextword buffer when there */\
1290 /* a dupe, so first time we have to do the allocation */\
1291 if (!trie->nextword) \
c944940b 1292 trie->nextword = (U16 *) \
446bd890 1293 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1294 while ( trie->nextword[dupe] ) \
1295 dupe= trie->nextword[dupe]; \
1296 trie->nextword[dupe]= curword; \
1297 } else { \
1298 /* we haven't inserted this word yet. */ \
1299 trie->states[ state ].wordnum = curword; \
1300 } \
1301} STMT_END
07be1b83 1302
3dab1dad 1303
786e8c11
YO
1304#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1305 ( ( base + charid >= ucharcount \
1306 && base + charid < ubound \
1307 && state == trie->trans[ base - ucharcount + charid ].check \
1308 && trie->trans[ base - ucharcount + charid ].next ) \
1309 ? trie->trans[ base - ucharcount + charid ].next \
1310 : ( state==1 ? special : 0 ) \
1311 )
3dab1dad 1312
786e8c11
YO
1313#define MADE_TRIE 1
1314#define MADE_JUMP_TRIE 2
1315#define MADE_EXACT_TRIE 4
3dab1dad 1316
a3621e74 1317STATIC I32
786e8c11 1318S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1319{
27da23d5 1320 dVAR;
a3621e74
YO
1321 /* first pass, loop through and scan words */
1322 reg_trie_data *trie;
55eed653 1323 HV *widecharmap = NULL;
2b8b4781 1324 AV *revcharmap = newAV();
a3621e74 1325 regnode *cur;
9f7f3913 1326 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1327 STRLEN len = 0;
1328 UV uvc = 0;
1329 U16 curword = 0;
1330 U32 next_alloc = 0;
786e8c11
YO
1331 regnode *jumper = NULL;
1332 regnode *nextbranch = NULL;
7f69552c 1333 regnode *convert = NULL;
a3621e74 1334 /* we just use folder as a flag in utf8 */
e1ec3a88 1335 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1336 ? PL_fold
1337 : ( flags == EXACTFL
1338 ? PL_fold_locale
1339 : NULL
1340 )
1341 );
1342
2b8b4781
NC
1343#ifdef DEBUGGING
1344 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1345 AV *trie_words = NULL;
1346 /* along with revcharmap, this only used during construction but both are
1347 * useful during debugging so we store them in the struct when debugging.
8e11feef 1348 */
2b8b4781
NC
1349#else
1350 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1351 STRLEN trie_charcount=0;
3dab1dad 1352#endif
2b8b4781 1353 SV *re_trie_maxbuff;
a3621e74 1354 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1355
1356 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1357#ifndef DEBUGGING
1358 PERL_UNUSED_ARG(depth);
1359#endif
a3621e74 1360
c944940b 1361 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1362 trie->refcount = 1;
3dab1dad 1363 trie->startstate = 1;
786e8c11 1364 trie->wordcount = word_count;
f8fc2ecf 1365 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1366 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1367 if (!(UTF && folder))
c944940b 1368 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1369 DEBUG_r({
2b8b4781 1370 trie_words = newAV();
a3621e74 1371 });
a3621e74 1372
0111c4fd 1373 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1374 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1375 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1376 }
3dab1dad
YO
1377 DEBUG_OPTIMISE_r({
1378 PerlIO_printf( Perl_debug_log,
786e8c11 1379 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1380 (int)depth * 2 + 2, "",
1381 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1382 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1383 (int)depth);
3dab1dad 1384 });
7f69552c
YO
1385
1386 /* Find the node we are going to overwrite */
1387 if ( first == startbranch && OP( last ) != BRANCH ) {
1388 /* whole branch chain */
1389 convert = first;
1390 } else {
1391 /* branch sub-chain */
1392 convert = NEXTOPER( first );
1393 }
1394
a3621e74
YO
1395 /* -- First loop and Setup --
1396
1397 We first traverse the branches and scan each word to determine if it
1398 contains widechars, and how many unique chars there are, this is
1399 important as we have to build a table with at least as many columns as we
1400 have unique chars.
1401
1402 We use an array of integers to represent the character codes 0..255
38a44b82 1403 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1404 native representation of the character value as the key and IV's for the
1405 coded index.
1406
1407 *TODO* If we keep track of how many times each character is used we can
1408 remap the columns so that the table compression later on is more
1409 efficient in terms of memory by ensuring most common value is in the
1410 middle and the least common are on the outside. IMO this would be better
1411 than a most to least common mapping as theres a decent chance the most
1412 common letter will share a node with the least common, meaning the node
1413 will not be compressable. With a middle is most common approach the worst
1414 case is when we have the least common nodes twice.
1415
1416 */
1417
a3621e74 1418 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1419 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1420 const U8 *uc = (U8*)STRING( noper );
a28509cc 1421 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1422 STRLEN foldlen = 0;
1423 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1424 const U8 *scan = (U8*)NULL;
07be1b83 1425 U32 wordlen = 0; /* required init */
02daf0ab
YO
1426 STRLEN chars = 0;
1427 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1428
3dab1dad
YO
1429 if (OP(noper) == NOTHING) {
1430 trie->minlen= 0;
1431 continue;
1432 }
02daf0ab
YO
1433 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1434 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1435 regardless of encoding */
1436
a3621e74 1437 for ( ; uc < e ; uc += len ) {
3dab1dad 1438 TRIE_CHARCOUNT(trie)++;
a3621e74 1439 TRIE_READ_CHAR;
3dab1dad 1440 chars++;
a3621e74
YO
1441 if ( uvc < 256 ) {
1442 if ( !trie->charmap[ uvc ] ) {
1443 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1444 if ( folder )
1445 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1446 TRIE_STORE_REVCHAR;
a3621e74 1447 }
02daf0ab
YO
1448 if ( set_bit ) {
1449 /* store the codepoint in the bitmap, and if its ascii
1450 also store its folded equivelent. */
1451 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1452
1453 /* store the folded codepoint */
1454 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1455
1456 if ( !UTF ) {
1457 /* store first byte of utf8 representation of
1458 codepoints in the 127 < uvc < 256 range */
1459 if (127 < uvc && uvc < 192) {
1460 TRIE_BITMAP_SET(trie,194);
1461 } else if (191 < uvc ) {
1462 TRIE_BITMAP_SET(trie,195);
1463 /* && uvc < 256 -- we know uvc is < 256 already */
1464 }
1465 }
02daf0ab
YO
1466 set_bit = 0; /* We've done our bit :-) */
1467 }
a3621e74
YO
1468 } else {
1469 SV** svpp;
55eed653
NC
1470 if ( !widecharmap )
1471 widecharmap = newHV();
a3621e74 1472
55eed653 1473 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1474
1475 if ( !svpp )
e4584336 1476 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1477
1478 if ( !SvTRUE( *svpp ) ) {
1479 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1480 TRIE_STORE_REVCHAR;
a3621e74
YO
1481 }
1482 }
1483 }
3dab1dad
YO
1484 if( cur == first ) {
1485 trie->minlen=chars;
1486 trie->maxlen=chars;
1487 } else if (chars < trie->minlen) {
1488 trie->minlen=chars;
1489 } else if (chars > trie->maxlen) {
1490 trie->maxlen=chars;
1491 }
1492
a3621e74
YO
1493 } /* end first pass */
1494 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1495 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1496 (int)depth * 2 + 2,"",
55eed653 1497 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1498 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1499 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1500 );
c944940b 1501 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1502
1503 /*
1504 We now know what we are dealing with in terms of unique chars and
1505 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1506 representation using a flat table will take. If it's over a reasonable
1507 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1508 conservative but potentially much slower representation using an array
1509 of lists.
1510
1511 At the end we convert both representations into the same compressed
1512 form that will be used in regexec.c for matching with. The latter
1513 is a form that cannot be used to construct with but has memory
1514 properties similar to the list form and access properties similar
1515 to the table form making it both suitable for fast searches and
1516 small enough that its feasable to store for the duration of a program.
1517
1518 See the comment in the code where the compressed table is produced
1519 inplace from the flat tabe representation for an explanation of how
1520 the compression works.
1521
1522 */
1523
1524
3dab1dad 1525 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1526 /*
1527 Second Pass -- Array Of Lists Representation
1528
1529 Each state will be represented by a list of charid:state records
1530 (reg_trie_trans_le) the first such element holds the CUR and LEN
1531 points of the allocated array. (See defines above).
1532
1533 We build the initial structure using the lists, and then convert
1534 it into the compressed table form which allows faster lookups
1535 (but cant be modified once converted).
a3621e74
YO
1536 */
1537
a3621e74
YO
1538 STRLEN transcount = 1;
1539
1e2e3d02
YO
1540 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1541 "%*sCompiling trie using list compiler\n",
1542 (int)depth * 2 + 2, ""));
446bd890 1543
c944940b
JH
1544 trie->states = (reg_trie_state *)
1545 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1546 sizeof(reg_trie_state) );
a3621e74
YO
1547 TRIE_LIST_NEW(1);
1548 next_alloc = 2;
1549
1550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1551
c445ea15
AL
1552 regnode * const noper = NEXTOPER( cur );
1553 U8 *uc = (U8*)STRING( noper );
1554 const U8 * const e = uc + STR_LEN( noper );
1555 U32 state = 1; /* required init */
1556 U16 charid = 0; /* sanity init */
1557 U8 *scan = (U8*)NULL; /* sanity init */
1558 STRLEN foldlen = 0; /* required init */
07be1b83 1559 U32 wordlen = 0; /* required init */
c445ea15
AL
1560 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1561
3dab1dad 1562 if (OP(noper) != NOTHING) {
786e8c11 1563 for ( ; uc < e ; uc += len ) {
c445ea15 1564
786e8c11 1565 TRIE_READ_CHAR;
c445ea15 1566
786e8c11
YO
1567 if ( uvc < 256 ) {
1568 charid = trie->charmap[ uvc ];
c445ea15 1569 } else {
55eed653 1570 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1571 if ( !svpp ) {
1572 charid = 0;
1573 } else {
1574 charid=(U16)SvIV( *svpp );
1575 }
c445ea15 1576 }
786e8c11
YO
1577 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1578 if ( charid ) {
a3621e74 1579
786e8c11
YO
1580 U16 check;
1581 U32 newstate = 0;
a3621e74 1582
786e8c11
YO
1583 charid--;
1584 if ( !trie->states[ state ].trans.list ) {
1585 TRIE_LIST_NEW( state );
c445ea15 1586 }
786e8c11
YO
1587 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1588 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1589 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1590 break;
1591 }
1592 }
1593 if ( ! newstate ) {
1594 newstate = next_alloc++;
1595 TRIE_LIST_PUSH( state, charid, newstate );
1596 transcount++;
1597 }
1598 state = newstate;
1599 } else {
1600 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1601 }
a28509cc 1602 }
c445ea15 1603 }
3dab1dad 1604 TRIE_HANDLE_WORD(state);
a3621e74
YO
1605
1606 } /* end second pass */
1607
1e2e3d02
YO
1608 /* next alloc is the NEXT state to be allocated */
1609 trie->statecount = next_alloc;
c944940b
JH
1610 trie->states = (reg_trie_state *)
1611 PerlMemShared_realloc( trie->states,
1612 next_alloc
1613 * sizeof(reg_trie_state) );
a3621e74 1614
3dab1dad 1615 /* and now dump it out before we compress it */
2b8b4781
NC
1616 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1617 revcharmap, next_alloc,
1618 depth+1)
1e2e3d02 1619 );
a3621e74 1620
c944940b
JH
1621 trie->trans = (reg_trie_trans *)
1622 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1623 {
1624 U32 state;
a3621e74
YO
1625 U32 tp = 0;
1626 U32 zp = 0;
1627
1628
1629 for( state=1 ; state < next_alloc ; state ++ ) {
1630 U32 base=0;
1631
1632 /*
1633 DEBUG_TRIE_COMPILE_MORE_r(
1634 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1635 );
1636 */
1637
1638 if (trie->states[state].trans.list) {
1639 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1640 U16 maxid=minid;
a28509cc 1641 U16 idx;
a3621e74
YO
1642
1643 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1644 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1645 if ( forid < minid ) {
1646 minid=forid;
1647 } else if ( forid > maxid ) {
1648 maxid=forid;
1649 }
a3621e74
YO
1650 }
1651 if ( transcount < tp + maxid - minid + 1) {
1652 transcount *= 2;
c944940b
JH
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_realloc( trie->trans,
446bd890
NC
1655 transcount
1656 * sizeof(reg_trie_trans) );
a3621e74
YO
1657 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1658 }
1659 base = trie->uniquecharcount + tp - minid;
1660 if ( maxid == minid ) {
1661 U32 set = 0;
1662 for ( ; zp < tp ; zp++ ) {
1663 if ( ! trie->trans[ zp ].next ) {
1664 base = trie->uniquecharcount + zp - minid;
1665 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1666 trie->trans[ zp ].check = state;
1667 set = 1;
1668 break;
1669 }
1670 }
1671 if ( !set ) {
1672 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1673 trie->trans[ tp ].check = state;
1674 tp++;
1675 zp = tp;
1676 }
1677 } else {
1678 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1679 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1680 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1681 trie->trans[ tid ].check = state;
1682 }
1683 tp += ( maxid - minid + 1 );
1684 }
1685 Safefree(trie->states[ state ].trans.list);
1686 }
1687 /*
1688 DEBUG_TRIE_COMPILE_MORE_r(
1689 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1690 );
1691 */
1692 trie->states[ state ].trans.base=base;
1693 }
cc601c31 1694 trie->lasttrans = tp + 1;
a3621e74
YO
1695 }
1696 } else {
1697 /*
1698 Second Pass -- Flat Table Representation.
1699
1700 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1701 We know that we will need Charcount+1 trans at most to store the data
1702 (one row per char at worst case) So we preallocate both structures
1703 assuming worst case.
1704
1705 We then construct the trie using only the .next slots of the entry
1706 structs.
1707
1708 We use the .check field of the first entry of the node temporarily to
1709 make compression both faster and easier by keeping track of how many non
1710 zero fields are in the node.
1711
1712 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1713 transition.
1714
1715 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1716 number representing the first entry of the node, and state as a
1717 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1718 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1719 are 2 entrys per node. eg:
1720
1721 A B A B
1722 1. 2 4 1. 3 7
1723 2. 0 3 3. 0 5
1724 3. 0 0 5. 0 0
1725 4. 0 0 7. 0 0
1726
1727 The table is internally in the right hand, idx form. However as we also
1728 have to deal with the states array which is indexed by nodenum we have to
1729 use TRIE_NODENUM() to convert.
1730
1731 */
1e2e3d02
YO
1732 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1733 "%*sCompiling trie using table compiler\n",
1734 (int)depth * 2 + 2, ""));
3dab1dad 1735
c944940b
JH
1736 trie->trans = (reg_trie_trans *)
1737 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1738 * trie->uniquecharcount + 1,
1739 sizeof(reg_trie_trans) );
1740 trie->states = (reg_trie_state *)
1741 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1742 sizeof(reg_trie_state) );
a3621e74
YO
1743 next_alloc = trie->uniquecharcount + 1;
1744
3dab1dad 1745
a3621e74
YO
1746 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1747
c445ea15 1748 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1749 const U8 *uc = (U8*)STRING( noper );
1750 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1751
1752 U32 state = 1; /* required init */
1753
1754 U16 charid = 0; /* sanity init */
1755 U32 accept_state = 0; /* sanity init */
1756 U8 *scan = (U8*)NULL; /* sanity init */
1757
1758 STRLEN foldlen = 0; /* required init */
07be1b83 1759 U32 wordlen = 0; /* required init */
a3621e74
YO
1760 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761
3dab1dad 1762 if ( OP(noper) != NOTHING ) {
786e8c11 1763 for ( ; uc < e ; uc += len ) {
a3621e74 1764
786e8c11 1765 TRIE_READ_CHAR;
a3621e74 1766
786e8c11
YO
1767 if ( uvc < 256 ) {
1768 charid = trie->charmap[ uvc ];
1769 } else {
55eed653 1770 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1771 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1772 }
786e8c11
YO
1773 if ( charid ) {
1774 charid--;
1775 if ( !trie->trans[ state + charid ].next ) {
1776 trie->trans[ state + charid ].next = next_alloc;
1777 trie->trans[ state ].check++;
1778 next_alloc += trie->uniquecharcount;
1779 }
1780 state = trie->trans[ state + charid ].next;
1781 } else {
1782 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1783 }
1784 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1785 }
a3621e74 1786 }
3dab1dad
YO
1787 accept_state = TRIE_NODENUM( state );
1788 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1789
1790 } /* end second pass */
1791
3dab1dad 1792 /* and now dump it out before we compress it */
2b8b4781
NC
1793 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1794 revcharmap,
1795 next_alloc, depth+1));
a3621e74 1796
a3621e74
YO
1797 {
1798 /*
1799 * Inplace compress the table.*
1800
1801 For sparse data sets the table constructed by the trie algorithm will
1802 be mostly 0/FAIL transitions or to put it another way mostly empty.
1803 (Note that leaf nodes will not contain any transitions.)
1804
1805 This algorithm compresses the tables by eliminating most such
1806 transitions, at the cost of a modest bit of extra work during lookup:
1807
1808 - Each states[] entry contains a .base field which indicates the
1809 index in the state[] array wheres its transition data is stored.
1810
1811 - If .base is 0 there are no valid transitions from that node.
1812
1813 - If .base is nonzero then charid is added to it to find an entry in
1814 the trans array.
1815
1816 -If trans[states[state].base+charid].check!=state then the
1817 transition is taken to be a 0/Fail transition. Thus if there are fail
1818 transitions at the front of the node then the .base offset will point
1819 somewhere inside the previous nodes data (or maybe even into a node
1820 even earlier), but the .check field determines if the transition is
1821 valid.
1822
786e8c11 1823 XXX - wrong maybe?
a3621e74
YO
1824 The following process inplace converts the table to the compressed
1825 table: We first do not compress the root node 1,and mark its all its
1826 .check pointers as 1 and set its .base pointer as 1 as well. This
1827 allows to do a DFA construction from the compressed table later, and
1828 ensures that any .base pointers we calculate later are greater than
1829 0.
1830
1831 - We set 'pos' to indicate the first entry of the second node.
1832
1833 - We then iterate over the columns of the node, finding the first and
1834 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1835 and set the .check pointers accordingly, and advance pos
1836 appropriately and repreat for the next node. Note that when we copy
1837 the next pointers we have to convert them from the original
1838 NODEIDX form to NODENUM form as the former is not valid post
1839 compression.
1840
1841 - If a node has no transitions used we mark its base as 0 and do not
1842 advance the pos pointer.
1843
1844 - If a node only has one transition we use a second pointer into the
1845 structure to fill in allocated fail transitions from other states.
1846 This pointer is independent of the main pointer and scans forward
1847 looking for null transitions that are allocated to a state. When it
1848 finds one it writes the single transition into the "hole". If the
786e8c11 1849 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1850
1851 - Once compressed we can Renew/realloc the structures to release the
1852 excess space.
1853
1854 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1855 specifically Fig 3.47 and the associated pseudocode.
1856
1857 demq
1858 */
a3b680e6 1859 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1860 U32 state, charid;
a3621e74 1861 U32 pos = 0, zp=0;
1e2e3d02 1862 trie->statecount = laststate;
a3621e74
YO
1863
1864 for ( state = 1 ; state < laststate ; state++ ) {
1865 U8 flag = 0;
a28509cc
AL
1866 const U32 stateidx = TRIE_NODEIDX( state );
1867 const U32 o_used = trie->trans[ stateidx ].check;
1868 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1869 trie->trans[ stateidx ].check = 0;
1870
1871 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1872 if ( flag || trie->trans[ stateidx + charid ].next ) {
1873 if ( trie->trans[ stateidx + charid ].next ) {
1874 if (o_used == 1) {
1875 for ( ; zp < pos ; zp++ ) {
1876 if ( ! trie->trans[ zp ].next ) {
1877 break;
1878 }
1879 }
1880 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1881 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ zp ].check = state;
1883 if ( ++zp > pos ) pos = zp;
1884 break;
1885 }
1886 used--;
1887 }
1888 if ( !flag ) {
1889 flag = 1;
1890 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1891 }
1892 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1893 trie->trans[ pos ].check = state;
1894 pos++;
1895 }
1896 }
1897 }
cc601c31 1898 trie->lasttrans = pos + 1;
c944940b
JH
1899 trie->states = (reg_trie_state *)
1900 PerlMemShared_realloc( trie->states, laststate
1901 * sizeof(reg_trie_state) );
a3621e74 1902 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1903 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1904 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1905 (int)depth * 2 + 2,"",
1906 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1907 (IV)next_alloc,
1908 (IV)pos,
a3621e74
YO
1909 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1910 );
1911
1912 } /* end table compress */
1913 }
1e2e3d02
YO
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1916 (int)depth * 2 + 2, "",
1917 (UV)trie->statecount,
1918 (UV)trie->lasttrans)
1919 );
cc601c31 1920 /* resize the trans array to remove unused space */
c944940b
JH
1921 trie->trans = (reg_trie_trans *)
1922 PerlMemShared_realloc( trie->trans, trie->lasttrans
1923 * sizeof(reg_trie_trans) );
a3621e74 1924
3dab1dad 1925 /* and now dump out the compressed format */
2b8b4781 1926 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1927
3dab1dad 1928 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1929 U8 nodetype =(U8)(flags & 0xFF);
1930 char *str=NULL;
786e8c11 1931
07be1b83 1932#ifdef DEBUGGING
e62cc96a 1933 regnode *optimize = NULL;
7122b237
YO
1934#ifdef RE_TRACK_PATTERN_OFFSETS
1935
b57a0404
JH
1936 U32 mjd_offset = 0;
1937 U32 mjd_nodelen = 0;
7122b237
YO
1938#endif /* RE_TRACK_PATTERN_OFFSETS */
1939#endif /* DEBUGGING */
a3621e74 1940 /*
3dab1dad
YO
1941 This means we convert either the first branch or the first Exact,
1942 depending on whether the thing following (in 'last') is a branch
1943 or not and whther first is the startbranch (ie is it a sub part of
1944 the alternation or is it the whole thing.)
1945 Assuming its a sub part we conver the EXACT otherwise we convert
1946 the whole branch sequence, including the first.
a3621e74 1947 */
3dab1dad 1948 /* Find the node we are going to overwrite */
7f69552c 1949 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1950 /* branch sub-chain */
3dab1dad 1951 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1952#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1953 DEBUG_r({
1954 mjd_offset= Node_Offset((convert));
1955 mjd_nodelen= Node_Length((convert));
1956 });
7122b237 1957#endif
7f69552c 1958 /* whole branch chain */
7122b237
YO
1959 }
1960#ifdef RE_TRACK_PATTERN_OFFSETS
1961 else {
7f69552c
YO
1962 DEBUG_r({
1963 const regnode *nop = NEXTOPER( convert );
1964 mjd_offset= Node_Offset((nop));
1965 mjd_nodelen= Node_Length((nop));
1966 });
07be1b83
YO
1967 }
1968 DEBUG_OPTIMISE_r(
1969 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1970 (int)depth * 2 + 2, "",
786e8c11 1971 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1972 );
7122b237 1973#endif
3dab1dad
YO
1974 /* But first we check to see if there is a common prefix we can
1975 split out as an EXACT and put in front of the TRIE node. */
1976 trie->startstate= 1;
55eed653 1977 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1978 U32 state;
1e2e3d02 1979 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1980 U32 ofs = 0;
8e11feef
RGS
1981 I32 idx = -1;
1982 U32 count = 0;
1983 const U32 base = trie->states[ state ].trans.base;
a3621e74 1984
3dab1dad 1985 if ( trie->states[state].wordnum )
8e11feef 1986 count = 1;
a3621e74 1987
8e11feef 1988 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1989 if ( ( base + ofs >= trie->uniquecharcount ) &&
1990 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1991 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1992 {
3dab1dad 1993 if ( ++count > 1 ) {
2b8b4781 1994 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1995 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1996 if ( state == 1 ) break;
3dab1dad
YO
1997 if ( count == 2 ) {
1998 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1999 DEBUG_OPTIMISE_r(
8e11feef
RGS
2000 PerlIO_printf(Perl_debug_log,
2001 "%*sNew Start State=%"UVuf" Class: [",
2002 (int)depth * 2 + 2, "",
786e8c11 2003 (UV)state));
be8e71aa 2004 if (idx >= 0) {
2b8b4781 2005 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2006 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2007
3dab1dad 2008 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2009 if ( folder )
2010 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2011 DEBUG_OPTIMISE_r(
f1f66076 2012 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2013 );
8e11feef
RGS
2014 }
2015 }
2016 TRIE_BITMAP_SET(trie,*ch);
2017 if ( folder )
2018 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2019 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2020 }
2021 idx = ofs;
2022 }
3dab1dad
YO
2023 }
2024 if ( count == 1 ) {
2b8b4781 2025 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2026 STRLEN len;
2027 char *ch = SvPV( *tmp, len );
de734bd5
A
2028 DEBUG_OPTIMISE_r({
2029 SV *sv=sv_newmortal();
8e11feef
RGS
2030 PerlIO_printf( Perl_debug_log,
2031 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2032 (int)depth * 2 + 2, "",
de734bd5
A
2033 (UV)state, (UV)idx,
2034 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2035 PL_colors[0], PL_colors[1],
2036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2037 PERL_PV_ESCAPE_FIRSTCHAR
2038 )
2039 );
2040 });
3dab1dad
YO
2041 if ( state==1 ) {
2042 OP( convert ) = nodetype;
2043 str=STRING(convert);
2044 STR_LEN(convert)=0;
2045 }
c490c714
YO
2046 STR_LEN(convert) += len;
2047 while (len--)
de734bd5 2048 *str++ = *ch++;
8e11feef 2049 } else {
f9049ba1 2050#ifdef DEBUGGING
8e11feef
RGS
2051 if (state>1)
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2053#endif
8e11feef
RGS
2054 break;
2055 }
2056 }
3dab1dad 2057 if (str) {
8e11feef 2058 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2059 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2060 trie->startstate = state;
07be1b83
YO
2061 trie->minlen -= (state - 1);
2062 trie->maxlen -= (state - 1);
33809eae
JH
2063#ifdef DEBUGGING
2064 /* At least the UNICOS C compiler choked on this
2065 * being argument to DEBUG_r(), so let's just have
2066 * it right here. */
2067 if (
2068#ifdef PERL_EXT_RE_BUILD
2069 1
2070#else
2071 DEBUG_r_TEST
2072#endif
2073 ) {
2074 regnode *fix = convert;
2075 U32 word = trie->wordcount;
2076 mjd_nodelen++;
2077 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2078 while( ++fix < n ) {
2079 Set_Node_Offset_Length(fix, 0, 0);
2080 }
2081 while (word--) {
2082 SV ** const tmp = av_fetch( trie_words, word, 0 );
2083 if (tmp) {
2084 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2086 else
2087 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2088 }
2089 }
2090 }
2091#endif
8e11feef
RGS
2092 if (trie->maxlen) {
2093 convert = n;
2094 } else {
3dab1dad 2095 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2096 DEBUG_r(optimize= n);
3dab1dad
YO
2097 }
2098 }
2099 }
a5ca303d
YO
2100 if (!jumper)
2101 jumper = last;
3dab1dad 2102 if ( trie->maxlen ) {
8e11feef
RGS
2103 NEXT_OFF( convert ) = (U16)(tail - convert);
2104 ARG_SET( convert, data_slot );
786e8c11
YO
2105 /* Store the offset to the first unabsorbed branch in
2106 jump[0], which is otherwise unused by the jump logic.
2107 We use this when dumping a trie and during optimisation. */
2108 if (trie->jump)
7f69552c 2109 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2110
786e8c11
YO
2111 /* XXXX */
2112 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2113 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2114 {
2115 OP( convert ) = TRIEC;
2116 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2117 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2118 trie->bitmap= NULL;
2119 } else
2120 OP( convert ) = TRIE;
a3621e74 2121
3dab1dad
YO
2122 /* store the type in the flags */
2123 convert->flags = nodetype;
a5ca303d
YO
2124 DEBUG_r({
2125 optimize = convert
2126 + NODE_STEP_REGNODE
2127 + regarglen[ OP( convert ) ];
2128 });
2129 /* XXX We really should free up the resource in trie now,
2130 as we won't use them - (which resources?) dmq */
3dab1dad 2131 }
a3621e74 2132 /* needed for dumping*/
e62cc96a 2133 DEBUG_r(if (optimize) {
07be1b83 2134 regnode *opt = convert;
bcdf7404 2135
e62cc96a 2136 while ( ++opt < optimize) {
07be1b83
YO
2137 Set_Node_Offset_Length(opt,0,0);
2138 }
786e8c11
YO
2139 /*
2140 Try to clean up some of the debris left after the
2141 optimisation.
a3621e74 2142 */
786e8c11 2143 while( optimize < jumper ) {
07be1b83 2144 mjd_nodelen += Node_Length((optimize));
a3621e74 2145 OP( optimize ) = OPTIMIZED;
07be1b83 2146 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2147 optimize++;
2148 }
07be1b83 2149 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2150 });
2151 } /* end node insert */
55eed653 2152 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2153#ifdef DEBUGGING
2154 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2155 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2156#else
2157 SvREFCNT_dec(revcharmap);
07be1b83 2158#endif
786e8c11
YO
2159 return trie->jump
2160 ? MADE_JUMP_TRIE
2161 : trie->startstate>1
2162 ? MADE_EXACT_TRIE
2163 : MADE_TRIE;
2164}
2165
2166STATIC void
2167S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2168{
2169/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2170
2171 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2172 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2173 ISBN 0-201-10088-6
2174
2175 We find the fail state for each state in the trie, this state is the longest proper
2176 suffix of the current states 'word' that is also a proper prefix of another word in our
2177 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2178 the DFA not to have to restart after its tried and failed a word at a given point, it
2179 simply continues as though it had been matching the other word in the first place.
2180 Consider
2181 'abcdgu'=~/abcdefg|cdgu/
2182 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2183 fail, which would bring use to the state representing 'd' in the second word where we would
2184 try 'g' and succeed, prodceding to match 'cdgu'.
2185 */
2186 /* add a fail transition */
3251b653
NC
2187 const U32 trie_offset = ARG(source);
2188 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2189 U32 *q;
2190 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2191 const U32 numstates = trie->statecount;
786e8c11
YO
2192 const U32 ubound = trie->lasttrans + ucharcount;
2193 U32 q_read = 0;
2194 U32 q_write = 0;
2195 U32 charid;
2196 U32 base = trie->states[ 1 ].trans.base;
2197 U32 *fail;
2198 reg_ac_data *aho;
2199 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2200 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2201
2202 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2203#ifndef DEBUGGING
2204 PERL_UNUSED_ARG(depth);
2205#endif
2206
2207
2208 ARG_SET( stclass, data_slot );
c944940b 2209 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2210 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2211 aho->trie=trie_offset;
446bd890
NC
2212 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2213 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2214 Newxz( q, numstates, U32);
c944940b 2215 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2216 aho->refcount = 1;
2217 fail = aho->fail;
2218 /* initialize fail[0..1] to be 1 so that we always have
2219 a valid final fail state */
2220 fail[ 0 ] = fail[ 1 ] = 1;
2221
2222 for ( charid = 0; charid < ucharcount ; charid++ ) {
2223 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2224 if ( newstate ) {
2225 q[ q_write ] = newstate;
2226 /* set to point at the root */
2227 fail[ q[ q_write++ ] ]=1;
2228 }
2229 }
2230 while ( q_read < q_write) {
2231 const U32 cur = q[ q_read++ % numstates ];
2232 base = trie->states[ cur ].trans.base;
2233
2234 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2235 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2236 if (ch_state) {
2237 U32 fail_state = cur;
2238 U32 fail_base;
2239 do {
2240 fail_state = fail[ fail_state ];
2241 fail_base = aho->states[ fail_state ].trans.base;
2242 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2243
2244 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2245 fail[ ch_state ] = fail_state;
2246 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2247 {
2248 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2249 }
2250 q[ q_write++ % numstates] = ch_state;
2251 }
2252 }
2253 }
2254 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2255 when we fail in state 1, this allows us to use the
2256 charclass scan to find a valid start char. This is based on the principle
2257 that theres a good chance the string being searched contains lots of stuff
2258 that cant be a start char.
2259 */
2260 fail[ 0 ] = fail[ 1 ] = 0;
2261 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2262 PerlIO_printf(Perl_debug_log,
2263 "%*sStclass Failtable (%"UVuf" states): 0",
2264 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2265 );
786e8c11
YO
2266 for( q_read=1; q_read<numstates; q_read++ ) {
2267 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2268 }
2269 PerlIO_printf(Perl_debug_log, "\n");
2270 });
2271 Safefree(q);
2272 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2273}
2274
786e8c11 2275
a3621e74 2276/*
5d1c421c
JH
2277 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2278 * These need to be revisited when a newer toolchain becomes available.
2279 */
2280#if defined(__sparc64__) && defined(__GNUC__)
2281# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2282# undef SPARC64_GCC_WORKAROUND
2283# define SPARC64_GCC_WORKAROUND 1
2284# endif
2285#endif
2286
07be1b83 2287#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2288 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2289 SV * const mysv=sv_newmortal(); \
2290 regnode *Next = regnext(scan); \
2291 regprop(RExC_rx, mysv, scan); \
7f69552c 2292 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2293 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2294 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2295 }});
07be1b83 2296
1de06328
YO
2297
2298
2299
2300
07be1b83
YO
2301#define JOIN_EXACT(scan,min,flags) \
2302 if (PL_regkind[OP(scan)] == EXACT) \
2303 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2304
be8e71aa 2305STATIC U32
07be1b83
YO
2306S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2307 /* Merge several consecutive EXACTish nodes into one. */
2308 regnode *n = regnext(scan);
2309 U32 stringok = 1;
2310 regnode *next = scan + NODE_SZ_STR(scan);
2311 U32 merged = 0;
2312 U32 stopnow = 0;
2313#ifdef DEBUGGING
2314 regnode *stop = scan;
72f13be8 2315 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2316#else
d47053eb
RGS
2317 PERL_UNUSED_ARG(depth);
2318#endif
7918f24d
NC
2319
2320 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2321#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2322 PERL_UNUSED_ARG(flags);
2323 PERL_UNUSED_ARG(val);
07be1b83 2324#endif
07be1b83
YO
2325 DEBUG_PEEP("join",scan,depth);
2326
2327 /* Skip NOTHING, merge EXACT*. */
2328 while (n &&
2329 ( PL_regkind[OP(n)] == NOTHING ||
2330 (stringok && (OP(n) == OP(scan))))
2331 && NEXT_OFF(n)
2332 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2333
2334 if (OP(n) == TAIL || n > next)
2335 stringok = 0;
2336 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2337 DEBUG_PEEP("skip:",n,depth);
2338 NEXT_OFF(scan) += NEXT_OFF(n);
2339 next = n + NODE_STEP_REGNODE;
2340#ifdef DEBUGGING
2341 if (stringok)
2342 stop = n;
2343#endif
2344 n = regnext(n);
2345 }
2346 else if (stringok) {
786e8c11 2347 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2348 regnode * const nnext = regnext(n);
2349
2350 DEBUG_PEEP("merg",n,depth);
2351
2352 merged++;
2353 if (oldl + STR_LEN(n) > U8_MAX)
2354 break;
2355 NEXT_OFF(scan) += NEXT_OFF(n);
2356 STR_LEN(scan) += STR_LEN(n);
2357 next = n + NODE_SZ_STR(n);
2358 /* Now we can overwrite *n : */
2359 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2360#ifdef DEBUGGING
2361 stop = next - 1;
2362#endif
2363 n = nnext;
2364 if (stopnow) break;
2365 }
2366
d47053eb
RGS
2367#ifdef EXPERIMENTAL_INPLACESCAN
2368 if (flags && !NEXT_OFF(n)) {
2369 DEBUG_PEEP("atch", val, depth);
2370 if (reg_off_by_arg[OP(n)]) {
2371 ARG_SET(n, val - n);
2372 }
2373 else {
2374 NEXT_OFF(n) = val - n;
2375 }
2376 stopnow = 1;
2377 }
07be1b83
YO
2378#endif
2379 }
2380
2381 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2382 /*
2383 Two problematic code points in Unicode casefolding of EXACT nodes:
2384
2385 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2386 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2387
2388 which casefold to
2389
2390 Unicode UTF-8
2391
2392 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2393 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2394
2395 This means that in case-insensitive matching (or "loose matching",
2396 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2397 length of the above casefolded versions) can match a target string
2398 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2399 This would rather mess up the minimum length computation.
2400
2401 What we'll do is to look for the tail four bytes, and then peek
2402 at the preceding two bytes to see whether we need to decrease
2403 the minimum length by four (six minus two).
2404
2405 Thanks to the design of UTF-8, there cannot be false matches:
2406 A sequence of valid UTF-8 bytes cannot be a subsequence of
2407 another valid sequence of UTF-8 bytes.
2408
2409 */
2410 char * const s0 = STRING(scan), *s, *t;
2411 char * const s1 = s0 + STR_LEN(scan) - 1;
2412 char * const s2 = s1 - 4;
e294cc5d
JH
2413#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2414 const char t0[] = "\xaf\x49\xaf\x42";
2415#else
07be1b83 2416 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2417#endif
07be1b83
YO
2418 const char * const t1 = t0 + 3;
2419
2420 for (s = s0 + 2;
2421 s < s2 && (t = ninstr(s, s1, t0, t1));
2422 s = t + 4) {
e294cc5d
JH
2423#ifdef EBCDIC
2424 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2425 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2426#else
07be1b83
YO
2427 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2428 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2429#endif
07be1b83
YO
2430 *min -= 4;
2431 }
2432 }
2433
2434#ifdef DEBUGGING
2435 /* Allow dumping */
2436 n = scan + NODE_SZ_STR(scan);
2437 while (n <= stop) {
2438 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2439 OP(n) = OPTIMIZED;
2440 NEXT_OFF(n) = 0;
2441 }
2442 n++;
2443 }
2444#endif
2445 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2446 return stopnow;
2447}
2448
653099ff
GS
2449/* REx optimizer. Converts nodes into quickier variants "in place".
2450 Finds fixed substrings. */
2451
a0288114 2452/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2453 to the position after last scanned or to NULL. */
2454
40d049e4
YO
2455#define INIT_AND_WITHP \
2456 assert(!and_withp); \
2457 Newx(and_withp,1,struct regnode_charclass_class); \
2458 SAVEFREEPV(and_withp)
07be1b83 2459
b515a41d
YO
2460/* this is a chain of data about sub patterns we are processing that
2461 need to be handled seperately/specially in study_chunk. Its so
2462 we can simulate recursion without losing state. */
2463struct scan_frame;
2464typedef struct scan_frame {
2465 regnode *last; /* last node to process in this frame */
2466 regnode *next; /* next node to process when last is reached */
2467 struct scan_frame *prev; /*previous frame*/
2468 I32 stop; /* what stopparen do we use */
2469} scan_frame;
2470
304ee84b
YO
2471
2472#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2473
e1d1eefb
YO
2474#define CASE_SYNST_FNC(nAmE) \
2475case nAmE: \
2476 if (flags & SCF_DO_STCLASS_AND) { \
2477 for (value = 0; value < 256; value++) \
2478 if (!is_ ## nAmE ## _cp(value)) \
2479 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2480 } \
2481 else { \
2482 for (value = 0; value < 256; value++) \
2483 if (is_ ## nAmE ## _cp(value)) \
2484 ANYOF_BITMAP_SET(data->start_class, value); \
2485 } \
2486 break; \
2487case N ## nAmE: \
2488 if (flags & SCF_DO_STCLASS_AND) { \
2489 for (value = 0; value < 256; value++) \
2490 if (is_ ## nAmE ## _cp(value)) \
2491 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2492 } \
2493 else { \
2494 for (value = 0; value < 256; value++) \
2495 if (!is_ ## nAmE ## _cp(value)) \
2496 ANYOF_BITMAP_SET(data->start_class, value); \
2497 } \
2498 break
2499
2500
2501
76e3520e 2502STATIC I32
40d049e4 2503S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2504 I32 *minlenp, I32 *deltap,
40d049e4
YO
2505 regnode *last,
2506 scan_data_t *data,
2507 I32 stopparen,
2508 U8* recursed,
2509 struct regnode_charclass_class *and_withp,
2510 U32 flags, U32 depth)
c277df42
IZ
2511 /* scanp: Start here (read-write). */
2512 /* deltap: Write maxlen-minlen here. */
2513 /* last: Stop before this one. */
40d049e4
YO
2514 /* data: string data about the pattern */
2515 /* stopparen: treat close N as END */
2516 /* recursed: which subroutines have we recursed into */
2517 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2518{
97aff369 2519 dVAR;
c277df42
IZ
2520 I32 min = 0, pars = 0, code;
2521 regnode *scan = *scanp, *next;
2522 I32 delta = 0;
2523 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2524 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2525 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2526 scan_data_t data_fake;
a3621e74 2527 SV *re_trie_maxbuff = NULL;
786e8c11 2528 regnode *first_non_open = scan;
e2e6a0f1 2529 I32 stopmin = I32_MAX;
8aa23a47 2530 scan_frame *frame = NULL;
a3621e74 2531 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2532
7918f24d
NC
2533 PERL_ARGS_ASSERT_STUDY_CHUNK;
2534
13a24bad 2535#ifdef DEBUGGING
40d049e4 2536 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2537#endif
40d049e4 2538
786e8c11 2539 if ( depth == 0 ) {
40d049e4 2540 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2541 first_non_open=regnext(first_non_open);
2542 }
2543
b81d288d 2544
8aa23a47
YO
2545 fake_study_recurse:
2546 while ( scan && OP(scan) != END && scan < last ){
2547 /* Peephole optimizer: */
304ee84b 2548 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2549 DEBUG_PEEP("Peep",scan,depth);
2550 JOIN_EXACT(scan,&min,0);
2551
2552 /* Follow the next-chain of the current node and optimize
2553 away all the NOTHINGs from it. */
2554 if (OP(scan) != CURLYX) {
2555 const int max = (reg_off_by_arg[OP(scan)]
2556 ? I32_MAX
2557 /* I32 may be smaller than U16 on CRAYs! */
2558 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2559 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2560 int noff;
2561 regnode *n = scan;
2562
2563 /* Skip NOTHING and LONGJMP. */
2564 while ((n = regnext(n))
2565 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2566 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2567 && off + noff < max)
2568 off += noff;
2569 if (reg_off_by_arg[OP(scan)])
2570 ARG(scan) = off;
2571 else
2572 NEXT_OFF(scan) = off;
2573 }
a3621e74 2574
c277df42 2575
8aa23a47
YO
2576
2577 /* The principal pseudo-switch. Cannot be a switch, since we
2578 look into several different things. */
2579 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2580 || OP(scan) == IFTHEN) {
2581 next = regnext(scan);
2582 code = OP(scan);
2583 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2584
2585 if (OP(next) == code || code == IFTHEN) {
2586 /* NOTE - There is similar code to this block below for handling
2587 TRIE nodes on a re-study. If you change stuff here check there
2588 too. */
2589 I32 max1 = 0, min1 = I32_MAX, num = 0;
2590 struct regnode_charclass_class accum;
2591 regnode * const startbranch=scan;
2592
2593 if (flags & SCF_DO_SUBSTR)
304ee84b 2594 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2595 if (flags & SCF_DO_STCLASS)
2596 cl_init_zero(pRExC_state, &accum);
2597
2598 while (OP(scan) == code) {
2599 I32 deltanext, minnext, f = 0, fake;
2600 struct regnode_charclass_class this_class;
2601
2602 num++;
2603 data_fake.flags = 0;
2604 if (data) {
2605 data_fake.whilem_c = data->whilem_c;
2606 data_fake.last_closep = data->last_closep;
2607 }
2608 else
2609 data_fake.last_closep = &fake;
58e23c8d
YO
2610
2611 data_fake.pos_delta = delta;
8aa23a47
YO
2612 next = regnext(scan);
2613 scan = NEXTOPER(scan);
2614 if (code != BRANCH)
c277df42 2615 scan = NEXTOPER(scan);
8aa23a47
YO
2616 if (flags & SCF_DO_STCLASS) {
2617 cl_init(pRExC_state, &this_class);
2618 data_fake.start_class = &this_class;
2619 f = SCF_DO_STCLASS_AND;
58e23c8d 2620 }
8aa23a47
YO
2621 if (flags & SCF_WHILEM_VISITED_POS)
2622 f |= SCF_WHILEM_VISITED_POS;
2623
2624 /* we suppose the run is continuous, last=next...*/
2625 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2626 next, &data_fake,
2627 stopparen, recursed, NULL, f,depth+1);
2628 if (min1 > minnext)
2629 min1 = minnext;
2630 if (max1 < minnext + deltanext)
2631 max1 = minnext + deltanext;
2632 if (deltanext == I32_MAX)
2633 is_inf = is_inf_internal = 1;
2634 scan = next;
2635 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2636 pars++;
2637 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2638 if ( stopmin > minnext)
2639 stopmin = min + min1;
2640 flags &= ~SCF_DO_SUBSTR;
2641 if (data)
2642 data->flags |= SCF_SEEN_ACCEPT;
2643 }
2644 if (data) {
2645 if (data_fake.flags & SF_HAS_EVAL)
2646 data->flags |= SF_HAS_EVAL;
2647 data->whilem_c = data_fake.whilem_c;
3dab1dad 2648 }
8aa23a47
YO
2649 if (flags & SCF_DO_STCLASS)
2650 cl_or(pRExC_state, &accum, &this_class);
2651 }
2652 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2653 min1 = 0;
2654 if (flags & SCF_DO_SUBSTR) {
2655 data->pos_min += min1;
2656 data->pos_delta += max1 - min1;
2657 if (max1 != min1 || is_inf)
2658 data->longest = &(data->longest_float);
2659 }
2660 min += min1;
2661 delta += max1 - min1;
2662 if (flags & SCF_DO_STCLASS_OR) {
2663 cl_or(pRExC_state, data->start_class, &accum);
2664 if (min1) {
2665 cl_and(data->start_class, and_withp);
2666 flags &= ~SCF_DO_STCLASS;
653099ff 2667 }
8aa23a47
YO
2668 }
2669 else if (flags & SCF_DO_STCLASS_AND) {
2670 if (min1) {
2671 cl_and(data->start_class, &accum);
2672 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2673 }
8aa23a47
YO
2674 else {
2675 /* Switch to OR mode: cache the old value of
2676 * data->start_class */
2677 INIT_AND_WITHP;
2678 StructCopy(data->start_class, and_withp,
2679 struct regnode_charclass_class);
2680 flags &= ~SCF_DO_STCLASS_AND;
2681 StructCopy(&accum, data->start_class,
2682 struct regnode_charclass_class);
2683 flags |= SCF_DO_STCLASS_OR;
2684 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2685 }
8aa23a47 2686 }
a3621e74 2687
8aa23a47
YO
2688 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2689 /* demq.
a3621e74 2690
8aa23a47
YO
2691 Assuming this was/is a branch we are dealing with: 'scan' now
2692 points at the item that follows the branch sequence, whatever
2693 it is. We now start at the beginning of the sequence and look
2694 for subsequences of
a3621e74 2695
8aa23a47
YO
2696 BRANCH->EXACT=>x1
2697 BRANCH->EXACT=>x2
2698 tail
a3621e74 2699
8aa23a47 2700 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2701
8aa23a47
YO
2702 If we can find such a subseqence we need to turn the first
2703 element into a trie and then add the subsequent branch exact
2704 strings to the trie.
a3621e74 2705
8aa23a47 2706 We have two cases
a3621e74 2707
8aa23a47 2708 1. patterns where the whole set of branch can be converted.
a3621e74 2709
8aa23a47 2710 2. patterns where only a subset can be converted.
a3621e74 2711
8aa23a47
YO
2712 In case 1 we can replace the whole set with a single regop
2713 for the trie. In case 2 we need to keep the start and end
2714 branchs so
a3621e74 2715
8aa23a47
YO
2716 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2717 becomes BRANCH TRIE; BRANCH X;
786e8c11 2718
8aa23a47
YO
2719 There is an additional case, that being where there is a
2720 common prefix, which gets split out into an EXACT like node
2721 preceding the TRIE node.
a3621e74 2722
8aa23a47
YO
2723 If x(1..n)==tail then we can do a simple trie, if not we make
2724 a "jump" trie, such that when we match the appropriate word
2725 we "jump" to the appopriate tail node. Essentailly we turn
2726 a nested if into a case structure of sorts.
b515a41d 2727
8aa23a47
YO
2728 */
2729
2730 int made=0;
2731 if (!re_trie_maxbuff) {
2732 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2733 if (!SvIOK(re_trie_maxbuff))
2734 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2735 }
2736 if ( SvIV(re_trie_maxbuff)>=0 ) {
2737 regnode *cur;
2738 regnode *first = (regnode *)NULL;
2739 regnode *last = (regnode *)NULL;
2740 regnode *tail = scan;
2741 U8 optype = 0;
2742 U32 count=0;
a3621e74
YO
2743
2744#ifdef DEBUGGING
8aa23a47 2745 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2746#endif
8aa23a47
YO
2747 /* var tail is used because there may be a TAIL
2748 regop in the way. Ie, the exacts will point to the
2749 thing following the TAIL, but the last branch will
2750 point at the TAIL. So we advance tail. If we
2751 have nested (?:) we may have to move through several
2752 tails.
2753 */
2754
2755 while ( OP( tail ) == TAIL ) {
2756 /* this is the TAIL generated by (?:) */
2757 tail = regnext( tail );
2758 }
a3621e74 2759
8aa23a47
YO
2760
2761 DEBUG_OPTIMISE_r({
2762 regprop(RExC_rx, mysv, tail );
2763 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2764 (int)depth * 2 + 2, "",
2765 "Looking for TRIE'able sequences. Tail node is: ",
2766 SvPV_nolen_const( mysv )
2767 );
2768 });
2769
2770 /*
2771
2772 step through the branches, cur represents each
2773 branch, noper is the first thing to be matched
2774 as part of that branch and noper_next is the
2775 regnext() of that node. if noper is an EXACT
2776 and noper_next is the same as scan (our current
2777 position in the regex) then the EXACT branch is
2778 a possible optimization target. Once we have
2779 two or more consequetive such branches we can
2780 create a trie of the EXACT's contents and stich
2781 it in place. If the sequence represents all of
2782 the branches we eliminate the whole thing and
2783 replace it with a single TRIE. If it is a
2784 subsequence then we need to stitch it in. This
2785 means the first branch has to remain, and needs
2786 to be repointed at the item on the branch chain
2787 following the last branch optimized. This could
2788 be either a BRANCH, in which case the
2789 subsequence is internal, or it could be the
2790 item following the branch sequence in which
2791 case the subsequence is at the end.
2792
2793 */
2794
2795 /* dont use tail as the end marker for this traverse */
2796 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2797 regnode * const noper = NEXTOPER( cur );
b515a41d 2798#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2799 regnode * const noper_next = regnext( noper );
b515a41d
YO
2800#endif
2801
8aa23a47
YO
2802 DEBUG_OPTIMISE_r({
2803 regprop(RExC_rx, mysv, cur);
2804 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2805 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2806
2807 regprop(RExC_rx, mysv, noper);
2808 PerlIO_printf( Perl_debug_log, " -> %s",
2809 SvPV_nolen_const(mysv));
2810
2811 if ( noper_next ) {
2812 regprop(RExC_rx, mysv, noper_next );
2813 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2814 SvPV_nolen_const(mysv));
2815 }
2816 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2817 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2818 });
2819 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2820 : PL_regkind[ OP( noper ) ] == EXACT )
2821 || OP(noper) == NOTHING )
786e8c11 2822#ifdef NOJUMPTRIE
8aa23a47 2823 && noper_next == tail
786e8c11 2824#endif
8aa23a47
YO
2825 && count < U16_MAX)
2826 {
2827 count++;
2828 if ( !first || optype == NOTHING ) {
2829 if (!first) first = cur;
2830 optype = OP( noper );
2831 } else {
2832 last = cur;
2833 }
2834 } else {
a0a388a1
YO
2835/*
2836 Currently we assume that the trie can handle unicode and ascii
2837 matches fold cased matches. If this proves true then the following
2838 define will prevent tries in this situation.
2839
2840 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2841*/
2842#define TRIE_TYPE_IS_SAFE 1
2843 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2844 make_trie( pRExC_state,
2845 startbranch, first, cur, tail, count,
2846 optype, depth+1 );
2847 }
2848 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2849#ifdef NOJUMPTRIE
8aa23a47 2850 && noper_next == tail
786e8c11 2851#endif
8aa23a47
YO
2852 ){
2853 count = 1;
2854 first = cur;
2855 optype = OP( noper );
2856 } else {
2857 count = 0;
2858 first = NULL;
2859 optype = 0;
2860 }
2861 last = NULL;
2862 }
2863 }
2864 DEBUG_OPTIMISE_r({
2865 regprop(RExC_rx, mysv, cur);
2866 PerlIO_printf( Perl_debug_log,
2867 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2868 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2869
2870 });
a0a388a1
YO
2871
2872 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2873 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2874#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2875 if ( ((made == MADE_EXACT_TRIE &&
2876 startbranch == first)
2877 || ( first_non_open == first )) &&
2878 depth==0 ) {
2879 flags |= SCF_TRIE_RESTUDY;
2880 if ( startbranch == first
2881 && scan == tail )
2882 {
2883 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2884 }
2885 }
3dab1dad 2886#endif
8aa23a47
YO
2887 }
2888 }
2889
2890 } /* do trie */
2891
653099ff 2892 }
8aa23a47
YO
2893 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2894 scan = NEXTOPER(NEXTOPER(scan));
2895 } else /* single branch is optimized. */
2896 scan = NEXTOPER(scan);
2897 continue;
2898 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2899 scan_frame *newframe = NULL;
2900 I32 paren;
2901 regnode *start;
2902 regnode *end;
2903
2904 if (OP(scan) != SUSPEND) {
2905 /* set the pointer */
2906 if (OP(scan) == GOSUB) {
2907 paren = ARG(scan);
2908 RExC_recurse[ARG2L(scan)] = scan;
2909 start = RExC_open_parens[paren-1];
2910 end = RExC_close_parens[paren-1];
2911 } else {
2912 paren = 0;
f8fc2ecf 2913 start = RExC_rxi->program + 1;
8aa23a47
YO
2914 end = RExC_opend;
2915 }
2916 if (!recursed) {
2917 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2918 SAVEFREEPV(recursed);
2919 }
2920 if (!PAREN_TEST(recursed,paren+1)) {
2921 PAREN_SET(recursed,paren+1);
2922 Newx(newframe,1,scan_frame);
2923 } else {
2924 if (flags & SCF_DO_SUBSTR) {
304ee84b 2925 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2926 data->longest = &(data->longest_float);
2927 }
2928 is_inf = is_inf_internal = 1;
2929 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2930 cl_anything(pRExC_state, data->start_class);
2931 flags &= ~SCF_DO_STCLASS;
2932 }
2933 } else {
2934 Newx(newframe,1,scan_frame);
2935 paren = stopparen;
2936 start = scan+2;
2937 end = regnext(scan);
2938 }
2939 if (newframe) {
2940 assert(start);
2941 assert(end);
2942 SAVEFREEPV(newframe);
2943 newframe->next = regnext(scan);
2944 newframe->last = last;
2945 newframe->stop = stopparen;
2946 newframe->prev = frame;
2947
2948 frame = newframe;
2949 scan = start;
2950 stopparen = paren;
2951 last = end;
2952
2953 continue;
2954 }
2955 }
2956 else if (OP(scan) == EXACT) {
2957 I32 l = STR_LEN(scan);
2958 UV uc;
2959 if (UTF) {
2960 const U8 * const s = (U8*)STRING(scan);
2961 l = utf8_length(s, s + l);
2962 uc = utf8_to_uvchr(s, NULL);
2963 } else {
2964 uc = *((U8*)STRING(scan));
2965 }
2966 min += l;
2967 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2968 /* The code below prefers earlier match for fixed
2969 offset, later match for variable offset. */
2970 if (data->last_end == -1) { /* Update the start info. */
2971 data->last_start_min = data->pos_min;
2972 data->last_start_max = is_inf
2973 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2974 }
8aa23a47
YO
2975 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2976 if (UTF)
2977 SvUTF8_on(data->last_found);
2978 {
2979 SV * const sv = data->last_found;
2980 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2981 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2982 if (mg && mg->mg_len >= 0)
2983 mg->mg_len += utf8_length((U8*)STRING(scan),
2984 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2985 }
8aa23a47
YO
2986 data->last_end = data->pos_min + l;
2987 data->pos_min += l; /* As in the first entry. */
2988 data->flags &= ~SF_BEFORE_EOL;
2989 }
2990 if (flags & SCF_DO_STCLASS_AND) {
2991 /* Check whether it is compatible with what we know already! */
2992 int compat = 1;
2993
2994 if (uc >= 0x100 ||
2995 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2996 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2997 && (!(data->start_class->flags & ANYOF_FOLD)
2998 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2999 )
3000 compat = 0;
3001 ANYOF_CLASS_ZERO(data->start_class);
3002 ANYOF_BITMAP_ZERO(data->start_class);
3003 if (compat)
3004 ANYOF_BITMAP_SET(data->start_class, uc);
3005 data->start_class->flags &= ~ANYOF_EOS;
3006 if (uc < 0x100)
3007 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3008 }
3009 else if (flags & SCF_DO_STCLASS_OR) {
3010 /* false positive possible if the class is case-folded */
3011 if (uc < 0x100)
3012 ANYOF_BITMAP_SET(data->start_class, uc);
3013 else
3014 data->start_class->flags |= ANYOF_UNICODE_ALL;
3015 data->start_class->flags &= ~ANYOF_EOS;
3016 cl_and(data->start_class, and_withp);
3017 }
3018 flags &= ~SCF_DO_STCLASS;
3019 }
3020 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3021 I32 l = STR_LEN(scan);
3022 UV uc = *((U8*)STRING(scan));
3023
3024 /* Search for fixed substrings supports EXACT only. */
3025 if (flags & SCF_DO_SUBSTR) {
3026 assert(data);
304ee84b 3027 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3028 }
3029 if (UTF) {
3030 const U8 * const s = (U8 *)STRING(scan);
3031 l = utf8_length(s, s + l);
3032 uc = utf8_to_uvchr(s, NULL);
3033 }
3034 min += l;
3035 if (flags & SCF_DO_SUBSTR)
3036 data->pos_min += l;
3037 if (flags & SCF_DO_STCLASS_AND) {
3038 /* Check whether it is compatible with what we know already! */
3039 int compat = 1;
3040
3041 if (uc >= 0x100 ||
3042 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3043 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3044 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3045 compat = 0;
3046 ANYOF_CLASS_ZERO(data->start_class);
3047 ANYOF_BITMAP_ZERO(data->start_class);
3048 if (compat) {
3049 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3050 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3051 data->start_class->flags |= ANYOF_FOLD;
3052 if (OP(scan) == EXACTFL)
3053 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3054 }
8aa23a47
YO
3055 }
3056 else if (flags & SCF_DO_STCLASS_OR) {
3057 if (data->start_class->flags & ANYOF_FOLD) {
3058 /* false positive possible if the class is case-folded.
3059 Assume that the locale settings are the same... */
1aa99e6b
IH
3060 if (uc < 0x100)
3061 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3062 data->start_class->flags &= ~ANYOF_EOS;
3063 }
8aa23a47 3064 cl_and(data->start_class, and_withp);
653099ff 3065 }
8aa23a47
YO
3066 flags &= ~SCF_DO_STCLASS;
3067 }
3068 else if (strchr((const char*)PL_varies,OP(scan))) {
3069 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3070 I32 f = flags, pos_before = 0;
3071 regnode * const oscan = scan;
3072 struct regnode_charclass_class this_class;
3073 struct regnode_charclass_class *oclass = NULL;
3074 I32 next_is_eval = 0;
3075
3076 switch (PL_regkind[OP(scan)]) {
3077 case WHILEM: /* End of (?:...)* . */
3078 scan = NEXTOPER(scan);
3079 goto finish;
3080 case PLUS:
3081 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3082 next = NEXTOPER(scan);
3083 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3084 mincount = 1;
3085 maxcount = REG_INFTY;
3086 next = regnext(scan);
3087 scan = NEXTOPER(scan);
3088 goto do_curly;
3089 }
3090 }
3091 if (flags & SCF_DO_SUBSTR)
3092 data->pos_min++;
3093 min++;
3094 /* Fall through. */
3095 case STAR:
3096 if (flags & SCF_DO_STCLASS) {
3097 mincount = 0;
3098 maxcount = REG_INFTY;
3099 next = regnext(scan);
3100 scan = NEXTOPER(scan);
3101 goto do_curly;
3102 }
3103 is_inf = is_inf_internal = 1;
3104 scan = regnext(scan);
c277df42 3105 if (flags & SCF_DO_SUBSTR) {
304ee84b 3106 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3107 data->longest = &(data->longest_float);
c277df42 3108 }
8aa23a47
YO
3109 goto optimize_curly_tail;
3110 case CURLY:
3111 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3112 && (scan->flags == stopparen))
3113 {
3114 mincount = 1;
3115 maxcount = 1;
3116 } else {
3117 mincount = ARG1(scan);
3118 maxcount = ARG2(scan);
653099ff 3119 }
8aa23a47
YO
3120 next = regnext(scan);
3121 if (OP(scan) == CURLYX) {
3122 I32 lp = (data ? *(data->last_closep) : 0);
3123 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3124 }
8aa23a47
YO
3125 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3126 next_is_eval = (OP(scan) == EVAL);
3127 do_curly:
3128 if (flags & SCF_DO_SUBSTR) {
304ee84b 3129 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3130 pos_before = data->pos_min;
b45f050a 3131 }
8aa23a47
YO
3132 if (data) {
3133 fl = data->flags;
3134 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3135 if (is_inf)
3136 data->flags |= SF_IS_INF;
3137 }
3138 if (flags & SCF_DO_STCLASS) {
3139 cl_init(pRExC_state, &this_class);
3140 oclass = data->start_class;
3141 data->start_class = &this_class;
3142 f |= SCF_DO_STCLASS_AND;
3143 f &= ~SCF_DO_STCLASS_OR;
3144 }
3145 /* These are the cases when once a subexpression
3146 fails at a particular position, it cannot succeed
3147 even after backtracking at the enclosing scope.
3148
3149 XXXX what if minimal match and we are at the
3150 initial run of {n,m}? */
3151 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3152 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3153
8aa23a47
YO
3154 /* This will finish on WHILEM, setting scan, or on NULL: */
3155 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3156 last, data, stopparen, recursed, NULL,
3157 (mincount == 0
3158 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3159
8aa23a47
YO
3160 if (flags & SCF_DO_STCLASS)
3161 data->start_class = oclass;
3162 if (mincount == 0 || minnext == 0) {
3163 if (flags & SCF_DO_STCLASS_OR) {
3164 cl_or(pRExC_state, data->start_class, &this_class);
3165 }
3166 else if (flags & SCF_DO_STCLASS_AND) {
3167 /* Switch to OR mode: cache the old value of
3168 * data->start_class */
3169 INIT_AND_WITHP;
3170 StructCopy(data->start_class, and_withp,
3171 struct regnode_charclass_class);
3172 flags &= ~SCF_DO_STCLASS_AND;
3173 StructCopy(&this_class, data->start_class,
3174 struct regnode_charclass_class);
3175 flags |= SCF_DO_STCLASS_OR;
3176 data->start_class->flags |= ANYOF_EOS;
3177 }
3178 } else { /* Non-zero len */
3179 if (flags & SCF_DO_STCLASS_OR) {
3180 cl_or(pRExC_state, data->start_class, &this_class);
3181 cl_and(data->start_class, and_withp);
3182 }
3183 else if (flags & SCF_DO_STCLASS_AND)
3184 cl_and(data->start_class, &this_class);
3185 flags &= ~SCF_DO_STCLASS;
3186 }
3187 if (!scan) /* It was not CURLYX, but CURLY. */
3188 scan = next;
3189 if ( /* ? quantifier ok, except for (?{ ... }) */
3190 (next_is_eval || !(mincount == 0 && maxcount == 1))
3191 && (minnext == 0) && (deltanext == 0)
3192 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3193 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3194 {
668c081a
NC
3195 ckWARNreg(RExC_parse,
3196 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3197 }
3198
3199 min += minnext * mincount;
3200 is_inf_internal |= ((maxcount == REG_INFTY
3201 && (minnext + deltanext) > 0)
3202 || deltanext == I32_MAX);
3203 is_inf |= is_inf_internal;
3204 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3205
3206 /* Try powerful optimization CURLYX => CURLYN. */
3207 if ( OP(oscan) == CURLYX && data
3208 && data->flags & SF_IN_PAR
3209 && !(data->flags & SF_HAS_EVAL)
3210 && !deltanext && minnext == 1 ) {
3211 /* Try to optimize to CURLYN. */
3212 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3213 regnode * const nxt1 = nxt;
497b47a8 3214#ifdef DEBUGGING
8aa23a47 3215 regnode *nxt2;
497b47a8 3216#endif
c277df42 3217
8aa23a47
YO
3218 /* Skip open. */
3219 nxt = regnext(nxt);
3220 if (!strchr((const char*)PL_simple,OP(nxt))
3221 && !(PL_regkind[OP(nxt)] == EXACT
3222 && STR_LEN(nxt) == 1))
3223 goto nogo;
497b47a8 3224#ifdef DEBUGGING
8aa23a47 3225 nxt2 = nxt;
497b47a8 3226#endif
8aa23a47
YO
3227 nxt = regnext(nxt);
3228 if (OP(nxt) != CLOSE)
3229 goto nogo;
3230 if (RExC_open_parens) {
3231 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3232 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3233 }
3234 /* Now we know that nxt2 is the only contents: */
3235 oscan->flags = (U8)ARG(nxt);
3236 OP(oscan) = CURLYN;
3237 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3238
c277df42 3239#ifdef DEBUGGING
8aa23a47
YO
3240 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3241 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3242 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3243 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3244 OP(nxt + 1) = OPTIMIZED; /* was count. */
3245 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3246#endif
8aa23a47
YO
3247 }
3248 nogo:
3249
3250 /* Try optimization CURLYX => CURLYM. */
3251 if ( OP(oscan) == CURLYX && data
3252 && !(data->flags & SF_HAS_PAR)
3253 && !(data->flags & SF_HAS_EVAL)
3254 && !deltanext /* atom is fixed width */
3255 && minnext != 0 /* CURLYM can't handle zero width */
3256 ) {
3257 /* XXXX How to optimize if data == 0? */
3258 /* Optimize to a simpler form. */
3259 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3260 regnode *nxt2;
3261
3262 OP(oscan) = CURLYM;
3263 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3264 && (OP(nxt2) != WHILEM))
3265 nxt = nxt2;
3266 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3267 /* Need to optimize away parenths. */
3268 if (data->flags & SF_IN_PAR) {
3269 /* Set the parenth number. */
3270 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3271
3272 if (OP(nxt) != CLOSE)
3273 FAIL("Panic opt close");
3274 oscan->flags = (U8)ARG(nxt);
3275 if (RExC_open_parens) {
3276 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3277 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3278 }
8aa23a47
YO
3279 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3280 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3281
c277df42 3282#ifdef DEBUGGING
8aa23a47
YO
3283 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3284 OP(nxt + 1) = OPTIMIZED; /* was count. */
3285 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3286 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3287#endif
c277df42 3288#if 0
8aa23a47
YO
3289 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3290 regnode *nnxt = regnext(nxt1);
3291
3292 if (nnxt == nxt) {
3293 if (reg_off_by_arg[OP(nxt1)])
3294 ARG_SET(nxt1, nxt2 - nxt1);
3295 else if (nxt2 - nxt1 < U16_MAX)
3296 NEXT_OFF(nxt1) = nxt2 - nxt1;
3297 else
3298 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3299 }
8aa23a47 3300 nxt1 = nnxt;
c277df42 3301 }
5d1c421c 3302#endif
8aa23a47
YO
3303 /* Optimize again: */
3304 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3305 NULL, stopparen, recursed, NULL, 0,depth+1);
3306 }
3307 else
3308 oscan->flags = 0;
3309 }
3310 else if ((OP(oscan) == CURLYX)
3311 && (flags & SCF_WHILEM_VISITED_POS)
3312 /* See the comment on a similar expression above.
3313 However, this time it not a subexpression
3314 we care about, but the expression itself. */
3315 && (maxcount == REG_INFTY)
3316 && data && ++data->whilem_c < 16) {
3317 /* This stays as CURLYX, we can put the count/of pair. */
3318 /* Find WHILEM (as in regexec.c) */
3319 regnode *nxt = oscan + NEXT_OFF(oscan);
3320
3321 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3322 nxt += ARG(nxt);
3323 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3324 | (RExC_whilem_seen << 4)); /* On WHILEM */
3325 }
3326 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3327 pars++;
3328 if (flags & SCF_DO_SUBSTR) {
3329 SV *last_str = NULL;
3330 int counted = mincount != 0;
a0ed51b3 3331
8aa23a47
YO
3332 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3333#if defined(SPARC64_GCC_WORKAROUND)
3334 I32 b = 0;
3335 STRLEN l = 0;
3336 const char *s = NULL;
3337 I32 old = 0;
b515a41d 3338
8aa23a47
YO
3339 if (pos_before >= data->last_start_min)
3340 b = pos_before;
3341 else
3342 b = data->last_start_min;
b515a41d 3343
8aa23a47
YO
3344 l = 0;
3345 s = SvPV_const(data->last_found, l);
3346 old = b - data->last_start_min;
3347
3348#else
3349 I32 b = pos_before >= data->last_start_min
3350 ? pos_before : data->last_start_min;
3351 STRLEN l;
3352 const char * const s = SvPV_const(data->last_found, l);
3353 I32 old = b - data->last_start_min;
3354#endif
3355
3356 if (UTF)
3357 old = utf8_hop((U8*)s, old) - (U8*)s;
3358
3359 l -= old;
3360 /* Get the added string: */
740cce10 3361 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3362 if (deltanext == 0 && pos_before == b) {
3363 /* What was added is a constant string */
3364 if (mincount > 1) {
3365 SvGROW(last_str, (mincount * l) + 1);
3366 repeatcpy(SvPVX(last_str) + l,
3367 SvPVX_const(last_str), l, mincount - 1);
3368 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3369 /* Add additional parts. */
3370 SvCUR_set(data->last_found,
3371 SvCUR(data->last_found) - l);
3372 sv_catsv(data->last_found, last_str);
3373 {
3374 SV * sv = data->last_found;
3375 MAGIC *mg =
3376 SvUTF8(sv) && SvMAGICAL(sv) ?
3377 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3378 if (mg && mg->mg_len >= 0)
bd94e887 3379 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3380 }
8aa23a47 3381 data->last_end += l * (mincount - 1);
b515a41d 3382 }
8aa23a47
YO
3383 } else {
3384 /* start offset must point into the last copy */
3385 data->last_start_min += minnext * (mincount - 1);
3386 data->last_start_max += is_inf ? I32_MAX
3387 : (maxcount - 1) * (minnext + data->pos_delta);
3388 }
c277df42 3389 }
8aa23a47
YO
3390 /* It is counted once already... */
3391 data->pos_min += minnext * (mincount - counted);
3392 data->pos_delta += - counted * deltanext +
3393 (minnext + deltanext) * maxcount - minnext * mincount;
3394 if (mincount != maxcount) {
3395 /* Cannot extend fixed substrings found inside
3396 the group. */
304ee84b 3397 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3398 if (mincount && last_str) {
3399 SV * const sv = data->last_found;
3400 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3402
3403 if (mg)
3404 mg->mg_len = -1;
3405 sv_setsv(sv, last_str);
3406 data->last_end = data->pos_min;
3407 data->last_start_min =
3408 data->pos_min - CHR_SVLEN(last_str);
3409 data->last_start_max = is_inf
3410 ? I32_MAX
3411 : data->pos_min + data->pos_delta
3412 - CHR_SVLEN(last_str);
3413 }
3414 data->longest = &(data->longest_float);
3415 }
3416 SvREFCNT_dec(last_str);
c277df42 3417 }
8aa23a47
YO
3418 if (data && (fl & SF_HAS_EVAL))
3419 data->flags |= SF_HAS_EVAL;
3420 optimize_curly_tail:
3421 if (OP(oscan) != CURLYX) {
3422 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3423 && NEXT_OFF(next))
3424 NEXT_OFF(oscan) += NEXT_OFF(next);
3425 }
3426 continue;
3427 default: /* REF and CLUMP only? */
3428 if (flags & SCF_DO_SUBSTR) {
304ee84b 3429 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3430 data->longest = &(data->longest_float);
3431 }
3432 is_inf = is_inf_internal = 1;
3433 if (flags & SCF_DO_STCLASS_OR)
3434 cl_anything(pRExC_state, data->start_class);
3435 flags &= ~SCF_DO_STCLASS;
3436 break;
c277df42 3437 }
8aa23a47 3438 }
e1d1eefb
YO
3439 else if (OP(scan) == LNBREAK) {
3440 if (flags & SCF_DO_STCLASS) {
3441 int value = 0;
3442 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3443 if (flags & SCF_DO_STCLASS_AND) {
3444 for (value = 0; value < 256; value++)
e64b1bd1 3445 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3447 }
3448 else {
3449 for (value = 0; value < 256; value++)
e64b1bd1 3450 if (is_VERTWS_cp(value))
e1d1eefb
YO
3451 ANYOF_BITMAP_SET(data->start_class, value);
3452 }
3453 if (flags & SCF_DO_STCLASS_OR)
3454 cl_and(data->start_class, and_withp);
3455 flags &= ~SCF_DO_STCLASS;
3456 }
3457 min += 1;
f9a79580 3458 delta += 1;
e1d1eefb
YO
3459 if (flags & SCF_DO_SUBSTR) {
3460 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3461 data->pos_min += 1;
f9a79580 3462 data->pos_delta += 1;
e1d1eefb
YO
3463 data->longest = &(data->longest_float);
3464 }
3465
3466 }
f9a79580
RGS
3467 else if (OP(scan) == FOLDCHAR) {
3468 int d = ARG(scan)==0xDF ? 1 : 2;
3469 flags &= ~SCF_DO_STCLASS;
3470 min += 1;
3471 delta += d;
3472 if (flags & SCF_DO_SUBSTR) {
3473 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3474 data->pos_min += 1;
3475 data->pos_delta += d;
3476 data->longest = &(data->longest_float);
3477 }
3478 }
8aa23a47
YO
3479 else if (strchr((const char*)PL_simple,OP(scan))) {
3480 int value = 0;
653099ff 3481
8aa23a47 3482 if (flags & SCF_DO_SUBSTR) {
304ee84b 3483 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3484 data->pos_min++;
3485 }
3486 min++;
3487 if (flags & SCF_DO_STCLASS) {
3488 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3489
8aa23a47
YO
3490 /* Some of the logic below assumes that switching
3491 locale on will only add false positives. */
3492 switch (PL_regkind[OP(scan)]) {
3493 case SANY:
3494 default:
3495 do_default:
3496 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3497 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3498 cl_anything(pRExC_state, data->start_class);
3499 break;
3500 case REG_ANY:
3501 if (OP(scan) == SANY)
3502 goto do_default;
3503 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3504 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3505 || (data->start_class->flags & ANYOF_CLASS));
3506 cl_anything(pRExC_state, data->start_class);
653099ff 3507 }
8aa23a47
YO
3508 if (flags & SCF_DO_STCLASS_AND || !value)
3509 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3510 break;
3511 case ANYOF:
3512 if (flags & SCF_DO_STCLASS_AND)
3513 cl_and(data->start_class,
3514 (struct regnode_charclass_class*)scan);
653099ff 3515 else
8aa23a47
YO
3516 cl_or(pRExC_state, data->start_class,
3517 (struct regnode_charclass_class*)scan);
3518 break;
3519 case ALNUM:
3520 if (flags & SCF_DO_STCLASS_AND) {
3521 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3523 for (value = 0; value < 256; value++)
3524 if (!isALNUM(value))
3525 ANYOF_BITMAP_CLEAR(data->start_class, value);
3526 }
653099ff 3527 }
8aa23a47
YO
3528 else {
3529 if (data->start_class->flags & ANYOF_LOCALE)
3530 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3531 else {
3532 for (value = 0; value < 256; value++)
3533 if (isALNUM(value))
3534 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3535 }
8aa23a47
YO
3536 }
3537 break;
3538 case ALNUML:
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (data->start_class->flags & ANYOF_LOCALE)
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3542 }
3543 else {
3544 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3545 data->start_class->flags |= ANYOF_LOCALE;
3546 }
3547 break;
3548 case NALNUM:
3549 if (flags & SCF_DO_STCLASS_AND) {
3550 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3552 for (value = 0; value < 256; value++)
3553 if (isALNUM(value))
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3555 }
3556 }
8aa23a47
YO
3557 else {
3558 if (data->start_class->flags & ANYOF_LOCALE)
3559 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3560 else {
3561 for (value = 0; value < 256; value++)
3562 if (!isALNUM(value))
3563 ANYOF_BITMAP_SET(data->start_class, value);
3564 }
653099ff 3565 }
8aa23a47
YO
3566 break;
3567 case NALNUML:
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (data->start_class->flags & ANYOF_LOCALE)
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3571 }
8aa23a47
YO
3572 else {
3573 data->start_class->flags |= ANYOF_LOCALE;
3574 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3575 }
3576 break;
3577 case SPACE:
3578 if (flags & SCF_DO_STCLASS_AND) {
3579 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3580 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3581 for (value = 0; value < 256; value++)
3582 if (!isSPACE(value))
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3584 }
3585 }
8aa23a47
YO
3586 else {
3587 if (data->start_class->flags & ANYOF_LOCALE)
3588 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3589 else {
3590 for (value = 0; value < 256; value++)
3591 if (isSPACE(value))
3592 ANYOF_BITMAP_SET(data->start_class, value);
3593 }
653099ff 3594 }
8aa23a47
YO
3595 break;
3596 case SPACEL:
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 if (data->start_class->flags & ANYOF_LOCALE)
3599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3600 }
3601 else {
3602 data->start_class->flags |= ANYOF_LOCALE;
3603 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3604 }
3605 break;
3606 case NSPACE:
3607 if (flags & SCF_DO_STCLASS_AND) {
3608 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3609 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3610 for (value = 0; value < 256; value++)
3611 if (isSPACE(value))
3612 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3613 }
8aa23a47
YO
3614 }
3615 else {
3616 if (data->start_class->flags & ANYOF_LOCALE)
3617 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3618 else {
3619 for (value = 0; value < 256; value++)
3620 if (!isSPACE(value))
3621 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3622 }
3623 }
8aa23a47
YO
3624 break;
3625 case NSPACEL:
3626 if (flags & SCF_DO_STCLASS_AND) {
3627 if (data->start_class->flags & ANYOF_LOCALE) {
3628 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3629 for (value = 0; value < 256; value++)
3630 if (!isSPACE(value))
3631 ANYOF_BITMAP_CLEAR(data->start_class, value);
3632 }
653099ff 3633 }
8aa23a47
YO
3634 else {
3635 data->start_class->flags |= ANYOF_LOCALE;
3636 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3637 }
3638 break;
3639 case DIGIT:
3640 if (flags & SCF_DO_STCLASS_AND) {
3641 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3642 for (value = 0; value < 256; value++)
3643 if (!isDIGIT(value))
3644 ANYOF_BITMAP_CLEAR(data->start_class, value);
3645 }
3646 else {
3647 if (data->start_class->flags & ANYOF_LOCALE)
3648 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3649 else {
3650 for (value = 0; value < 256; value++)
3651 if (isDIGIT(value))
3652 ANYOF_BITMAP_SET(data->start_class, value);
3653 }
3654 }
3655 break;
3656 case NDIGIT:
3657 if (flags & SCF_DO_STCLASS_AND) {
3658 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3659 for (value = 0; value < 256; value++)
3660 if (isDIGIT(value))
3661 ANYOF_BITMAP_CLEAR(data->start_class, value);
3662 }
3663 else {
3664 if (data->start_class->flags & ANYOF_LOCALE)
3665 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3666 else {
3667 for (value = 0; value < 256; value++)
3668 if (!isDIGIT(value))
3669 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3670 }
3671 }
8aa23a47 3672 break;
e1d1eefb
YO
3673 CASE_SYNST_FNC(VERTWS);
3674 CASE_SYNST_FNC(HORIZWS);
3675
8aa23a47
YO
3676 }
3677 if (flags & SCF_DO_STCLASS_OR)
3678 cl_and(data->start_class, and_withp);
3679 flags &= ~SCF_DO_STCLASS;
3680 }
3681 }
3682 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3683 data->flags |= (OP(scan) == MEOL
3684 ? SF_BEFORE_MEOL
3685 : SF_BEFORE_SEOL);
3686 }
3687 else if ( PL_regkind[OP(scan)] == BRANCHJ
3688 /* Lookbehind, or need to calculate parens/evals/stclass: */
3689 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3690 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3691 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3692 || OP(scan) == UNLESSM )
3693 {
3694 /* Negative Lookahead/lookbehind
3695 In this case we can't do fixed string optimisation.
3696 */
1de06328 3697
8aa23a47
YO
3698 I32 deltanext, minnext, fake = 0;
3699 regnode *nscan;
3700 struct regnode_charclass_class intrnl;
3701 int f = 0;
1de06328 3702
8aa23a47
YO
3703 data_fake.flags = 0;
3704 if (data) {
3705 data_fake.whilem_c = data->whilem_c;
3706 data_fake.last_closep = data->last_closep;
c277df42 3707 }
8aa23a47
YO
3708 else
3709 data_fake.last_closep = &fake;
58e23c8d 3710 data_fake.pos_delta = delta;
8aa23a47
YO
3711 if ( flags & SCF_DO_STCLASS && !scan->flags
3712 && OP(scan) == IFMATCH ) { /* Lookahead */
3713 cl_init(pRExC_state, &intrnl);
3714 data_fake.start_class = &intrnl;
3715 f |= SCF_DO_STCLASS_AND;
3716 }
3717 if (flags & SCF_WHILEM_VISITED_POS)
3718 f |= SCF_WHILEM_VISITED_POS;
3719 next = regnext(scan);
3720 nscan = NEXTOPER(NEXTOPER(scan));
3721 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3722 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3723 if (scan->flags) {
3724 if (deltanext) {
58e23c8d 3725 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3726 }
3727 else if (minnext > (I32)U8_MAX) {
58e23c8d 3728 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3729 }
3730 scan->flags = (U8)minnext;
3731 }
3732 if (data) {
3733 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3734 pars++;
3735 if (data_fake.flags & SF_HAS_EVAL)
3736 data->flags |= SF_HAS_EVAL;
3737 data->whilem_c = data_fake.whilem_c;
3738 }
3739 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3740 if (flags & SCF_DO_STCLASS_OR) {
3741 /* OR before, AND after: ideally we would recurse with
3742 * data_fake to get the AND applied by study of the
3743 * remainder of the pattern, and then derecurse;
3744 * *** HACK *** for now just treat as "no information".
3745 * See [perl #56690].
3746 */
3747 cl_init(pRExC_state, data->start_class);
3748 } else {
3749 /* AND before and after: combine and continue */
3750 const int was = (data->start_class->flags & ANYOF_EOS);
3751
3752 cl_and(data->start_class, &intrnl);
3753 if (was)
3754 data->start_class->flags |= ANYOF_EOS;
3755 }
8aa23a47 3756 }
cb434fcc 3757 }
8aa23a47
YO
3758#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3759 else {
3760 /* Positive Lookahead/lookbehind
3761 In this case we can do fixed string optimisation,
3762 but we must be careful about it. Note in the case of
3763 lookbehind the positions will be offset by the minimum
3764 length of the pattern, something we won't know about
3765 until after the recurse.
3766 */
3767 I32 deltanext, fake = 0;
3768 regnode *nscan;
3769 struct regnode_charclass_class intrnl;
3770 int f = 0;
3771 /* We use SAVEFREEPV so that when the full compile
3772 is finished perl will clean up the allocated
3773 minlens when its all done. This was we don't
3774 have to worry about freeing them when we know
3775 they wont be used, which would be a pain.
3776 */
3777 I32 *minnextp;
3778 Newx( minnextp, 1, I32 );
3779 SAVEFREEPV(minnextp);
3780
3781 if (data) {
3782 StructCopy(data, &data_fake, scan_data_t);
3783 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3784 f |= SCF_DO_SUBSTR;
3785 if (scan->flags)
304ee84b 3786 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3787 data_fake.last_found=newSVsv(data->last_found);
3788 }
3789 }
3790 else
3791 data_fake.last_closep = &fake;
3792 data_fake.flags = 0;
58e23c8d 3793 data_fake.pos_delta = delta;
8aa23a47
YO
3794 if (is_inf)
3795 data_fake.flags |= SF_IS_INF;
3796 if ( flags & SCF_DO_STCLASS && !scan->flags
3797 && OP(scan) == IFMATCH ) { /* Lookahead */
3798 cl_init(pRExC_state, &intrnl);
3799 data_fake.start_class = &intrnl;
3800 f |= SCF_DO_STCLASS_AND;
3801 }
3802 if (flags & SCF_WHILEM_VISITED_POS)
3803 f |= SCF_WHILEM_VISITED_POS;
3804 next = regnext(scan);
3805 nscan = NEXTOPER(NEXTOPER(scan));
3806
3807 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3808 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3809 if (scan->flags) {
3810 if (deltanext) {
58e23c8d 3811 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3812 }
3813 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3814 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3815 }
3816 scan->flags = (U8)*minnextp;
3817 }
3818
3819 *minnextp += min;
3820
3821 if (f & SCF_DO_STCLASS_AND) {
3822 const int was = (data->start_class->flags & ANYOF_EOS);
3823
3824 cl_and(data->start_class, &intrnl);
3825 if (was)
3826 data->start_class->flags |= ANYOF_EOS;
3827 }
3828 if (data) {
3829 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3830 pars++;
3831 if (data_fake.flags & SF_HAS_EVAL)
3832 data->flags |= SF_HAS_EVAL;
3833 data->whilem_c = data_fake.whilem_c;
3834 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3835 if (RExC_rx->minlen<*minnextp)
3836 RExC_rx->minlen=*minnextp;
304ee84b 3837 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3838 SvREFCNT_dec(data_fake.last_found);
3839
3840 if ( data_fake.minlen_fixed != minlenp )
3841 {
3842 data->offset_fixed= data_fake.offset_fixed;
3843 data->minlen_fixed= data_fake.minlen_fixed;
3844 data->lookbehind_fixed+= scan->flags;
3845 }
3846 if ( data_fake.minlen_float != minlenp )
3847 {
3848 data->minlen_float= data_fake.minlen_float;
3849 data->offset_float_min=data_fake.offset_float_min;
3850 data->offset_float_max=data_fake.offset_float_max;
3851 data->lookbehind_float+= scan->flags;
3852 }
3853 }
3854 }
3855
3856
40d049e4 3857 }
8aa23a47
YO
3858#endif
3859 }
3860 else if (OP(scan) == OPEN) {
3861 if (stopparen != (I32)ARG(scan))
3862 pars++;
3863 }
3864 else if (OP(scan) == CLOSE) {
3865 if (stopparen == (I32)ARG(scan)) {
3866 break;
3867 }
3868 if ((I32)ARG(scan) == is_par) {
3869 next = regnext(scan);
b515a41d 3870
8aa23a47
YO
3871 if ( next && (OP(next) != WHILEM) && next < last)
3872 is_par = 0; /* Disable optimization */
40d049e4 3873 }
8aa23a47
YO
3874 if (data)
3875 *(data->last_closep) = ARG(scan);
3876 }
3877 else if (OP(scan) == EVAL) {
c277df42
IZ
3878 if (data)
3879 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3880 }
3881 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3882 if (flags & SCF_DO_SUBSTR) {
304ee84b 3883 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3884 flags &= ~SCF_DO_SUBSTR;
40d049e4 3885 }
8aa23a47
YO
3886 if (data && OP(scan)==ACCEPT) {
3887 data->flags |= SCF_SEEN_ACCEPT;
3888 if (stopmin > min)
3889 stopmin = min;
e2e6a0f1 3890 }
8aa23a47
YO
3891 }
3892 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3893 {
0f5d15d6 3894 if (flags & SCF_DO_SUBSTR) {
304ee84b 3895 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3896 data->longest = &(data->longest_float);
3897 }
3898 is_inf = is_inf_internal = 1;
653099ff 3899 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3900 cl_anything(pRExC_state, data->start_class);
96776eda 3901 flags &= ~SCF_DO_STCLASS;
8aa23a47 3902 }
58e23c8d 3903 else if (OP(scan) == GPOS) {
bbe252da 3904 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3905 !(delta || is_inf || (data && data->pos_delta)))
3906 {
bbe252da
YO
3907 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3908 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3909 if (RExC_rx->gofs < (U32)min)
3910 RExC_rx->gofs = min;
3911 } else {
bbe252da 3912 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3913 RExC_rx->gofs = 0;
3914 }
3915 }
786e8c11 3916#ifdef TRIE_STUDY_OPT
40d049e4 3917#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3918 else if (PL_regkind[OP(scan)] == TRIE) {
3919 /* NOTE - There is similar code to this block above for handling
3920 BRANCH nodes on the initial study. If you change stuff here
3921 check there too. */
3922 regnode *trie_node= scan;
3923 regnode *tail= regnext(scan);
f8fc2ecf 3924 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3925 I32 max1 = 0, min1 = I32_MAX;
3926 struct regnode_charclass_class accum;
3927
3928 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3929 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3930 if (flags & SCF_DO_STCLASS)
3931 cl_init_zero(pRExC_state, &accum);
3932
3933 if (!trie->jump) {
3934 min1= trie->minlen;
3935 max1= trie->maxlen;
3936 } else {
3937 const regnode *nextbranch= NULL;
3938 U32 word;
3939
3940 for ( word=1 ; word <= trie->wordcount ; word++)
3941 {
3942 I32 deltanext=0, minnext=0, f = 0, fake;
3943 struct regnode_charclass_class this_class;
3944
3945 data_fake.flags = 0;
3946 if (data) {
3947 data_fake.whilem_c = data->whilem_c;
3948 data_fake.last_closep = data->last_closep;
3949 }
3950 else
3951 data_fake.last_closep = &fake;
58e23c8d 3952 data_fake.pos_delta = delta;
8aa23a47
YO
3953 if (flags & SCF_DO_STCLASS) {
3954 cl_init(pRExC_state, &this_class);
3955 data_fake.start_class = &this_class;
3956 f = SCF_DO_STCLASS_AND;
3957 }
3958 if (flags & SCF_WHILEM_VISITED_POS)
3959 f |= SCF_WHILEM_VISITED_POS;
3960
3961 if (trie->jump[word]) {
3962 if (!nextbranch)
3963 nextbranch = trie_node + trie->jump[0];
3964 scan= trie_node + trie->jump[word];
3965 /* We go from the jump point to the branch that follows
3966 it. Note this means we need the vestigal unused branches
3967 even though they arent otherwise used.
3968 */
3969 minnext = study_chunk(pRExC_state, &scan, minlenp,
3970 &deltanext, (regnode *)nextbranch, &data_fake,
3971 stopparen, recursed, NULL, f,depth+1);
3972 }
3973 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3974 nextbranch= regnext((regnode*)nextbranch);
3975
3976 if (min1 > (I32)(minnext + trie->minlen))
3977 min1 = minnext + trie->minlen;
3978 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3979 max1 = minnext + deltanext + trie->maxlen;
3980 if (deltanext == I32_MAX)
3981 is_inf = is_inf_internal = 1;
3982
3983 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3984 pars++;
3985 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3986 if ( stopmin > min + min1)
3987 stopmin = min + min1;
3988 flags &= ~SCF_DO_SUBSTR;
3989 if (data)
3990 data->flags |= SCF_SEEN_ACCEPT;
3991 }
3992 if (data) {
3993 if (data_fake.flags & SF_HAS_EVAL)
3994 data->flags |= SF_HAS_EVAL;
3995 data->whilem_c = data_fake.whilem_c;
3996 }
3997 if (flags & SCF_DO_STCLASS)
3998 cl_or(pRExC_state, &accum, &this_class);
3999 }
4000 }
4001 if (flags & SCF_DO_SUBSTR) {
4002 data->pos_min += min1;
4003 data->pos_delta += max1 - min1;
4004 if (max1 != min1 || is_inf)
4005 data->longest = &(data->longest_float);
4006 }
4007 min += min1;
4008 delta += max1 - min1;
4009 if (flags & SCF_DO_STCLASS_OR) {
4010 cl_or(pRExC_state, data->start_class, &accum);
4011 if (min1) {
4012 cl_and(data->start_class, and_withp);
4013 flags &= ~SCF_DO_STCLASS;
4014 }
4015 }
4016 else if (flags & SCF_DO_STCLASS_AND) {
4017 if (min1) {
4018 cl_and(data->start_class, &accum);
4019 flags &= ~SCF_DO_STCLASS;
4020 }
4021 else {
4022 /* Switch to OR mode: cache the old value of
4023 * data->start_class */
4024 INIT_AND_WITHP;
4025 StructCopy(data->start_class, and_withp,
4026 struct regnode_charclass_class);
4027 flags &= ~SCF_DO_STCLASS_AND;
4028 StructCopy(&accum, data->start_class,
4029 struct regnode_charclass_class);
4030 flags |= SCF_DO_STCLASS_OR;
4031 data->start_class->flags |= ANYOF_EOS;
4032 }
4033 }
4034 scan= tail;
4035 continue;
4036 }
786e8c11 4037#else
8aa23a47 4038 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4039 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4040 U8*bang=NULL;
4041
4042 min += trie->minlen;
4043 delta += (trie->maxlen - trie->minlen);
4044 flags &= ~SCF_DO_STCLASS; /* xxx */
4045 if (flags & SCF_DO_SUBSTR) {
304ee84b 4046 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4047 data->pos_min += trie->minlen;
4048 data->pos_delta += (trie->maxlen - trie->minlen);
4049 if (trie->maxlen != trie->minlen)
4050 data->longest = &(data->longest_float);
4051 }
4052 if (trie->jump) /* no more substrings -- for now /grr*/
4053 flags &= ~SCF_DO_SUBSTR;
b515a41d 4054 }
8aa23a47
YO
4055#endif /* old or new */
4056#endif /* TRIE_STUDY_OPT */
e1d1eefb 4057
8aa23a47
YO
4058 /* Else: zero-length, ignore. */
4059 scan = regnext(scan);
4060 }
4061 if (frame) {
4062 last = frame->last;
4063 scan = frame->next;
4064 stopparen = frame->stop;
4065 frame = frame->prev;
4066 goto fake_study_recurse;
c277df42
IZ
4067 }
4068
4069 finish:
8aa23a47 4070 assert(!frame);
304ee84b 4071 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4072
c277df42 4073 *scanp = scan;
aca2d497 4074 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4075 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4076 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4077 if (is_par > (I32)U8_MAX)
c277df42
IZ
4078 is_par = 0;
4079 if (is_par && pars==1 && data) {
4080 data->flags |= SF_IN_PAR;
4081 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4082 }
4083 else if (pars && data) {
c277df42
IZ
4084 data->flags |= SF_HAS_PAR;
4085 data->flags &= ~SF_IN_PAR;
4086 }
653099ff 4087 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4088 cl_and(data->start_class, and_withp);
786e8c11
YO
4089 if (flags & SCF_TRIE_RESTUDY)
4090 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4091
304ee84b 4092 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4093
e2e6a0f1 4094 return min < stopmin ? min : stopmin;
c277df42
IZ
4095}
4096
2eccd3b2
NC
4097STATIC U32
4098S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4099{
4a4e7719
NC
4100 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4101
7918f24d
NC
4102 PERL_ARGS_ASSERT_ADD_DATA;
4103
4a4e7719
NC
4104 Renewc(RExC_rxi->data,
4105 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4106 char, struct reg_data);
4107 if(count)
f8fc2ecf 4108 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4109 else
f8fc2ecf 4110 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4111 RExC_rxi->data->count = count + n;
4112 Copy(s, RExC_rxi->data->what + count, n, U8);
4113 return count;
c277df42
IZ
4114}
4115
f8149455 4116/*XXX: todo make this not included in a non debugging perl */
76234dfb 4117#ifndef PERL_IN_XSUB_RE
d88dccdf 4118void
864dbfa3 4119Perl_reginitcolors(pTHX)
d88dccdf 4120{
97aff369 4121 dVAR;
1df70142 4122 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4123 if (s) {
1df70142
AL
4124 char *t = savepv(s);
4125 int i = 0;
4126 PL_colors[0] = t;
d88dccdf 4127 while (++i < 6) {
1df70142
AL
4128 t = strchr(t, '\t');
4129 if (t) {
4130 *t = '\0';
4131 PL_colors[i] = ++t;
d88dccdf
IZ
4132 }
4133 else
1df70142 4134 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4135 }
4136 } else {
1df70142 4137 int i = 0;
b81d288d 4138 while (i < 6)
06b5626a 4139 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4140 }
4141 PL_colorset = 1;
4142}
76234dfb 4143#endif
8615cb43 4144
07be1b83 4145
786e8c11
YO
4146#ifdef TRIE_STUDY_OPT
4147#define CHECK_RESTUDY_GOTO \
4148 if ( \
4149 (data.flags & SCF_TRIE_RESTUDY) \
4150 && ! restudied++ \
4151 ) goto reStudy
4152#else
4153#define CHECK_RESTUDY_GOTO
4154#endif
f9f4320a 4155
a687059c 4156/*
e50aee73 4157 - pregcomp - compile a regular expression into internal code
a687059c
LW
4158 *
4159 * We can't allocate space until we know how big the compiled form will be,
4160 * but we can't compile it (and thus know how big it is) until we've got a
4161 * place to put the code. So we cheat: we compile it twice, once with code
4162 * generation turned off and size counting turned on, and once "for real".
4163 * This also means that we don't allocate space until we are sure that the
4164 * thing really will compile successfully, and we never have to move the
4165 * code and thus invalidate pointers into it. (Note that it has to be in
4166 * one piece because free() must be able to free it all.) [NB: not true in perl]
4167 *
4168 * Beware that the optimization-preparation code in here knows about some
4169 * of the structure of the compiled regexp. [I'll say.]
4170 */
b9b4dddf
YO
4171
4172
4173
f9f4320a 4174#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4175#define RE_ENGINE_PTR &PL_core_reg_engine
4176#else
f9f4320a
YO
4177extern const struct regexp_engine my_reg_engine;
4178#define RE_ENGINE_PTR &my_reg_engine
4179#endif
6d5c990f
RGS
4180
4181#ifndef PERL_IN_XSUB_RE
3ab4a224 4182REGEXP *
1593ad57 4183Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4184{
97aff369 4185 dVAR;
6d5c990f 4186 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4187
4188 PERL_ARGS_ASSERT_PREGCOMP;
4189
f9f4320a
YO
4190 /* Dispatch a request to compile a regexp to correct
4191 regexp engine. */
f9f4320a
YO
4192 if (table) {
4193 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4194 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4195 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4196 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4197 DEBUG_COMPILE_r({
8d8756e7 4198 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4199 SvIV(*ptr));
4200 });
3ab4a224 4201 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4202 }
b9b4dddf 4203 }
3ab4a224 4204 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4205}
6d5c990f 4206#endif
2a5d9b1d 4207
3ab4a224 4208REGEXP *
1593ad57 4209Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4210{
4211 dVAR;
288b8c02
NC
4212 REGEXP *rx;
4213 struct regexp *r;
f8fc2ecf 4214 register regexp_internal *ri;
3ab4a224 4215 STRLEN plen;
1593ad57 4216 char *exp = SvPV(pattern, plen);
3ab4a224 4217 char* xend = exp + plen;
c277df42 4218 regnode *scan;
a0d0e21e 4219 I32 flags;
a0d0e21e
LW
4220 I32 minlen = 0;
4221 I32 sawplus = 0;
4222 I32 sawopen = 0;
2c2d71f5 4223 scan_data_t data;
830247a4 4224 RExC_state_t RExC_state;
be8e71aa 4225 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4226#ifdef TRIE_STUDY_OPT
4227 int restudied= 0;
4228 RExC_state_t copyRExC_state;
4229#endif
2a5d9b1d 4230 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4231
4232 PERL_ARGS_ASSERT_RE_COMPILE;
4233
6d5c990f 4234 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4235
b9ad30b4 4236 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4237
a3621e74 4238 DEBUG_COMPILE_r({
ab3bbdeb
YO
4239 SV *dsv= sv_newmortal();
4240 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4241 dsv, exp, plen, 60);
ab3bbdeb
YO
4242 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4243 PL_colors[4],PL_colors[5],s);
a5961de5 4244 });
02daf0ab
YO
4245
4246redo_first_pass:
4247 RExC_precomp = exp;
c737faaf 4248 RExC_flags = pm_flags;
830247a4 4249 RExC_sawback = 0;
bbce6d69 4250
830247a4
IZ
4251 RExC_seen = 0;
4252 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4253 RExC_seen_evals = 0;
4254 RExC_extralen = 0;
c277df42 4255
bbce6d69 4256 /* First pass: determine size, legality. */
830247a4 4257 RExC_parse = exp;
fac92740 4258 RExC_start = exp;
830247a4
IZ
4259 RExC_end = xend;
4260 RExC_naughty = 0;
4261 RExC_npar = 1;
e2e6a0f1 4262 RExC_nestroot = 0;
830247a4
IZ
4263 RExC_size = 0L;
4264 RExC_emit = &PL_regdummy;
4265 RExC_whilem_seen = 0;
fc8cd66c 4266 RExC_charnames = NULL;
40d049e4
YO
4267 RExC_open_parens = NULL;
4268 RExC_close_parens = NULL;
4269 RExC_opend = NULL;
81714fb9 4270 RExC_paren_names = NULL;
1f1031fe
YO
4271#ifdef DEBUGGING
4272 RExC_paren_name_list = NULL;
4273#endif
40d049e4
YO
4274 RExC_recurse = NULL;
4275 RExC_recurse_count = 0;
81714fb9 4276
85ddcde9
JH
4277#if 0 /* REGC() is (currently) a NOP at the first pass.
4278 * Clever compilers notice this and complain. --jhi */
830247a4 4279 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4280#endif
3dab1dad
YO
4281 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4282 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4283 RExC_precomp = NULL;
a0d0e21e
LW
4284 return(NULL);
4285 }
02daf0ab 4286 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4287 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4288 codepoints outside of the byte range, such as via \x{100}. If we
4289 detect such a sequence we have to convert the entire pattern to utf8
4290 and then recompile, as our sizing calculation will have been based
4291 on 1 byte == 1 character, but we will need to use utf8 to encode
4292 at least some part of the pattern, and therefore must convert the whole
4293 thing.
4294 XXX: somehow figure out how to make this less expensive...
4295 -- dmq */
3ab4a224 4296 STRLEN len = plen;
02daf0ab
YO
4297 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4298 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4299 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4300 xend = exp + len;
4301 RExC_orig_utf8 = RExC_utf8;
4302 SAVEFREEPV(exp);
4303 goto redo_first_pass;
4304 }
07be1b83 4305 DEBUG_PARSE_r({
81714fb9
YO
4306 PerlIO_printf(Perl_debug_log,
4307 "Required size %"IVdf" nodes\n"
4308 "Starting second pass (creation)\n",
4309 (IV)RExC_size);
07be1b83
YO
4310 RExC_lastnum=0;
4311 RExC_lastparse=NULL;
4312 });
c277df42
IZ
4313 /* Small enough for pointer-storage convention?
4314 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4315 if (RExC_size >= 0x10000L && RExC_extralen)
4316 RExC_size += RExC_extralen;
c277df42 4317 else
830247a4
IZ
4318 RExC_extralen = 0;
4319 if (RExC_whilem_seen > 15)
4320 RExC_whilem_seen = 15;
a0d0e21e 4321
f9f4320a
YO
4322 /* Allocate space and zero-initialize. Note, the two step process
4323 of zeroing when in debug mode, thus anything assigned has to
4324 happen after that */
d2f13c59 4325 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4326 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4327 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4328 char, regexp_internal);
4329 if ( r == NULL || ri == NULL )
b45f050a 4330 FAIL("Regexp out of space");
0f79a09d
GS
4331#ifdef DEBUGGING
4332 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4333 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4334#else
f8fc2ecf
YO
4335 /* bulk initialize base fields with 0. */
4336 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4337#endif
58e23c8d
YO
4338
4339 /* non-zero initialization begins here */
f8fc2ecf 4340 RXi_SET( r, ri );
f9f4320a 4341 r->engine= RE_ENGINE_PTR;
c737faaf 4342 r->extflags = pm_flags;
bcdf7404 4343 {
f7819f85 4344 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4345 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4346 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4347 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4348 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4349 const char *fptr = STD_PAT_MODS; /*"msix"*/
4350 char *p;
9d17798d 4351 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4352 + (sizeof(STD_PAT_MODS) - 1)
4353 + (sizeof("(?:)") - 1);
4354
ad64d0ec 4355 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
9d17798d 4356 SvCUR_set(rx, wraplen);
f7c278bf 4357 SvPOK_on(rx);
8f6ae13c 4358 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4359 *p++='('; *p++='?';
f7819f85
A
4360 if (has_p)
4361 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4362 {
4363 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4364 char *colon = r + 1;
4365 char ch;
4366
4367 while((ch = *fptr++)) {
4368 if(reganch & 1)
4369 *p++ = ch;
4370 else
4371 *r-- = ch;
4372 reganch >>= 1;
4373 }
4374 if(has_minus) {
4375 *r = '-';
4376 p = colon;
4377 }
4378 }
4379
28d8d7f4 4380 *p++ = ':';
bb661a58 4381 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4382 assert ((RX_WRAPPED(rx) - p) < 16);
4383 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4384 p += plen;
bcdf7404 4385 if (has_runon)
28d8d7f4
YO
4386 *p++ = '\n';
4387 *p++ = ')';
4388 *p = 0;
bcdf7404
YO
4389 }
4390
bbe252da 4391 r->intflags = 0;
830247a4 4392 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4393
6bda09f9 4394 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4395 Newxz(RExC_open_parens, RExC_npar,regnode *);
4396 SAVEFREEPV(RExC_open_parens);
4397 Newxz(RExC_close_parens,RExC_npar,regnode *);
4398 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4399 }
4400
4401 /* Useful during FAIL. */
7122b237
YO
4402#ifdef RE_TRACK_PATTERN_OFFSETS
4403 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4404 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4405 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4406 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4407 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4408#endif
4409 SetProgLen(ri,RExC_size);
288b8c02 4410 RExC_rx_sv = rx;
830247a4 4411 RExC_rx = r;
f8fc2ecf 4412 RExC_rxi = ri;
bbce6d69 4413
4414 /* Second pass: emit code. */
c737faaf 4415 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4416 RExC_parse = exp;
4417 RExC_end = xend;
4418 RExC_naughty = 0;
4419 RExC_npar = 1;
f8fc2ecf
YO
4420 RExC_emit_start = ri->program;
4421 RExC_emit = ri->program;
3b57cd43
YO
4422 RExC_emit_bound = ri->program + RExC_size + 1;
4423
2cd61cdb 4424 /* Store the count of eval-groups for security checks: */
f8149455 4425 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4426 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4427 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4428 ReREFCNT_dec(rx);
a0d0e21e 4429 return(NULL);
80757612 4430 }
07be1b83
YO
4431 /* XXXX To minimize changes to RE engine we always allocate
4432 3-units-long substrs field. */
4433 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4434 if (RExC_recurse_count) {
4435 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4436 SAVEFREEPV(RExC_recurse);
4437 }
a0d0e21e 4438
07be1b83 4439reStudy:
1de06328 4440 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4441 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4442
07be1b83 4443#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4444 if (!restudied) {
4445 StructCopy(&zero_scan_data, &data, scan_data_t);
4446 copyRExC_state = RExC_state;
4447 } else {
5d458dd8 4448 U32 seen=RExC_seen;
07be1b83 4449 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4450
4451 RExC_state = copyRExC_state;
4452 if (seen & REG_TOP_LEVEL_BRANCHES)
4453 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4454 else
4455 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4456 if (data.last_found) {
07be1b83 4457 SvREFCNT_dec(data.longest_fixed);
07be1b83 4458 SvREFCNT_dec(data.longest_float);
07be1b83 4459 SvREFCNT_dec(data.last_found);
1de06328 4460 }
40d049e4 4461 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4462 }
40d049e4
YO
4463#else
4464 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4465#endif
fc8cd66c 4466
a0d0e21e 4467 /* Dig out information for optimizations. */
f7819f85 4468 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4469 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4470
a0ed51b3 4471 if (UTF)
8f6ae13c 4472 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4473 ri->regstclass = NULL;
830247a4 4474 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4475 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4476 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4477
1de06328
YO
4478 /* testing for BRANCH here tells us whether there is "must appear"
4479 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4480 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4481 I32 fake;
c5254dd6 4482 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4483 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4484 int stclass_flag;
07be1b83 4485 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4486 regnode *first= scan;
4487 regnode *first_next= regnext(first);
4488
639081d6
YO
4489 /*
4490 * Skip introductions and multiplicators >= 1
4491 * so that we can extract the 'meat' of the pattern that must
4492 * match in the large if() sequence following.
4493 * NOTE that EXACT is NOT covered here, as it is normally
4494 * picked up by the optimiser separately.
4495 *
4496 * This is unfortunate as the optimiser isnt handling lookahead
4497 * properly currently.
4498 *
4499 */
a0d0e21e 4500 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4501 /* An OR of *one* alternative - should not happen now. */
5339e136 4502 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4503 /* for now we can't handle lookbehind IFMATCH*/
4504 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4505 (OP(first) == PLUS) ||
4506 (OP(first) == MINMOD) ||
653099ff 4507 /* An {n,m} with n>0 */
5339e136
YO
4508 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4509 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4510 {
639081d6
YO
4511 /*
4512 * the only op that could be a regnode is PLUS, all the rest
4513 * will be regnode_1 or regnode_2.
4514 *
4515 */
a0d0e21e
LW
4516 if (OP(first) == PLUS)
4517 sawplus = 1;
4518 else
3dab1dad 4519 first += regarglen[OP(first)];
639081d6
YO
4520
4521 first = NEXTOPER(first);
5339e136 4522 first_next= regnext(first);
a687059c
LW
4523 }
4524
a0d0e21e
LW
4525 /* Starting-point info. */
4526 again:
786e8c11 4527 DEBUG_PEEP("first:",first,0);
07be1b83 4528 /* Ignore EXACT as we deal with it later. */
3dab1dad 4529 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4530 if (OP(first) == EXACT)
6f207bd3 4531 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4532 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4533 ri->regstclass = first;
b3c9acc1 4534 }
07be1b83 4535#ifdef TRIE_STCLASS
786e8c11 4536 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4537 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4538 {
786e8c11 4539 regnode *trie_op;
07be1b83 4540 /* this can happen only on restudy */
786e8c11 4541 if ( OP(first) == TRIE ) {
c944940b 4542 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4543 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4544 StructCopy(first,trieop,struct regnode_1);
4545 trie_op=(regnode *)trieop;
4546 } else {
c944940b 4547 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4548 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4549 StructCopy(first,trieop,struct regnode_charclass);
4550 trie_op=(regnode *)trieop;
4551 }
1de06328 4552 OP(trie_op)+=2;
786e8c11 4553 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4554 ri->regstclass = trie_op;
07be1b83
YO
4555 }
4556#endif
bfed75c6 4557 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4558 ri->regstclass = first;
3dab1dad
YO
4559 else if (PL_regkind[OP(first)] == BOUND ||
4560 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4561 ri->regstclass = first;
3dab1dad 4562 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4563 r->extflags |= (OP(first) == MBOL
4564 ? RXf_ANCH_MBOL
cad2e5aa 4565 : (OP(first) == SBOL
bbe252da
YO
4566 ? RXf_ANCH_SBOL
4567 : RXf_ANCH_BOL));
a0d0e21e 4568 first = NEXTOPER(first);
774d564b 4569 goto again;
4570 }
4571 else if (OP(first) == GPOS) {
bbe252da 4572 r->extflags |= RXf_ANCH_GPOS;
774d564b 4573 first = NEXTOPER(first);
4574 goto again;
a0d0e21e 4575 }
cf2a2b69
YO
4576 else if ((!sawopen || !RExC_sawback) &&
4577 (OP(first) == STAR &&
3dab1dad 4578 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4579 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4580 {
4581 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4582 const int type =
4583 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4584 ? RXf_ANCH_MBOL
4585 : RXf_ANCH_SBOL;
4586 r->extflags |= type;
4587 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4588 first = NEXTOPER(first);
774d564b 4589 goto again;
a0d0e21e 4590 }
b81d288d 4591 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4592 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4593 /* x+ must match at the 1st pos of run of x's */
bbe252da 4594 r->intflags |= PREGf_SKIP;
a0d0e21e 4595
c277df42 4596 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4597#ifdef TRIE_STUDY_OPT
81714fb9 4598 DEBUG_PARSE_r(
be8e71aa
YO
4599 if (!restudied)
4600 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4601 (IV)(first - scan + 1))
4602 );
4603#else
81714fb9 4604 DEBUG_PARSE_r(
be8e71aa
YO
4605 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4606 (IV)(first - scan + 1))
4607 );
4608#endif
4609
4610
a0d0e21e
LW
4611 /*
4612 * If there's something expensive in the r.e., find the
4613 * longest literal string that must appear and make it the
4614 * regmust. Resolve ties in favor of later strings, since
4615 * the regstart check works with the beginning of the r.e.
4616 * and avoiding duplication strengthens checking. Not a
4617 * strong reason, but sufficient in the absence of others.
4618 * [Now we resolve ties in favor of the earlier string if
c277df42 4619 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4620 * earlier string may buy us something the later one won't.]
4621 */
de8c5301 4622
396482e1
GA
4623 data.longest_fixed = newSVpvs("");
4624 data.longest_float = newSVpvs("");
4625 data.last_found = newSVpvs("");
c277df42
IZ
4626 data.longest = &(data.longest_fixed);
4627 first = scan;
f8fc2ecf 4628 if (!ri->regstclass) {
830247a4 4629 cl_init(pRExC_state, &ch_class);
653099ff
GS
4630 data.start_class = &ch_class;
4631 stclass_flag = SCF_DO_STCLASS_AND;
4632 } else /* XXXX Check for BOUND? */
4633 stclass_flag = 0;
cb434fcc 4634 data.last_closep = &last_close;
de8c5301 4635
1de06328 4636 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4637 &data, -1, NULL, NULL,
4638 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4639
07be1b83 4640
786e8c11
YO
4641 CHECK_RESTUDY_GOTO;
4642
4643
830247a4 4644 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4645 && data.last_start_min == 0 && data.last_end > 0
830247a4 4646 && !RExC_seen_zerolen
2bf803e2 4647 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4648 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4649 r->extflags |= RXf_CHECK_ALL;
304ee84b 4650 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4651 SvREFCNT_dec(data.last_found);
4652
1de06328
YO
4653 /* Note that code very similar to this but for anchored string
4654 follows immediately below, changes may need to be made to both.
4655 Be careful.
4656 */
a0ed51b3 4657 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4658 if (longest_float_length
c277df42
IZ
4659 || (data.flags & SF_FL_BEFORE_EOL
4660 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4661 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4662 {
1182767e 4663 I32 t,ml;
cf93c79d 4664
1de06328 4665 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4666 && data.offset_fixed == data.offset_float_min
4667 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4668 goto remove_float; /* As in (a)+. */
4669
1de06328
YO
4670 /* copy the information about the longest float from the reg_scan_data
4671 over to the program. */
33b8afdf
JH
4672 if (SvUTF8(data.longest_float)) {
4673 r->float_utf8 = data.longest_float;
c445ea15 4674 r->float_substr = NULL;
33b8afdf
JH
4675 } else {
4676 r->float_substr = data.longest_float;
c445ea15 4677 r->float_utf8 = NULL;
33b8afdf 4678 }
1de06328
YO
4679 /* float_end_shift is how many chars that must be matched that
4680 follow this item. We calculate it ahead of time as once the
4681 lookbehind offset is added in we lose the ability to correctly
4682 calculate it.*/
4683 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4684 : (I32)longest_float_length;
1de06328
YO
4685 r->float_end_shift = ml - data.offset_float_min
4686 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4687 + data.lookbehind_float;
4688 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4689 r->float_max_offset = data.offset_float_max;
1182767e 4690 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4691 r->float_max_offset -= data.lookbehind_float;
4692
cf93c79d
IZ
4693 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4694 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4695 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4696 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4697 }
4698 else {
aca2d497 4699 remove_float:
c445ea15 4700 r->float_substr = r->float_utf8 = NULL;
c277df42 4701 SvREFCNT_dec(data.longest_float);
c5254dd6 4702 longest_float_length = 0;
a0d0e21e 4703 }
c277df42 4704
1de06328
YO
4705 /* Note that code very similar to this but for floating string
4706 is immediately above, changes may need to be made to both.
4707 Be careful.
4708 */
a0ed51b3 4709 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4710 if (longest_fixed_length
c277df42
IZ
4711 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4712 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4713 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4714 {
1182767e 4715 I32 t,ml;
cf93c79d 4716
1de06328
YO
4717 /* copy the information about the longest fixed
4718 from the reg_scan_data over to the program. */
33b8afdf
JH
4719 if (SvUTF8(data.longest_fixed)) {
4720 r->anchored_utf8 = data.longest_fixed;
c445ea15 4721 r->anchored_substr = NULL;
33b8afdf
JH
4722 } else {
4723 r->anchored_substr = data.longest_fixed;
c445ea15 4724 r->anchored_utf8 = NULL;
33b8afdf 4725 }
1de06328
YO
4726 /* fixed_end_shift is how many chars that must be matched that
4727 follow this item. We calculate it ahead of time as once the
4728 lookbehind offset is added in we lose the ability to correctly
4729 calculate it.*/
4730 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4731 : (I32)longest_fixed_length;
1de06328
YO
4732 r->anchored_end_shift = ml - data.offset_fixed
4733 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4734 + data.lookbehind_fixed;
4735 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4736
cf93c79d
IZ
4737 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4738 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4739 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4740 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4741 }
4742 else {
c445ea15 4743 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4744 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4745 longest_fixed_length = 0;
a0d0e21e 4746 }
f8fc2ecf
YO
4747 if (ri->regstclass
4748 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4749 ri->regstclass = NULL;
33b8afdf
JH
4750 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4751 && stclass_flag
653099ff 4752 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4753 && !cl_is_anything(data.start_class))
4754 {
2eccd3b2 4755 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4756
f8fc2ecf 4757 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4758 struct regnode_charclass_class);
4759 StructCopy(data.start_class,
f8fc2ecf 4760 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4761 struct regnode_charclass_class);
f8fc2ecf 4762 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4763 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4764 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4765 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4766 PerlIO_printf(Perl_debug_log,
a0288114 4767 "synthetic stclass \"%s\".\n",
3f7c398e 4768 SvPVX_const(sv));});
653099ff 4769 }
c277df42
IZ
4770
4771 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4772 if (longest_fixed_length > longest_float_length) {
1de06328 4773 r->check_end_shift = r->anchored_end_shift;
c277df42 4774 r->check_substr = r->anchored_substr;
33b8afdf 4775 r->check_utf8 = r->anchored_utf8;
c277df42 4776 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4777 if (r->extflags & RXf_ANCH_SINGLE)
4778 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4779 }
4780 else {
1de06328 4781 r->check_end_shift = r->float_end_shift;
c277df42 4782 r->check_substr = r->float_substr;
33b8afdf 4783 r->check_utf8 = r->float_utf8;
1de06328
YO
4784 r->check_offset_min = r->float_min_offset;
4785 r->check_offset_max = r->float_max_offset;
a0d0e21e 4786 }
30382c73
IZ
4787 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4788 This should be changed ASAP! */
bbe252da
YO
4789 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4790 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4791 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4792 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4793 }
1de06328
YO
4794 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4795 if ( (STRLEN)minlen < longest_float_length )
4796 minlen= longest_float_length;
4797 if ( (STRLEN)minlen < longest_fixed_length )
4798 minlen= longest_fixed_length;
4799 */
a0ed51b3
LW
4800 }
4801 else {
c277df42
IZ
4802 /* Several toplevels. Best we can is to set minlen. */
4803 I32 fake;
653099ff 4804 struct regnode_charclass_class ch_class;
cb434fcc 4805 I32 last_close = 0;
c277df42 4806
5d458dd8 4807 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4808
f8fc2ecf 4809 scan = ri->program + 1;
830247a4 4810 cl_init(pRExC_state, &ch_class);
653099ff 4811 data.start_class = &ch_class;
cb434fcc 4812 data.last_closep = &last_close;
07be1b83 4813
de8c5301 4814
1de06328 4815 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4816 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4817
786e8c11 4818 CHECK_RESTUDY_GOTO;
07be1b83 4819
33b8afdf 4820 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4821 = r->float_substr = r->float_utf8 = NULL;
653099ff 4822 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4823 && !cl_is_anything(data.start_class))
4824 {
2eccd3b2 4825 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4826
f8fc2ecf 4827 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4828 struct regnode_charclass_class);
4829 StructCopy(data.start_class,
f8fc2ecf 4830 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4831 struct regnode_charclass_class);
f8fc2ecf 4832 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4833 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4834 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4835 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4836 PerlIO_printf(Perl_debug_log,
a0288114 4837 "synthetic stclass \"%s\".\n",
3f7c398e 4838 SvPVX_const(sv));});
653099ff 4839 }
a0d0e21e
LW
4840 }
4841
1de06328
YO
4842 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4843 the "real" pattern. */
cf9788e3
RGS
4844 DEBUG_OPTIMISE_r({
4845 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4846 (IV)minlen, (IV)r->minlen);
cf9788e3 4847 });
de8c5301 4848 r->minlenret = minlen;
1de06328
YO
4849 if (r->minlen < minlen)
4850 r->minlen = minlen;
4851
b81d288d 4852 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4853 r->extflags |= RXf_GPOS_SEEN;
830247a4 4854 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4855 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4856 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4857 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4858 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4859 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4860 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4861 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4862 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4863 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4864 if (RExC_paren_names)
85fbaab2 4865 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4866 else
5daac39c 4867 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4868
7bd1e614 4869#ifdef STUPID_PATTERN_CHECKS
5509d87a 4870 if (RX_PRELEN(rx) == 0)
640f820d 4871 r->extflags |= RXf_NULL;
5509d87a 4872 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4873 /* XXX: this should happen BEFORE we compile */
4874 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4875 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4876 r->extflags |= RXf_WHITE;
5509d87a 4877 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4878 r->extflags |= RXf_START_ONLY;
f1b875a0 4879#else
5509d87a 4880 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4881 /* XXX: this should happen BEFORE we compile */
4882 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4883 else {
4884 regnode *first = ri->program + 1;
39aa8307
JH
4885 U8 fop = OP(first);
4886 U8 nop = OP(NEXTOPER(first));
7bd1e614 4887
640f820d
AB
4888 if (PL_regkind[fop] == NOTHING && nop == END)
4889 r->extflags |= RXf_NULL;
4890 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4891 r->extflags |= RXf_START_ONLY;
4892 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4893 r->extflags |= RXf_WHITE;
4894 }
f1b875a0 4895#endif
1f1031fe
YO
4896#ifdef DEBUGGING
4897 if (RExC_paren_names) {
4898 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4899 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4900 } else
1f1031fe 4901#endif
cde0cee5 4902 ri->name_list_idx = 0;
1f1031fe 4903
40d049e4
YO
4904 if (RExC_recurse_count) {
4905 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4906 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4907 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4908 }
4909 }
f0ab9afb 4910 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4911 /* assume we don't need to swap parens around before we match */
4912
be8e71aa
YO
4913 DEBUG_DUMP_r({
4914 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4915 regdump(r);
4916 });
7122b237
YO
4917#ifdef RE_TRACK_PATTERN_OFFSETS
4918 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4919 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4920 U32 i;
4921 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4922 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4923 for (i = 1; i <= len; i++) {
7122b237 4924 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4925 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4926 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4927 }
4928 PerlIO_printf(Perl_debug_log, "\n");
4929 });
7122b237 4930#endif
288b8c02 4931 return rx;
a687059c
LW
4932}
4933
f9f4320a 4934#undef RE_ENGINE_PTR
3dab1dad 4935
93b32b6d 4936
81714fb9 4937SV*
192b9cd1
AB
4938Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4939 const U32 flags)
4940{
7918f24d
NC
4941 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4942
192b9cd1
AB
4943 PERL_UNUSED_ARG(value);
4944
f1b875a0 4945 if (flags & RXapif_FETCH) {
192b9cd1 4946 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4947 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
f1f66076 4948 Perl_croak(aTHX_ "%s", PL_no_modify);
192b9cd1 4949 return NULL;
f1b875a0 4950 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
4951 return reg_named_buff_exists(rx, key, flags)
4952 ? &PL_sv_yes
4953 : &PL_sv_no;
f1b875a0 4954 } else if (flags & RXapif_REGNAMES) {
192b9cd1 4955 return reg_named_buff_all(rx, flags);
f1b875a0 4956 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
4957 return reg_named_buff_scalar(rx, flags);
4958 } else {
4959 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4960 return NULL;
4961 }
4962}
4963
4964SV*
4965Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4966 const U32 flags)
4967{
7918f24d 4968 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
4969 PERL_UNUSED_ARG(lastkey);
4970
f1b875a0 4971 if (flags & RXapif_FIRSTKEY)
192b9cd1 4972 return reg_named_buff_firstkey(rx, flags);
f1b875a0 4973 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
4974 return reg_named_buff_nextkey(rx, flags);
4975 else {
4976 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4977 return NULL;
4978 }
4979}
4980
4981SV*
288b8c02
NC
4982Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4983 const U32 flags)
81714fb9 4984{
44a2ac75
YO
4985 AV *retarray = NULL;
4986 SV *ret;
288b8c02 4987 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
4988
4989 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4990
f1b875a0 4991 if (flags & RXapif_ALL)
44a2ac75 4992 retarray=newAV();
93b32b6d 4993
5daac39c
NC
4994 if (rx && RXp_PAREN_NAMES(rx)) {
4995 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
4996 if (he_str) {
4997 IV i;
4998 SV* sv_dat=HeVAL(he_str);
4999 I32 *nums=(I32*)SvPVX(sv_dat);
5000 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5001 if ((I32)(rx->nparens) >= nums[i]
5002 && rx->offs[nums[i]].start != -1
5003 && rx->offs[nums[i]].end != -1)
93b32b6d 5004 {
49d7dfbc 5005 ret = newSVpvs("");
288b8c02 5006 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5007 if (!retarray)
5008 return ret;
5009 } else {
5010 ret = newSVsv(&PL_sv_undef);
5011 }
ec83ea38 5012 if (retarray)
93b32b6d 5013 av_push(retarray, ret);
81714fb9 5014 }
93b32b6d 5015 if (retarray)
ad64d0ec 5016 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5017 }
5018 }
5019 return NULL;
5020}
5021
5022bool
288b8c02 5023Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5024 const U32 flags)
5025{
288b8c02 5026 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5027
5028 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5029
5daac39c 5030 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5031 if (flags & RXapif_ALL) {
5daac39c 5032 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5033 } else {
288b8c02 5034 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5035 if (sv) {
5036 SvREFCNT_dec(sv);
192b9cd1
AB
5037 return TRUE;
5038 } else {
5039 return FALSE;
5040 }
5041 }
5042 } else {
5043 return FALSE;
5044 }
5045}
5046
5047SV*
288b8c02 5048Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5049{
288b8c02 5050 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5051
5052 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5053
5daac39c
NC
5054 if ( rx && RXp_PAREN_NAMES(rx) ) {
5055 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5056
288b8c02 5057 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5058 } else {
5059 return FALSE;
5060 }
192b9cd1
AB
5061}
5062
5063SV*
288b8c02 5064Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5065{
288b8c02 5066 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5067 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5068
5069 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5070
5daac39c
NC
5071 if (rx && RXp_PAREN_NAMES(rx)) {
5072 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5073 HE *temphe;
5074 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5075 IV i;
5076 IV parno = 0;
5077 SV* sv_dat = HeVAL(temphe);
5078 I32 *nums = (I32*)SvPVX(sv_dat);
5079 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5080 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5081 rx->offs[nums[i]].start != -1 &&
5082 rx->offs[nums[i]].end != -1)
5083 {
5084 parno = nums[i];
5085 break;
5086 }
5087 }
f1b875a0 5088 if (parno || flags & RXapif_ALL) {
a663657d 5089 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5090 }
81714fb9
YO
5091 }
5092 }
44a2ac75
YO
5093 return NULL;
5094}
5095
192b9cd1 5096SV*
288b8c02 5097Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5098{
5099 SV *ret;
5100 AV *av;
5101 I32 length;
288b8c02 5102 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5103
7918f24d
NC
5104 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5105
5daac39c 5106 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5107 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5108 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5109 } else if (flags & RXapif_ONE) {
288b8c02 5110 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5111 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5112 length = av_len(av);
ec83ea38 5113 SvREFCNT_dec(ret);
192b9cd1
AB
5114 return newSViv(length + 1);
5115 } else {
5116 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5117 return NULL;
5118 }
5119 }
5120 return &PL_sv_undef;
5121}
5122
5123SV*
288b8c02 5124Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5125{
288b8c02 5126 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5127 AV *av = newAV();
5128
7918f24d
NC
5129 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5130
5daac39c
NC
5131 if (rx && RXp_PAREN_NAMES(rx)) {
5132 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5133 HE *temphe;
5134 (void)hv_iterinit(hv);
5135 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5136 IV i;
5137 IV parno = 0;
5138 SV* sv_dat = HeVAL(temphe);
5139 I32 *nums = (I32*)SvPVX(sv_dat);
5140 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5141 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5142 rx->offs[nums[i]].start != -1 &&
5143 rx->offs[nums[i]].end != -1)
5144 {
5145 parno = nums[i];
5146 break;
5147 }
5148 }
f1b875a0 5149 if (parno || flags & RXapif_ALL) {
a663657d 5150 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5151 }
5152 }
5153 }
5154
ad64d0ec 5155 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5156}
5157
49d7dfbc 5158void
288b8c02
NC
5159Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5160 SV * const sv)
44a2ac75 5161{
288b8c02 5162 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5163 char *s = NULL;
a9d504c3 5164 I32 i = 0;
44a2ac75 5165 I32 s1, t1;
7918f24d
NC
5166
5167 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5168
cde0cee5
YO
5169 if (!rx->subbeg) {
5170 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5171 return;
cde0cee5
YO
5172 }
5173 else
f1b875a0 5174 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5175 /* $` */
f0ab9afb 5176 i = rx->offs[0].start;
cde0cee5 5177 s = rx->subbeg;
44a2ac75
YO
5178 }
5179 else
f1b875a0 5180 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5181 /* $' */
f0ab9afb
NC
5182 s = rx->subbeg + rx->offs[0].end;
5183 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5184 }
5185 else
5186 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5187 (s1 = rx->offs[paren].start) != -1 &&
5188 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5189 {
5190 /* $& $1 ... */
5191 i = t1 - s1;
5192 s = rx->subbeg + s1;
cde0cee5
YO
5193 } else {
5194 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5195 return;
cde0cee5
YO
5196 }
5197 assert(rx->sublen >= (s - rx->subbeg) + i );
5198 if (i >= 0) {
5199 const int oldtainted = PL_tainted;
5200 TAINT_NOT;
5201 sv_setpvn(sv, s, i);
5202 PL_tainted = oldtainted;
5203 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5204 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5205 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5206 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5207 {
5208 SvUTF8_on(sv);
5209 }
5210 else
5211 SvUTF8_off(sv);
5212 if (PL_tainting) {
07bc277f 5213 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5214 if (SvTYPE(sv) >= SVt_PVMG) {
5215 MAGIC* const mg = SvMAGIC(sv);
5216 MAGIC* mgt;
5217 PL_tainted = 1;
5218 SvMAGIC_set(sv, mg->mg_moremagic);
5219 SvTAINT(sv);
5220 if ((mgt = SvMAGIC(sv))) {
5221 mg->mg_moremagic = mgt;
5222 SvMAGIC_set(sv, mg);
44a2ac75 5223 }
cde0cee5
YO
5224 } else {
5225 PL_tainted = 1;
5226 SvTAINT(sv);
5227 }
5228 } else
5229 SvTAINTED_off(sv);
44a2ac75 5230 }
81714fb9 5231 } else {
44a2ac75 5232 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5233 return;
81714fb9
YO
5234 }
5235}
93b32b6d 5236
2fdbfb4d
AB
5237void
5238Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5239 SV const * const value)
5240{
7918f24d
NC
5241 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5242
2fdbfb4d
AB
5243 PERL_UNUSED_ARG(rx);
5244 PERL_UNUSED_ARG(paren);
5245 PERL_UNUSED_ARG(value);
5246
5247 if (!PL_localizing)
f1f66076 5248 Perl_croak(aTHX_ "%s", PL_no_modify);
2fdbfb4d
AB
5249}
5250
5251I32
288b8c02 5252Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5253 const I32 paren)
5254{
288b8c02 5255 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5256 I32 i;
5257 I32 s1, t1;
5258
7918f24d
NC
5259 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5260
2fdbfb4d
AB
5261 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5262 switch (paren) {
192b9cd1 5263 /* $` / ${^PREMATCH} */
f1b875a0 5264 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5265 if (rx->offs[0].start != -1) {
5266 i = rx->offs[0].start;
5267 if (i > 0) {
5268 s1 = 0;
5269 t1 = i;
5270 goto getlen;
5271 }
5272 }
5273 return 0;
192b9cd1 5274 /* $' / ${^POSTMATCH} */
f1b875a0 5275 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5276 if (rx->offs[0].end != -1) {
5277 i = rx->sublen - rx->offs[0].end;
5278 if (i > 0) {
5279 s1 = rx->offs[0].end;
5280 t1 = rx->sublen;
5281 goto getlen;
5282 }
5283 }
5284 return 0;
192b9cd1
AB
5285 /* $& / ${^MATCH}, $1, $2, ... */
5286 default:
2fdbfb4d
AB
5287 if (paren <= (I32)rx->nparens &&
5288 (s1 = rx->offs[paren].start) != -1 &&
5289 (t1 = rx->offs[paren].end) != -1)
5290 {
5291 i = t1 - s1;
5292 goto getlen;
5293 } else {
5294 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5295 report_uninit((const SV *)sv);
2fdbfb4d
AB
5296 return 0;
5297 }
5298 }
5299 getlen:
07bc277f 5300 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5301 const char * const s = rx->subbeg + s1;
5302 const U8 *ep;
5303 STRLEN el;
5304
5305 i = t1 - s1;
5306 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5307 i = el;
5308 }
5309 return i;
5310}
5311
fe578d7f 5312SV*
49d7dfbc 5313Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5314{
7918f24d 5315 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5316 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5317 if (0)
5318 return NULL;
5319 else
5320 return newSVpvs("Regexp");
fe578d7f 5321}
0a4db386 5322
894be9b7 5323/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5324 * If flags is REG_RSN_RETURN_NULL returns null.
5325 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5326 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5327 * to the parsed name as looked up in the RExC_paren_names hash.
5328 * If there is an error throws a vFAIL().. type exception.
894be9b7 5329 */
0a4db386
YO
5330
5331#define REG_RSN_RETURN_NULL 0
5332#define REG_RSN_RETURN_NAME 1
5333#define REG_RSN_RETURN_DATA 2
5334
894be9b7 5335STATIC SV*
7918f24d
NC
5336S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5337{
894be9b7 5338 char *name_start = RExC_parse;
1f1031fe 5339
7918f24d
NC
5340 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5341
1f1031fe
YO
5342 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5343 /* skip IDFIRST by using do...while */
5344 if (UTF)
5345 do {
5346 RExC_parse += UTF8SKIP(RExC_parse);
5347 } while (isALNUM_utf8((U8*)RExC_parse));
5348 else
5349 do {
5350 RExC_parse++;
5351 } while (isALNUM(*RExC_parse));
894be9b7 5352 }
1f1031fe 5353
0a4db386 5354 if ( flags ) {
59cd0e26
NC
5355 SV* sv_name
5356 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5357 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5358 if ( flags == REG_RSN_RETURN_NAME)
5359 return sv_name;
5360 else if (flags==REG_RSN_RETURN_DATA) {
5361 HE *he_str = NULL;
5362 SV *sv_dat = NULL;
5363 if ( ! sv_name ) /* should not happen*/
5364 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5365 if (RExC_paren_names)
5366 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5367 if ( he_str )
5368 sv_dat = HeVAL(he_str);
5369 if ( ! sv_dat )
5370 vFAIL("Reference to nonexistent named group");
5371 return sv_dat;
5372 }
5373 else {
5374 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5375 }
5376 /* NOT REACHED */
894be9b7 5377 }
0a4db386 5378 return NULL;
894be9b7
YO
5379}
5380
3dab1dad
YO
5381#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5382 int rem=(int)(RExC_end - RExC_parse); \
5383 int cut; \
5384 int num; \
5385 int iscut=0; \
5386 if (rem>10) { \
5387 rem=10; \
5388 iscut=1; \
5389 } \
5390 cut=10-rem; \
5391 if (RExC_lastparse!=RExC_parse) \
5392 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5393 rem, RExC_parse, \
5394 cut + 4, \
5395 iscut ? "..." : "<" \
5396 ); \
5397 else \
5398 PerlIO_printf(Perl_debug_log,"%16s",""); \
5399 \
5400 if (SIZE_ONLY) \
3b57cd43 5401 num = RExC_size + 1; \
3dab1dad
YO
5402 else \
5403 num=REG_NODE_NUM(RExC_emit); \
5404 if (RExC_lastnum!=num) \
0a4db386 5405 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5406 else \
0a4db386 5407 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5408 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5409 (int)((depth*2)), "", \
3dab1dad
YO
5410 (funcname) \
5411 ); \
5412 RExC_lastnum=num; \
5413 RExC_lastparse=RExC_parse; \
5414})
5415
07be1b83
YO
5416
5417
3dab1dad
YO
5418#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5419 DEBUG_PARSE_MSG((funcname)); \
5420 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5421})
6bda09f9
YO
5422#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5423 DEBUG_PARSE_MSG((funcname)); \
5424 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5425})
a687059c
LW
5426/*
5427 - reg - regular expression, i.e. main body or parenthesized thing
5428 *
5429 * Caller must absorb opening parenthesis.
5430 *
5431 * Combining parenthesis handling with the base level of regular expression
5432 * is a trifle forced, but the need to tie the tails of the branches to what
5433 * follows makes it hard to avoid.
5434 */
07be1b83
YO
5435#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5436#ifdef DEBUGGING
5437#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5438#else
5439#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5440#endif
3dab1dad 5441
76e3520e 5442STATIC regnode *
3dab1dad 5443S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5444 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5445{
27da23d5 5446 dVAR;
c277df42
IZ
5447 register regnode *ret; /* Will be the head of the group. */
5448 register regnode *br;
5449 register regnode *lastbr;
cbbf8932 5450 register regnode *ender = NULL;
a0d0e21e 5451 register I32 parno = 0;
cbbf8932 5452 I32 flags;
f7819f85 5453 U32 oregflags = RExC_flags;
6136c704
AL
5454 bool have_branch = 0;
5455 bool is_open = 0;
594d7033
YO
5456 I32 freeze_paren = 0;
5457 I32 after_freeze = 0;
9d1d55b5
JP
5458
5459 /* for (?g), (?gc), and (?o) warnings; warning
5460 about (?c) will warn about (?g) -- japhy */
5461
6136c704
AL
5462#define WASTED_O 0x01
5463#define WASTED_G 0x02
5464#define WASTED_C 0x04
5465#define WASTED_GC (0x02|0x04)
cbbf8932 5466 I32 wastedflags = 0x00;
9d1d55b5 5467
fac92740 5468 char * parse_start = RExC_parse; /* MJD */
a28509cc 5469 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5470
3dab1dad 5471 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5472
5473 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5474 DEBUG_PARSE("reg ");
5475
821b33a5 5476 *flagp = 0; /* Tentatively. */
a0d0e21e 5477
9d1d55b5 5478
a0d0e21e
LW
5479 /* Make an OPEN node, if parenthesized. */
5480 if (paren) {
e2e6a0f1
YO
5481 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5482 char *start_verb = RExC_parse;
5483 STRLEN verb_len = 0;
5484 char *start_arg = NULL;
5485 unsigned char op = 0;
5486 int argok = 1;
5487 int internal_argval = 0; /* internal_argval is only useful if !argok */
5488 while ( *RExC_parse && *RExC_parse != ')' ) {
5489 if ( *RExC_parse == ':' ) {
5490 start_arg = RExC_parse + 1;
5491 break;
5492 }
5493 RExC_parse++;
5494 }
5495 ++start_verb;
5496 verb_len = RExC_parse - start_verb;
5497 if ( start_arg ) {
5498 RExC_parse++;
5499 while ( *RExC_parse && *RExC_parse != ')' )
5500 RExC_parse++;
5501 if ( *RExC_parse != ')' )
5502 vFAIL("Unterminated verb pattern argument");
5503 if ( RExC_parse == start_arg )
5504 start_arg = NULL;
5505 } else {
5506 if ( *RExC_parse != ')' )
5507 vFAIL("Unterminated verb pattern");
5508 }
5d458dd8 5509
e2e6a0f1
YO
5510 switch ( *start_verb ) {
5511 case 'A': /* (*ACCEPT) */
568a785a 5512 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5513 op = ACCEPT;
5514 internal_argval = RExC_nestroot;
5515 }
5516 break;
5517 case 'C': /* (*COMMIT) */
568a785a 5518 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5519 op = COMMIT;
e2e6a0f1
YO
5520 break;
5521 case 'F': /* (*FAIL) */
568a785a 5522 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5523 op = OPFAIL;
5524 argok = 0;
5525 }
5526 break;
5d458dd8
YO
5527 case ':': /* (*:NAME) */
5528 case 'M': /* (*MARK:NAME) */
568a785a 5529 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5530 op = MARKPOINT;
5d458dd8
YO
5531 argok = -1;
5532 }
5533 break;
5534 case 'P': /* (*PRUNE) */
568a785a 5535 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5536 op = PRUNE;
e2e6a0f1 5537 break;
5d458dd8 5538 case 'S': /* (*SKIP) */
568a785a 5539 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5540 op = SKIP;
5541 break;
5542 case 'T': /* (*THEN) */
5543 /* [19:06] <TimToady> :: is then */
568a785a 5544 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5545 op = CUTGROUP;
5546 RExC_seen |= REG_SEEN_CUTGROUP;
5547 }
e2e6a0f1
YO
5548 break;
5549 }
5550 if ( ! op ) {
5551 RExC_parse++;
5552 vFAIL3("Unknown verb pattern '%.*s'",
5553 verb_len, start_verb);
5554 }
5555 if ( argok ) {
5556 if ( start_arg && internal_argval ) {
5557 vFAIL3("Verb pattern '%.*s' may not have an argument",
5558 verb_len, start_verb);
5559 } else if ( argok < 0 && !start_arg ) {
5560 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5561 verb_len, start_verb);
5562 } else {
5563 ret = reganode(pRExC_state, op, internal_argval);
5564 if ( ! internal_argval && ! SIZE_ONLY ) {
5565 if (start_arg) {
5566 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5567 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5568 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5569 ret->flags = 0;
5570 } else {
5571 ret->flags = 1;
5572 }
5573 }
5574 }
5575 if (!internal_argval)
5576 RExC_seen |= REG_SEEN_VERBARG;
5577 } else if ( start_arg ) {
5578 vFAIL3("Verb pattern '%.*s' may not have an argument",
5579 verb_len, start_verb);
5580 } else {
5581 ret = reg_node(pRExC_state, op);
5582 }
5583 nextchar(pRExC_state);
5584 return ret;
5585 } else
fac92740 5586 if (*RExC_parse == '?') { /* (?...) */
6136c704 5587 bool is_logical = 0;
a28509cc 5588 const char * const seqstart = RExC_parse;
ca9dfc88 5589
830247a4
IZ
5590 RExC_parse++;
5591 paren = *RExC_parse++;
c277df42 5592 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5593 switch (paren) {
894be9b7 5594
1f1031fe
YO
5595 case 'P': /* (?P...) variants for those used to PCRE/Python */
5596 paren = *RExC_parse++;
5597 if ( paren == '<') /* (?P<...>) named capture */
5598 goto named_capture;
5599 else if (paren == '>') { /* (?P>name) named recursion */
5600 goto named_recursion;
5601 }
5602 else if (paren == '=') { /* (?P=...) named backref */
5603 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5604 you change this make sure you change that */
5605 char* name_start = RExC_parse;
5606 U32 num = 0;
5607 SV *sv_dat = reg_scan_name(pRExC_state,
5608 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5609 if (RExC_parse == name_start || *RExC_parse != ')')
5610 vFAIL2("Sequence %.3s... not terminated",parse_start);
5611
5612 if (!SIZE_ONLY) {
5613 num = add_data( pRExC_state, 1, "S" );
5614 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5615 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5616 }
5617 RExC_sawback = 1;
5618 ret = reganode(pRExC_state,
5619 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5620 num);
5621 *flagp |= HASWIDTH;
5622
5623 Set_Node_Offset(ret, parse_start+1);
5624 Set_Node_Cur_Length(ret); /* MJD */
5625
5626 nextchar(pRExC_state);
5627 return ret;
5628 }
57b84237
YO
5629 RExC_parse++;
5630 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5631 /*NOTREACHED*/
5632 case '<': /* (?<...) */
b81d288d 5633 if (*RExC_parse == '!')
c277df42 5634 paren = ',';
0a4db386 5635 else if (*RExC_parse != '=')
1f1031fe 5636 named_capture:
0a4db386 5637 { /* (?<...>) */
81714fb9 5638 char *name_start;
894be9b7 5639 SV *svname;
81714fb9
YO
5640 paren= '>';
5641 case '\'': /* (?'...') */
5642 name_start= RExC_parse;
0a4db386
YO
5643 svname = reg_scan_name(pRExC_state,
5644 SIZE_ONLY ? /* reverse test from the others */
5645 REG_RSN_RETURN_NAME :
5646 REG_RSN_RETURN_NULL);
57b84237
YO
5647 if (RExC_parse == name_start) {
5648 RExC_parse++;
5649 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5650 /*NOTREACHED*/
5651 }
81714fb9
YO
5652 if (*RExC_parse != paren)
5653 vFAIL2("Sequence (?%c... not terminated",
5654 paren=='>' ? '<' : paren);
5655 if (SIZE_ONLY) {
e62cc96a
YO
5656 HE *he_str;
5657 SV *sv_dat = NULL;
894be9b7
YO
5658 if (!svname) /* shouldnt happen */
5659 Perl_croak(aTHX_
5660 "panic: reg_scan_name returned NULL");
81714fb9
YO
5661 if (!RExC_paren_names) {
5662 RExC_paren_names= newHV();
ad64d0ec 5663 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5664#ifdef DEBUGGING
5665 RExC_paren_name_list= newAV();
ad64d0ec 5666 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5667#endif
81714fb9
YO
5668 }
5669 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5670 if ( he_str )
81714fb9 5671 sv_dat = HeVAL(he_str);
e62cc96a 5672 if ( ! sv_dat ) {
81714fb9 5673 /* croak baby croak */
e62cc96a
YO
5674 Perl_croak(aTHX_
5675 "panic: paren_name hash element allocation failed");
5676 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5677 /* (?|...) can mean we have dupes so scan to check
5678 its already been stored. Maybe a flag indicating
5679 we are inside such a construct would be useful,
5680 but the arrays are likely to be quite small, so
5681 for now we punt -- dmq */
5682 IV count = SvIV(sv_dat);
5683 I32 *pv = (I32*)SvPVX(sv_dat);
5684 IV i;
5685 for ( i = 0 ; i < count ; i++ ) {
5686 if ( pv[i] == RExC_npar ) {
5687 count = 0;
5688 break;
5689 }
5690 }
5691 if ( count ) {
5692 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5693 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5694 pv[count] = RExC_npar;
3a92e6ae 5695 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5696 }
81714fb9
YO
5697 } else {
5698 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5699 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5700 SvIOK_on(sv_dat);
3ec35e0f 5701 SvIV_set(sv_dat, 1);
e62cc96a 5702 }
1f1031fe
YO
5703#ifdef DEBUGGING
5704 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5705 SvREFCNT_dec(svname);
5706#endif
e62cc96a 5707
81714fb9
YO
5708 /*sv_dump(sv_dat);*/
5709 }
5710 nextchar(pRExC_state);
5711 paren = 1;
5712 goto capturing_parens;
5713 }
5714 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5715 RExC_parse++;
fac92740 5716 case '=': /* (?=...) */
89c6a13e
AB
5717 RExC_seen_zerolen++;
5718 break;
fac92740 5719 case '!': /* (?!...) */
830247a4 5720 RExC_seen_zerolen++;
e2e6a0f1
YO
5721 if (*RExC_parse == ')') {
5722 ret=reg_node(pRExC_state, OPFAIL);
5723 nextchar(pRExC_state);
5724 return ret;
5725 }
594d7033
YO
5726 break;
5727 case '|': /* (?|...) */
5728 /* branch reset, behave like a (?:...) except that
5729 buffers in alternations share the same numbers */
5730 paren = ':';
5731 after_freeze = freeze_paren = RExC_npar;
5732 break;
fac92740
MJD
5733 case ':': /* (?:...) */
5734 case '>': /* (?>...) */
a0d0e21e 5735 break;
fac92740
MJD
5736 case '$': /* (?$...) */
5737 case '@': /* (?@...) */
8615cb43 5738 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5739 break;
fac92740 5740 case '#': /* (?#...) */
830247a4
IZ
5741 while (*RExC_parse && *RExC_parse != ')')
5742 RExC_parse++;
5743 if (*RExC_parse != ')')
c277df42 5744 FAIL("Sequence (?#... not terminated");
830247a4 5745 nextchar(pRExC_state);
a0d0e21e
LW
5746 *flagp = TRYAGAIN;
5747 return NULL;
894be9b7
YO
5748 case '0' : /* (?0) */
5749 case 'R' : /* (?R) */
5750 if (*RExC_parse != ')')
6bda09f9 5751 FAIL("Sequence (?R) not terminated");
1a147d38 5752 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5753 *flagp |= POSTPONED;
7f69552c
YO
5754 nextchar(pRExC_state);
5755 return ret;
5756 /*notreached*/
894be9b7
YO
5757 { /* named and numeric backreferences */
5758 I32 num;
894be9b7
YO
5759 case '&': /* (?&NAME) */
5760 parse_start = RExC_parse - 1;
1f1031fe 5761 named_recursion:
894be9b7 5762 {
0a4db386
YO
5763 SV *sv_dat = reg_scan_name(pRExC_state,
5764 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5765 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5766 }
5767 goto gen_recurse_regop;
5768 /* NOT REACHED */
542fa716
YO
5769 case '+':
5770 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5771 RExC_parse++;
5772 vFAIL("Illegal pattern");
5773 }
5774 goto parse_recursion;
5775 /* NOT REACHED*/
5776 case '-': /* (?-1) */
5777 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5778 RExC_parse--; /* rewind to let it be handled later */
5779 goto parse_flags;
5780 }
5781 /*FALLTHROUGH */
6bda09f9
YO
5782 case '1': case '2': case '3': case '4': /* (?1) */
5783 case '5': case '6': case '7': case '8': case '9':
5784 RExC_parse--;
542fa716 5785 parse_recursion:
894be9b7
YO
5786 num = atoi(RExC_parse);
5787 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5788 if (*RExC_parse == '-')
5789 RExC_parse++;
6bda09f9
YO
5790 while (isDIGIT(*RExC_parse))
5791 RExC_parse++;
5792 if (*RExC_parse!=')')
5793 vFAIL("Expecting close bracket");
894be9b7
YO
5794
5795 gen_recurse_regop:
542fa716
YO
5796 if ( paren == '-' ) {
5797 /*
5798 Diagram of capture buffer numbering.
5799 Top line is the normal capture buffer numbers
5800 Botton line is the negative indexing as from
5801 the X (the (?-2))
5802
5803 + 1 2 3 4 5 X 6 7
5804 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5805 - 5 4 3 2 1 X x x
5806
5807 */
5808 num = RExC_npar + num;
5809 if (num < 1) {
5810 RExC_parse++;
5811 vFAIL("Reference to nonexistent group");
5812 }
5813 } else if ( paren == '+' ) {
5814 num = RExC_npar + num - 1;
5815 }
5816
1a147d38 5817 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5818 if (!SIZE_ONLY) {
5819 if (num > (I32)RExC_rx->nparens) {
5820 RExC_parse++;
5821 vFAIL("Reference to nonexistent group");
5822 }
40d049e4 5823 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5824 RExC_emit++;
226de585 5825 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5826 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5827 } else {
6bda09f9 5828 RExC_size++;
6bda09f9 5829 }
0a4db386 5830 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5831 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5832 Set_Node_Offset(ret, parse_start); /* MJD */
5833
a3b492c3 5834 *flagp |= POSTPONED;
6bda09f9
YO
5835 nextchar(pRExC_state);
5836 return ret;
894be9b7
YO
5837 } /* named and numeric backreferences */
5838 /* NOT REACHED */
5839
fac92740 5840 case '?': /* (??...) */
6136c704 5841 is_logical = 1;
57b84237
YO
5842 if (*RExC_parse != '{') {
5843 RExC_parse++;
5844 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5845 /*NOTREACHED*/
5846 }
a3b492c3 5847 *flagp |= POSTPONED;
830247a4 5848 paren = *RExC_parse++;
0f5d15d6 5849 /* FALL THROUGH */
fac92740 5850 case '{': /* (?{...}) */
c277df42 5851 {
2eccd3b2
NC
5852 I32 count = 1;
5853 U32 n = 0;
c277df42 5854 char c;
830247a4 5855 char *s = RExC_parse;
c277df42 5856
830247a4
IZ
5857 RExC_seen_zerolen++;
5858 RExC_seen |= REG_SEEN_EVAL;
5859 while (count && (c = *RExC_parse)) {
6136c704
AL
5860 if (c == '\\') {
5861 if (RExC_parse[1])
5862 RExC_parse++;
5863 }
b81d288d 5864 else if (c == '{')
c277df42 5865 count++;
b81d288d 5866 else if (c == '}')
c277df42 5867 count--;
830247a4 5868 RExC_parse++;
c277df42 5869 }
6136c704 5870 if (*RExC_parse != ')') {
b81d288d 5871 RExC_parse = s;
b45f050a
JF
5872 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5873 }
c277df42 5874 if (!SIZE_ONLY) {
f3548bdc 5875 PAD *pad;
6136c704
AL
5876 OP_4tree *sop, *rop;
5877 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5878
569233ed
SB
5879 ENTER;
5880 Perl_save_re_context(aTHX);
f3548bdc 5881 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5882 sop->op_private |= OPpREFCOUNTED;
5883 /* re_dup will OpREFCNT_inc */
5884 OpREFCNT_set(sop, 1);
569233ed 5885 LEAVE;
c277df42 5886
830247a4 5887 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5888 RExC_rxi->data->data[n] = (void*)rop;
5889 RExC_rxi->data->data[n+1] = (void*)sop;
5890 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5891 SvREFCNT_dec(sv);
a0ed51b3 5892 }
e24b16f9 5893 else { /* First pass */
830247a4 5894 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5895 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5896 /* No compiled RE interpolated, has runtime
5897 components ===> unsafe. */
5898 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5899 if (PL_tainting && PL_tainted)
cc6b7395 5900 FAIL("Eval-group in insecure regular expression");
54df2634 5901#if PERL_VERSION > 8
923e4eb5 5902 if (IN_PERL_COMPILETIME)
b5c19bd7 5903 PL_cv_has_eval = 1;
54df2634 5904#endif
c277df42 5905 }
b5c19bd7 5906
830247a4 5907 nextchar(pRExC_state);
6136c704 5908 if (is_logical) {
830247a4 5909 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5910 if (!SIZE_ONLY)
5911 ret->flags = 2;
3dab1dad 5912 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5913 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5914 return ret;
5915 }
ccb2c380
MP
5916 ret = reganode(pRExC_state, EVAL, n);
5917 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5918 Set_Node_Offset(ret, parse_start);
5919 return ret;
c277df42 5920 }
fac92740 5921 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5922 {
0a4db386 5923 int is_define= 0;
fac92740 5924 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5925 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5926 || RExC_parse[1] == '<'
830247a4 5927 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5928 I32 flag;
5929
830247a4 5930 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5931 if (!SIZE_ONLY)
5932 ret->flags = 1;
3dab1dad 5933 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5934 goto insert_if;
b81d288d 5935 }
a0ed51b3 5936 }
0a4db386
YO
5937 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5938 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5939 {
5940 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5941 char *name_start= RExC_parse++;
2eccd3b2 5942 U32 num = 0;
0a4db386
YO
5943 SV *sv_dat=reg_scan_name(pRExC_state,
5944 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5945 if (RExC_parse == name_start || *RExC_parse != ch)
5946 vFAIL2("Sequence (?(%c... not terminated",
5947 (ch == '>' ? '<' : ch));
5948 RExC_parse++;
5949 if (!SIZE_ONLY) {
5950 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5951 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5952 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
5953 }
5954 ret = reganode(pRExC_state,NGROUPP,num);
5955 goto insert_if_check_paren;
5956 }
5957 else if (RExC_parse[0] == 'D' &&
5958 RExC_parse[1] == 'E' &&
5959 RExC_parse[2] == 'F' &&
5960 RExC_parse[3] == 'I' &&
5961 RExC_parse[4] == 'N' &&
5962 RExC_parse[5] == 'E')
5963 {
5964 ret = reganode(pRExC_state,DEFINEP,0);
5965 RExC_parse +=6 ;
5966 is_define = 1;
5967 goto insert_if_check_paren;
5968 }
5969 else if (RExC_parse[0] == 'R') {
5970 RExC_parse++;
5971 parno = 0;
5972 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5973 parno = atoi(RExC_parse++);
5974 while (isDIGIT(*RExC_parse))
5975 RExC_parse++;
5976 } else if (RExC_parse[0] == '&') {
5977 SV *sv_dat;
5978 RExC_parse++;
5979 sv_dat = reg_scan_name(pRExC_state,
5980 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5981 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5982 }
1a147d38 5983 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5984 goto insert_if_check_paren;
5985 }
830247a4 5986 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5987 /* (?(1)...) */
6136c704 5988 char c;
830247a4 5989 parno = atoi(RExC_parse++);
c277df42 5990
830247a4
IZ
5991 while (isDIGIT(*RExC_parse))
5992 RExC_parse++;
fac92740 5993 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5994
0a4db386 5995 insert_if_check_paren:
830247a4 5996 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5997 vFAIL("Switch condition not recognized");
c277df42 5998 insert_if:
3dab1dad
YO
5999 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6000 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6001 if (br == NULL)
830247a4 6002 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6003 else
3dab1dad 6004 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6005 c = *nextchar(pRExC_state);
d1b80229
IZ
6006 if (flags&HASWIDTH)
6007 *flagp |= HASWIDTH;
c277df42 6008 if (c == '|') {
0a4db386
YO
6009 if (is_define)
6010 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6011 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6012 regbranch(pRExC_state, &flags, 1,depth+1);
6013 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6014 if (flags&HASWIDTH)
6015 *flagp |= HASWIDTH;
830247a4 6016 c = *nextchar(pRExC_state);
a0ed51b3
LW
6017 }
6018 else
c277df42
IZ
6019 lastbr = NULL;
6020 if (c != ')')
8615cb43 6021 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6022 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6023 REGTAIL(pRExC_state, br, ender);
c277df42 6024 if (lastbr) {
3dab1dad
YO
6025 REGTAIL(pRExC_state, lastbr, ender);
6026 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6027 }
6028 else
3dab1dad 6029 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6030 RExC_size++; /* XXX WHY do we need this?!!
6031 For large programs it seems to be required
6032 but I can't figure out why. -- dmq*/
c277df42 6033 return ret;
a0ed51b3
LW
6034 }
6035 else {
830247a4 6036 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6037 }
6038 }
1b1626e4 6039 case 0:
830247a4 6040 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6041 vFAIL("Sequence (? incomplete");
1b1626e4 6042 break;
a0d0e21e 6043 default:
cde0cee5
YO
6044 --RExC_parse;
6045 parse_flags: /* (?i) */
6046 {
6047 U32 posflags = 0, negflags = 0;
6048 U32 *flagsp = &posflags;
6049
6050 while (*RExC_parse) {
6051 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6052 /* (?g), (?gc) and (?o) are useless here
6053 and must be globally applied -- japhy */
cde0cee5
YO
6054 switch (*RExC_parse) {
6055 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
6056 case ONCE_PAT_MOD: /* 'o' */
6057 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6058 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6059 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6060 if (! (wastedflags & wflagbit) ) {
6061 wastedflags |= wflagbit;
6062 vWARN5(
6063 RExC_parse + 1,
6064 "Useless (%s%c) - %suse /%c modifier",
6065 flagsp == &negflags ? "?-" : "?",
6066 *RExC_parse,
6067 flagsp == &negflags ? "don't " : "",
6068 *RExC_parse
6069 );
6070 }
6071 }
cde0cee5
YO
6072 break;
6073
f7819f85 6074 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6075 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6076 if (! (wastedflags & WASTED_C) ) {
6077 wastedflags |= WASTED_GC;
9d1d55b5
JP
6078 vWARN3(
6079 RExC_parse + 1,
6080 "Useless (%sc) - %suse /gc modifier",
6081 flagsp == &negflags ? "?-" : "?",
6082 flagsp == &negflags ? "don't " : ""
6083 );
6084 }
6085 }
cde0cee5 6086 break;
f7819f85 6087 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6088 if (flagsp == &negflags) {
668c081a
NC
6089 if (SIZE_ONLY)
6090 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6091 } else {
6092 *flagsp |= RXf_PMf_KEEPCOPY;
6093 }
6094 break;
6095 case '-':
57b84237
YO
6096 if (flagsp == &negflags) {
6097 RExC_parse++;
6098 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6099 /*NOTREACHED*/
6100 }
cde0cee5
YO
6101 flagsp = &negflags;
6102 wastedflags = 0; /* reset so (?g-c) warns twice */
6103 break;
6104 case ':':
6105 paren = ':';
6106 /*FALLTHROUGH*/
6107 case ')':
6108 RExC_flags |= posflags;
6109 RExC_flags &= ~negflags;
f7819f85
A
6110 if (paren != ':') {
6111 oregflags |= posflags;
6112 oregflags &= ~negflags;
6113 }
cde0cee5
YO
6114 nextchar(pRExC_state);
6115 if (paren != ':') {
6116 *flagp = TRYAGAIN;
6117 return NULL;
6118 } else {
6119 ret = NULL;
6120 goto parse_rest;
6121 }
6122 /*NOTREACHED*/
6123 default:
cde0cee5
YO
6124 RExC_parse++;
6125 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6126 /*NOTREACHED*/
6127 }
830247a4 6128 ++RExC_parse;
48c036b1 6129 }
cde0cee5 6130 }} /* one for the default block, one for the switch */
a0d0e21e 6131 }
fac92740 6132 else { /* (...) */
81714fb9 6133 capturing_parens:
830247a4
IZ
6134 parno = RExC_npar;
6135 RExC_npar++;
e2e6a0f1 6136
830247a4 6137 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6138 if (!SIZE_ONLY ){
6139 if (!RExC_nestroot)
6140 RExC_nestroot = parno;
c009da3d
YO
6141 if (RExC_seen & REG_SEEN_RECURSE
6142 && !RExC_open_parens[parno-1])
6143 {
e2e6a0f1 6144 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6145 "Setting open paren #%"IVdf" to %d\n",
6146 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6147 RExC_open_parens[parno-1]= ret;
6148 }
6bda09f9 6149 }
fac92740
MJD
6150 Set_Node_Length(ret, 1); /* MJD */
6151 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6152 is_open = 1;
a0d0e21e 6153 }
a0ed51b3 6154 }
fac92740 6155 else /* ! paren */
a0d0e21e 6156 ret = NULL;
cde0cee5
YO
6157
6158 parse_rest:
a0d0e21e 6159 /* Pick up the branches, linking them together. */
fac92740 6160 parse_start = RExC_parse; /* MJD */
3dab1dad 6161 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6162
6163 if (freeze_paren) {
6164 if (RExC_npar > after_freeze)
6165 after_freeze = RExC_npar;
6166 RExC_npar = freeze_paren;
6167 }
6168
fac92740 6169 /* branch_len = (paren != 0); */
2af232bd 6170
a0d0e21e
LW
6171 if (br == NULL)
6172 return(NULL);
830247a4
IZ
6173 if (*RExC_parse == '|') {
6174 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6175 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6176 }
fac92740 6177 else { /* MJD */
6bda09f9 6178 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6179 Set_Node_Length(br, paren != 0);
6180 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6181 }
c277df42
IZ
6182 have_branch = 1;
6183 if (SIZE_ONLY)
830247a4 6184 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6185 }
6186 else if (paren == ':') {
c277df42
IZ
6187 *flagp |= flags&SIMPLE;
6188 }
6136c704 6189 if (is_open) { /* Starts with OPEN. */
3dab1dad 6190 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6191 }
6192 else if (paren != '?') /* Not Conditional */
a0d0e21e 6193 ret = br;
8ae10a67 6194 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6195 lastbr = br;
830247a4
IZ
6196 while (*RExC_parse == '|') {
6197 if (!SIZE_ONLY && RExC_extralen) {
6198 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6199 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6200 }
6201 if (SIZE_ONLY)
830247a4
IZ
6202 RExC_extralen += 2; /* Account for LONGJMP. */
6203 nextchar(pRExC_state);
594d7033
YO
6204 if (freeze_paren) {
6205 if (RExC_npar > after_freeze)
6206 after_freeze = RExC_npar;
6207 RExC_npar = freeze_paren;
6208 }
3dab1dad 6209 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6210
a687059c 6211 if (br == NULL)
a0d0e21e 6212 return(NULL);
3dab1dad 6213 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6214 lastbr = br;
8ae10a67 6215 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6216 }
6217
c277df42
IZ
6218 if (have_branch || paren != ':') {
6219 /* Make a closing node, and hook it on the end. */
6220 switch (paren) {
6221 case ':':
830247a4 6222 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6223 break;
6224 case 1:
830247a4 6225 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6226 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6227 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6228 "Setting close paren #%"IVdf" to %d\n",
6229 (IV)parno, REG_NODE_NUM(ender)));
6230 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6231 if (RExC_nestroot == parno)
6232 RExC_nestroot = 0;
40d049e4 6233 }
fac92740
MJD
6234 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6235 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6236 break;
6237 case '<':
c277df42
IZ
6238 case ',':
6239 case '=':
6240 case '!':
c277df42 6241 *flagp &= ~HASWIDTH;
821b33a5
IZ
6242 /* FALL THROUGH */
6243 case '>':
830247a4 6244 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6245 break;
6246 case 0:
830247a4 6247 ender = reg_node(pRExC_state, END);
40d049e4
YO
6248 if (!SIZE_ONLY) {
6249 assert(!RExC_opend); /* there can only be one! */
6250 RExC_opend = ender;
6251 }
c277df42
IZ
6252 break;
6253 }
eaf3ca90 6254 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6255
9674d46a 6256 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6257 if (depth==1)
6258 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6259
c277df42 6260 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6261 for (br = ret; br; br = regnext(br)) {
6262 const U8 op = PL_regkind[OP(br)];
6263 if (op == BRANCH) {
07be1b83 6264 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6265 }
6266 else if (op == BRANCHJ) {
07be1b83 6267 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6268 }
c277df42
IZ
6269 }
6270 }
a0d0e21e 6271 }
c277df42
IZ
6272
6273 {
e1ec3a88
AL
6274 const char *p;
6275 static const char parens[] = "=!<,>";
c277df42
IZ
6276
6277 if (paren && (p = strchr(parens, paren))) {
eb160463 6278 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6279 int flag = (p - parens) > 1;
6280
6281 if (paren == '>')
6282 node = SUSPEND, flag = 0;
6bda09f9 6283 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6284 Set_Node_Cur_Length(ret);
6285 Set_Node_Offset(ret, parse_start + 1);
c277df42 6286 ret->flags = flag;
07be1b83 6287 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6288 }
a0d0e21e
LW
6289 }
6290
6291 /* Check for proper termination. */
ce3e6498 6292 if (paren) {
e2509266 6293 RExC_flags = oregflags;
830247a4
IZ
6294 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6295 RExC_parse = oregcomp_parse;
380a0633 6296 vFAIL("Unmatched (");
ce3e6498 6297 }
a0ed51b3 6298 }
830247a4
IZ
6299 else if (!paren && RExC_parse < RExC_end) {
6300 if (*RExC_parse == ')') {
6301 RExC_parse++;
380a0633 6302 vFAIL("Unmatched )");
a0ed51b3
LW
6303 }
6304 else
b45f050a 6305 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6306 /* NOTREACHED */
6307 }
594d7033
YO
6308 if (after_freeze)
6309 RExC_npar = after_freeze;
a0d0e21e 6310 return(ret);
a687059c
LW
6311}
6312
6313/*
6314 - regbranch - one alternative of an | operator
6315 *
6316 * Implements the concatenation operator.
6317 */
76e3520e 6318STATIC regnode *
3dab1dad 6319S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6320{
97aff369 6321 dVAR;
c277df42
IZ
6322 register regnode *ret;
6323 register regnode *chain = NULL;
6324 register regnode *latest;
6325 I32 flags = 0, c = 0;
3dab1dad 6326 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6327
6328 PERL_ARGS_ASSERT_REGBRANCH;
6329
3dab1dad 6330 DEBUG_PARSE("brnc");
02daf0ab 6331
b81d288d 6332 if (first)
c277df42
IZ
6333 ret = NULL;
6334 else {
b81d288d 6335 if (!SIZE_ONLY && RExC_extralen)
830247a4 6336 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6337 else {
830247a4 6338 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6339 Set_Node_Length(ret, 1);
6340 }
c277df42
IZ
6341 }
6342
b81d288d 6343 if (!first && SIZE_ONLY)
830247a4 6344 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6345
c277df42 6346 *flagp = WORST; /* Tentatively. */
a0d0e21e 6347
830247a4
IZ
6348 RExC_parse--;
6349 nextchar(pRExC_state);
6350 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6351 flags &= ~TRYAGAIN;
3dab1dad 6352 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6353 if (latest == NULL) {
6354 if (flags & TRYAGAIN)
6355 continue;
6356 return(NULL);
a0ed51b3
LW
6357 }
6358 else if (ret == NULL)
c277df42 6359 ret = latest;
8ae10a67 6360 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6361 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6362 *flagp |= flags&SPSTART;
6363 else {
830247a4 6364 RExC_naughty++;
3dab1dad 6365 REGTAIL(pRExC_state, chain, latest);
a687059c 6366 }
a0d0e21e 6367 chain = latest;
c277df42
IZ
6368 c++;
6369 }
6370 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6371 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6372 if (ret == NULL)
6373 ret = chain;
6374 }
6375 if (c == 1) {
6376 *flagp |= flags&SIMPLE;
a0d0e21e 6377 }
a687059c 6378
d4c19fe8 6379 return ret;
a687059c
LW
6380}
6381
6382/*
6383 - regpiece - something followed by possible [*+?]
6384 *
6385 * Note that the branching code sequences used for ? and the general cases
6386 * of * and + are somewhat optimized: they use the same NOTHING node as
6387 * both the endmarker for their branch list and the body of the last branch.
6388 * It might seem that this node could be dispensed with entirely, but the
6389 * endmarker role is not redundant.
6390 */
76e3520e 6391STATIC regnode *
3dab1dad 6392S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6393{
97aff369 6394 dVAR;
c277df42 6395 register regnode *ret;
a0d0e21e
LW
6396 register char op;
6397 register char *next;
6398 I32 flags;
1df70142 6399 const char * const origparse = RExC_parse;
a0d0e21e 6400 I32 min;
c277df42 6401 I32 max = REG_INFTY;
fac92740 6402 char *parse_start;
10edeb5d 6403 const char *maxpos = NULL;
3dab1dad 6404 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6405
6406 PERL_ARGS_ASSERT_REGPIECE;
6407
3dab1dad 6408 DEBUG_PARSE("piec");
a0d0e21e 6409
3dab1dad 6410 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6411 if (ret == NULL) {
6412 if (flags & TRYAGAIN)
6413 *flagp |= TRYAGAIN;
6414 return(NULL);
6415 }
6416
830247a4 6417 op = *RExC_parse;
a0d0e21e 6418
830247a4 6419 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6420 maxpos = NULL;
fac92740 6421 parse_start = RExC_parse; /* MJD */
830247a4 6422 next = RExC_parse + 1;
a0d0e21e
LW
6423 while (isDIGIT(*next) || *next == ',') {
6424 if (*next == ',') {
6425 if (maxpos)
6426 break;
6427 else
6428 maxpos = next;
a687059c 6429 }
a0d0e21e
LW
6430 next++;
6431 }
6432 if (*next == '}') { /* got one */
6433 if (!maxpos)
6434 maxpos = next;
830247a4
IZ
6435 RExC_parse++;
6436 min = atoi(RExC_parse);
a0d0e21e
LW
6437 if (*maxpos == ',')
6438 maxpos++;
6439 else
830247a4 6440 maxpos = RExC_parse;
a0d0e21e
LW
6441 max = atoi(maxpos);
6442 if (!max && *maxpos != '0')
c277df42
IZ
6443 max = REG_INFTY; /* meaning "infinity" */
6444 else if (max >= REG_INFTY)
8615cb43 6445 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6446 RExC_parse = next;
6447 nextchar(pRExC_state);
a0d0e21e
LW
6448
6449 do_curly:
6450 if ((flags&SIMPLE)) {
830247a4 6451 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6452 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6453 Set_Node_Offset(ret, parse_start+1); /* MJD */
6454 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6455 }
6456 else {
3dab1dad 6457 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6458
6459 w->flags = 0;
3dab1dad 6460 REGTAIL(pRExC_state, ret, w);
830247a4 6461 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6462 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6463 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6464 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6465 }
6bda09f9 6466 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6467 /* MJD hk */
6468 Set_Node_Offset(ret, parse_start+1);
2af232bd 6469 Set_Node_Length(ret,
fac92740 6470 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6471
830247a4 6472 if (!SIZE_ONLY && RExC_extralen)
c277df42 6473 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6474 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6475 if (SIZE_ONLY)
830247a4
IZ
6476 RExC_whilem_seen++, RExC_extralen += 3;
6477 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6478 }
c277df42 6479 ret->flags = 0;
a0d0e21e
LW
6480
6481 if (min > 0)
821b33a5
IZ
6482 *flagp = WORST;
6483 if (max > 0)
6484 *flagp |= HASWIDTH;
8fa23287 6485 if (max < min)
8615cb43 6486 vFAIL("Can't do {n,m} with n > m");
c277df42 6487 if (!SIZE_ONLY) {
eb160463
GS
6488 ARG1_SET(ret, (U16)min);
6489 ARG2_SET(ret, (U16)max);
a687059c 6490 }
a687059c 6491
a0d0e21e 6492 goto nest_check;
a687059c 6493 }
a0d0e21e 6494 }
a687059c 6495
a0d0e21e
LW
6496 if (!ISMULT1(op)) {
6497 *flagp = flags;
a687059c 6498 return(ret);
a0d0e21e 6499 }
bb20fd44 6500
c277df42 6501#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6502
6503 /* if this is reinstated, don't forget to put this back into perldiag:
6504
6505 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6506
6507 (F) The part of the regexp subject to either the * or + quantifier
6508 could match an empty string. The {#} shows in the regular
6509 expression about where the problem was discovered.
6510
6511 */
6512
bb20fd44 6513 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6514 vFAIL("Regexp *+ operand could be empty");
b81d288d 6515#endif
bb20fd44 6516
fac92740 6517 parse_start = RExC_parse;
830247a4 6518 nextchar(pRExC_state);
a0d0e21e 6519
821b33a5 6520 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6521
6522 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6523 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6524 ret->flags = 0;
830247a4 6525 RExC_naughty += 4;
a0d0e21e
LW
6526 }
6527 else if (op == '*') {
6528 min = 0;
6529 goto do_curly;
a0ed51b3
LW
6530 }
6531 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6532 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6533 ret->flags = 0;
830247a4 6534 RExC_naughty += 3;
a0d0e21e
LW
6535 }
6536 else if (op == '+') {
6537 min = 1;
6538 goto do_curly;
a0ed51b3
LW
6539 }
6540 else if (op == '?') {
a0d0e21e
LW
6541 min = 0; max = 1;
6542 goto do_curly;
6543 }
6544 nest_check:
668c081a
NC
6545 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6546 ckWARN3reg(RExC_parse,
6547 "%.*s matches null string many times",
6548 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6549 origparse);
a0d0e21e
LW
6550 }
6551
b9b4dddf 6552 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6553 nextchar(pRExC_state);
6bda09f9 6554 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6555 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6556 }
b9b4dddf
YO
6557#ifndef REG_ALLOW_MINMOD_SUSPEND
6558 else
6559#endif
6560 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6561 regnode *ender;
6562 nextchar(pRExC_state);
6563 ender = reg_node(pRExC_state, SUCCEED);
6564 REGTAIL(pRExC_state, ret, ender);
6565 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6566 ret->flags = 0;
6567 ender = reg_node(pRExC_state, TAIL);
6568 REGTAIL(pRExC_state, ret, ender);
6569 /*ret= ender;*/
6570 }
6571
6572 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6573 RExC_parse++;
b45f050a
JF
6574 vFAIL("Nested quantifiers");
6575 }
a0d0e21e
LW
6576
6577 return(ret);
a687059c
LW
6578}
6579
fc8cd66c
YO
6580
6581/* reg_namedseq(pRExC_state,UVp)
6582
6583 This is expected to be called by a parser routine that has
afefe6bf 6584 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6585 expected to point at the first char following the N at the time
6586 of the call.
6587
6588 If valuep is non-null then it is assumed that we are parsing inside
6589 of a charclass definition and the first codepoint in the resolved
6590 string is returned via *valuep and the routine will return NULL.
6591 In this mode if a multichar string is returned from the charnames
6592 handler a warning will be issued, and only the first char in the
6593 sequence will be examined. If the string returned is zero length
6594 then the value of *valuep is undefined and NON-NULL will
6595 be returned to indicate failure. (This will NOT be a valid pointer
6596 to a regnode.)
6597
afefe6bf 6598 If valuep is null then it is assumed that we are parsing normal text
fc8cd66c
YO
6599 and inserts a new EXACT node into the program containing the resolved
6600 string and returns a pointer to the new node. If the string is
6601 zerolength a NOTHING node is emitted.
afefe6bf 6602
fc8cd66c
YO
6603 On success RExC_parse is set to the char following the endbrace.
6604 Parsing failures will generate a fatal errorvia vFAIL(...)
6605
6606 NOTE: We cache all results from the charnames handler locally in
6607 the RExC_charnames hash (created on first use) to prevent a charnames
6608 handler from playing silly-buggers and returning a short string and
6609 then a long string for a given pattern. Since the regexp program
6610 size is calculated during an initial parse this would result
6611 in a buffer overrun so we cache to prevent the charname result from
6612 changing during the course of the parse.
6613
6614 */
6615STATIC regnode *
afefe6bf 6616S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c
YO
6617{
6618 char * name; /* start of the content of the name */
6619 char * endbrace; /* endbrace following the name */
6620 SV *sv_str = NULL;
6621 SV *sv_name = NULL;
6622 STRLEN len; /* this has various purposes throughout the code */
6623 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6624 regnode *ret = NULL;
7918f24d
NC
6625
6626 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6627
afefe6bf
RGS
6628 if (*RExC_parse != '{' ||
6629 (*RExC_parse == '{' && RExC_parse[1]
6630 && strchr("0123456789", RExC_parse[1])))
6631 {
6632 GET_RE_DEBUG_FLAGS_DECL;
6633 if (valuep)
6634 /* no bare \N in a charclass */
6635 vFAIL("Missing braces on \\N{}");
6636 GET_RE_DEBUG_FLAGS;
6637 nextchar(pRExC_state);
6638 ret = reg_node(pRExC_state, REG_ANY);
6639 *flagp |= HASWIDTH|SIMPLE;
6640 RExC_naughty++;
6641 RExC_parse--;
6642 Set_Node_Length(ret, 1); /* MJD */
6643 return ret;
fc8cd66c
YO
6644 }
6645 name = RExC_parse+1;
6646 endbrace = strchr(RExC_parse, '}');
6647 if ( ! endbrace ) {
6648 RExC_parse++;
6649 vFAIL("Missing right brace on \\N{}");
6650 }
6651 RExC_parse = endbrace + 1;
6652
6653
6654 /* RExC_parse points at the beginning brace,
6655 endbrace points at the last */
6656 if ( name[0]=='U' && name[1]=='+' ) {
38a44b82 6657 /* its a "Unicode hex" notation {U+89AB} */
fc8cd66c
YO
6658 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6659 | PERL_SCAN_DISALLOW_PREFIX
6660 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6661 UV cp;
196f1508 6662 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6663 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6664 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6665 cp = 0xFFFD;
6666 }
fc8cd66c 6667 if ( valuep ) {
a4893424 6668 if (cp > 0xff) RExC_utf8 = 1;
fc8cd66c
YO
6669 *valuep = cp;
6670 return NULL;
6671 }
a4893424
KW
6672
6673 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6674 * is going to be in utf8 and the representation changes under utf8. */
6675 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6676 U8 string[UTF8_MAXBYTES+1];
6677 U8 *tmps;
6678 RExC_utf8 = 1;
6679 tmps = uvuni_to_utf8(string, cp);
6680 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6681 } else { /* Otherwise, no need for utf8, can skip that step */
6682 char string;
6683 string = (char)cp;
6684 sv_str= newSVpvn(&string, 1);
6685 }
fc8cd66c
YO
6686 } else {
6687 /* fetch the charnames handler for this scope */
6688 HV * const table = GvHV(PL_hintgv);
6689 SV **cvp= table ?
6690 hv_fetchs(table, "charnames", FALSE) :
6691 NULL;
6692 SV *cv= cvp ? *cvp : NULL;
6693 HE *he_str;
6694 int count;
6695 /* create an SV with the name as argument */
6696 sv_name = newSVpvn(name, endbrace - name);
6697
6698 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
deda7dd1 6699 vFAIL2("Constant(\\N{%" SVf "}) unknown: "
fc8cd66c 6700 "(possibly a missing \"use charnames ...\")",
deda7dd1 6701 SVfARG(sv_name));
fc8cd66c
YO
6702 }
6703 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
deda7dd1
NC
6704 vFAIL2("Constant(\\N{%" SVf "}): "
6705 "$^H{charnames} is not defined", SVfARG(sv_name));
fc8cd66c
YO
6706 }
6707
6708
6709
6710 if (!RExC_charnames) {
6711 /* make sure our cache is allocated */
6712 RExC_charnames = newHV();
ad64d0ec 6713 sv_2mortal(MUTABLE_SV(RExC_charnames));
fc8cd66c
YO
6714 }
6715 /* see if we have looked this one up before */
6716 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6717 if ( he_str ) {
6718 sv_str = HeVAL(he_str);
6719 cached = 1;
6720 } else {
6721 dSP ;
6722
6723 ENTER ;
6724 SAVETMPS ;
6725 PUSHMARK(SP) ;
6726
6727 XPUSHs(sv_name);
6728
6729 PUTBACK ;
6730
6731 count= call_sv(cv, G_SCALAR);
6732
6733 if (count == 1) { /* XXXX is this right? dmq */
6734 sv_str = POPs;
6735 SvREFCNT_inc_simple_void(sv_str);
6736 }
6737
6738 SPAGAIN ;
6739 PUTBACK ;
6740 FREETMPS ;
6741 LEAVE ;
6742
6743 if ( !sv_str || !SvOK(sv_str) ) {
deda7dd1
NC
6744 vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} "
6745 "did not return a defined value", SVfARG(sv_name));
fc8cd66c
YO
6746 }
6747 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6748 cached = 1;
6749 }
6750 }
6751 if (valuep) {
6752 char *p = SvPV(sv_str, len);
6753 if (len) {
6754 STRLEN numlen = 1;
6755 if ( SvUTF8(sv_str) ) {
196f1508 6756 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6757 if (*valuep > 0x7F)
6758 RExC_utf8 = 1;
6759 /* XXXX
6760 We have to turn on utf8 for high bit chars otherwise
6761 we get failures with
6762
6763 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6764 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6765
6766 This is different from what \x{} would do with the same
6767 codepoint, where the condition is > 0xFF.
6768 - dmq
6769 */
6770
6771
6772 } else {
6773 *valuep = (UV)*p;
6774 /* warn if we havent used the whole string? */
6775 }
668c081a
NC
6776 if (numlen<len && SIZE_ONLY) {
6777 ckWARN2reg(RExC_parse,
deda7dd1
NC
6778 "Ignoring excess chars from \\N{%" SVf "} in character class",
6779 SVfARG(sv_name)
fc8cd66c
YO
6780 );
6781 }
668c081a
NC
6782 } else if (SIZE_ONLY) {
6783 ckWARN2reg(RExC_parse,
deda7dd1
NC
6784 "Ignoring zero length \\N{%" SVf "} in character class",
6785 SVfARG(sv_name)
fc8cd66c
YO
6786 );
6787 }
6788 if (sv_name)
6789 SvREFCNT_dec(sv_name);
6790 if (!cached)
6791 SvREFCNT_dec(sv_str);
6792 return len ? NULL : (regnode *)&len;
6793 } else if(SvCUR(sv_str)) {
6794
6795 char *s;
6796 char *p, *pend;
6797 STRLEN charlen = 1;
d008bc60 6798#ifdef DEBUGGING
fc8cd66c 6799 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6800#endif
fc8cd66c
YO
6801 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6802
6803 ret = reg_node(pRExC_state,
6804 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6805 s= STRING(ret);
6806
6807 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6808 sv_utf8_upgrade(sv_str);
6809 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6810 RExC_utf8= 1;
6811 }
6812
6813 p = SvPV(sv_str, len);
6814 pend = p + len;
6815 /* len is the length written, charlen is the size the char read */
6816 for ( len = 0; p < pend; p += charlen ) {
6817 if (UTF) {
196f1508 6818 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6819 if (FOLD) {
6820 STRLEN foldlen,numlen;
6821 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6822 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6823 /* Emit all the Unicode characters. */
6824
6825 for (foldbuf = tmpbuf;
6826 foldlen;
6827 foldlen -= numlen)
6828 {
6829 uvc = utf8_to_uvchr(foldbuf, &numlen);
6830 if (numlen > 0) {
6831 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6832 s += unilen;
6833 len += unilen;
6834 /* In EBCDIC the numlen
6835 * and unilen can differ. */
6836 foldbuf += numlen;
6837 if (numlen >= foldlen)
6838 break;
6839 }
6840 else
6841 break; /* "Can't happen." */
6842 }
6843 } else {
6844 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6845 if (unilen > 0) {
6846 s += unilen;
6847 len += unilen;
6848 }
6849 }
6850 } else {
6851 len++;
6852 REGC(*p, s++);
6853 }
6854 }
6855 if (SIZE_ONLY) {
6856 RExC_size += STR_SZ(len);
6857 } else {
6858 STR_LEN(ret) = len;
6859 RExC_emit += STR_SZ(len);
6860 }
6861 Set_Node_Cur_Length(ret); /* MJD */
6862 RExC_parse--;
6863 nextchar(pRExC_state);
a4893424 6864 } else { /* zero length */
fc8cd66c
YO
6865 ret = reg_node(pRExC_state,NOTHING);
6866 }
6867 if (!cached) {
6868 SvREFCNT_dec(sv_str);
6869 }
6870 if (sv_name) {
6871 SvREFCNT_dec(sv_name);
6872 }
6873 return ret;
6874
6875}
6876
6877
9e08bc66
TS
6878/*
6879 * reg_recode
6880 *
6881 * It returns the code point in utf8 for the value in *encp.
6882 * value: a code value in the source encoding
6883 * encp: a pointer to an Encode object
6884 *
6885 * If the result from Encode is not a single character,
6886 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6887 */
6888STATIC UV
6889S_reg_recode(pTHX_ const char value, SV **encp)
6890{
6891 STRLEN numlen = 1;
59cd0e26 6892 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6893 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6894 const STRLEN newlen = SvCUR(sv);
6895 UV uv = UNICODE_REPLACEMENT;
6896
7918f24d
NC
6897 PERL_ARGS_ASSERT_REG_RECODE;
6898
9e08bc66
TS
6899 if (newlen)
6900 uv = SvUTF8(sv)
6901 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6902 : *(U8*)s;
6903
6904 if (!newlen || numlen != newlen) {
6905 uv = UNICODE_REPLACEMENT;
c86f7df5 6906 *encp = NULL;
9e08bc66
TS
6907 }
6908 return uv;
6909}
6910
fc8cd66c 6911
a687059c
LW
6912/*
6913 - regatom - the lowest level
ee9b8eae
YO
6914
6915 Try to identify anything special at the start of the pattern. If there
6916 is, then handle it as required. This may involve generating a single regop,
6917 such as for an assertion; or it may involve recursing, such as to
6918 handle a () structure.
6919
6920 If the string doesn't start with something special then we gobble up
6921 as much literal text as we can.
6922
6923 Once we have been able to handle whatever type of thing started the
6924 sequence, we return.
6925
6926 Note: we have to be careful with escapes, as they can be both literal
6927 and special, and in the case of \10 and friends can either, depending
6928 on context. Specifically there are two seperate switches for handling
6929 escape sequences, with the one for handling literal escapes requiring
6930 a dummy entry for all of the special escapes that are actually handled
6931 by the other.
6932*/
6933
76e3520e 6934STATIC regnode *
3dab1dad 6935S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6936{
97aff369 6937 dVAR;
cbbf8932 6938 register regnode *ret = NULL;
a0d0e21e 6939 I32 flags;
45948336 6940 char *parse_start = RExC_parse;
3dab1dad
YO
6941 GET_RE_DEBUG_FLAGS_DECL;
6942 DEBUG_PARSE("atom");
a0d0e21e
LW
6943 *flagp = WORST; /* Tentatively. */
6944
7918f24d 6945 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 6946
a0d0e21e 6947tryagain:
f9a79580 6948 switch ((U8)*RExC_parse) {
a0d0e21e 6949 case '^':
830247a4
IZ
6950 RExC_seen_zerolen++;
6951 nextchar(pRExC_state);
bbe252da 6952 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6953 ret = reg_node(pRExC_state, MBOL);
bbe252da 6954 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6955 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6956 else
830247a4 6957 ret = reg_node(pRExC_state, BOL);
fac92740 6958 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6959 break;
6960 case '$':
830247a4 6961 nextchar(pRExC_state);
b81d288d 6962 if (*RExC_parse)
830247a4 6963 RExC_seen_zerolen++;
bbe252da 6964 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6965 ret = reg_node(pRExC_state, MEOL);
bbe252da 6966 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6967 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6968 else
830247a4 6969 ret = reg_node(pRExC_state, EOL);
fac92740 6970 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6971 break;
6972 case '.':
830247a4 6973 nextchar(pRExC_state);
bbe252da 6974 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6975 ret = reg_node(pRExC_state, SANY);
6976 else
6977 ret = reg_node(pRExC_state, REG_ANY);
6978 *flagp |= HASWIDTH|SIMPLE;
830247a4 6979 RExC_naughty++;
fac92740 6980 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6981 break;
6982 case '[':
b45f050a 6983 {
3dab1dad
YO
6984 char * const oregcomp_parse = ++RExC_parse;
6985 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6986 if (*RExC_parse != ']') {
6987 RExC_parse = oregcomp_parse;
b45f050a
JF
6988 vFAIL("Unmatched [");
6989 }
830247a4 6990 nextchar(pRExC_state);
a0d0e21e 6991 *flagp |= HASWIDTH|SIMPLE;
fac92740 6992 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6993 break;
b45f050a 6994 }
a0d0e21e 6995 case '(':
830247a4 6996 nextchar(pRExC_state);
3dab1dad 6997 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6998 if (ret == NULL) {
bf93d4cc 6999 if (flags & TRYAGAIN) {
830247a4 7000 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7001 /* Make parent create an empty node if needed. */
7002 *flagp |= TRYAGAIN;
7003 return(NULL);
7004 }
a0d0e21e 7005 goto tryagain;
bf93d4cc 7006 }
a0d0e21e
LW
7007 return(NULL);
7008 }
a3b492c3 7009 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7010 break;
7011 case '|':
7012 case ')':
7013 if (flags & TRYAGAIN) {
7014 *flagp |= TRYAGAIN;
7015 return NULL;
7016 }
b45f050a 7017 vFAIL("Internal urp");
a0d0e21e
LW
7018 /* Supposed to be caught earlier. */
7019 break;
85afd4ae 7020 case '{':
830247a4
IZ
7021 if (!regcurly(RExC_parse)) {
7022 RExC_parse++;
85afd4ae
CS
7023 goto defchar;
7024 }
7025 /* FALL THROUGH */
a0d0e21e
LW
7026 case '?':
7027 case '+':
7028 case '*':
830247a4 7029 RExC_parse++;
b45f050a 7030 vFAIL("Quantifier follows nothing");
a0d0e21e 7031 break;
f9a79580
RGS
7032 case 0xDF:
7033 case 0xC3:
7034 case 0xCE:
a0a388a1 7035 do_foldchar:
56d400ed 7036 if (!LOC && FOLD) {
e64b1bd1 7037 U32 len,cp;
7cf3a6a3 7038 len=0; /* silence a spurious compiler warning */
56d400ed 7039 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7040 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7041 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7042 ret = reganode(pRExC_state, FOLDCHAR, cp);
7043 Set_Node_Length(ret, 1); /* MJD */
7044 nextchar(pRExC_state); /* kill whitespace under /x */
7045 return ret;
7046 }
7047 }
7048 goto outer_default;
a0d0e21e 7049 case '\\':
ee9b8eae
YO
7050 /* Special Escapes
7051
7052 This switch handles escape sequences that resolve to some kind
7053 of special regop and not to literal text. Escape sequnces that
7054 resolve to literal text are handled below in the switch marked
7055 "Literal Escapes".
7056
7057 Every entry in this switch *must* have a corresponding entry
7058 in the literal escape switch. However, the opposite is not
7059 required, as the default for this switch is to jump to the
7060 literal text handling code.
7061 */
a0a388a1
YO
7062 switch ((U8)*++RExC_parse) {
7063 case 0xDF:
7064 case 0xC3:
7065 case 0xCE:
7066 goto do_foldchar;
ee9b8eae 7067 /* Special Escapes */
a0d0e21e 7068 case 'A':
830247a4
IZ
7069 RExC_seen_zerolen++;
7070 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7071 *flagp |= SIMPLE;
ee9b8eae 7072 goto finish_meta_pat;
a0d0e21e 7073 case 'G':
830247a4
IZ
7074 ret = reg_node(pRExC_state, GPOS);
7075 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7076 *flagp |= SIMPLE;
ee9b8eae
YO
7077 goto finish_meta_pat;
7078 case 'K':
7079 RExC_seen_zerolen++;
7080 ret = reg_node(pRExC_state, KEEPS);
7081 *flagp |= SIMPLE;
37923168
RGS
7082 /* XXX:dmq : disabling in-place substitution seems to
7083 * be necessary here to avoid cases of memory corruption, as
7084 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7085 */
7086 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7087 goto finish_meta_pat;
a0d0e21e 7088 case 'Z':
830247a4 7089 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7090 *flagp |= SIMPLE;
a1917ab9 7091 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7092 goto finish_meta_pat;
b85d18e9 7093 case 'z':
830247a4 7094 ret = reg_node(pRExC_state, EOS);
b85d18e9 7095 *flagp |= SIMPLE;
830247a4 7096 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7097 goto finish_meta_pat;
4a2d328f 7098 case 'C':
f33976b4
DB
7099 ret = reg_node(pRExC_state, CANY);
7100 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7101 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7102 goto finish_meta_pat;
a0ed51b3 7103 case 'X':
830247a4 7104 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7105 *flagp |= HASWIDTH;
ee9b8eae 7106 goto finish_meta_pat;
a0d0e21e 7107 case 'w':
eb160463 7108 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7109 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7110 goto finish_meta_pat;
a0d0e21e 7111 case 'W':
eb160463 7112 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7113 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7114 goto finish_meta_pat;
a0d0e21e 7115 case 'b':
830247a4
IZ
7116 RExC_seen_zerolen++;
7117 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7118 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7119 *flagp |= SIMPLE;
ee9b8eae 7120 goto finish_meta_pat;
a0d0e21e 7121 case 'B':
830247a4
IZ
7122 RExC_seen_zerolen++;
7123 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7124 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7125 *flagp |= SIMPLE;
ee9b8eae 7126 goto finish_meta_pat;
a0d0e21e 7127 case 's':
eb160463 7128 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7129 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7130 goto finish_meta_pat;
a0d0e21e 7131 case 'S':
eb160463 7132 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7133 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7134 goto finish_meta_pat;
a0d0e21e 7135 case 'd':
ffc61ed2 7136 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7137 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7138 goto finish_meta_pat;
a0d0e21e 7139 case 'D':
ffc61ed2 7140 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7141 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7142 goto finish_meta_pat;
e1d1eefb
YO
7143 case 'R':
7144 ret = reg_node(pRExC_state, LNBREAK);
7145 *flagp |= HASWIDTH|SIMPLE;
7146 goto finish_meta_pat;
7147 case 'h':
7148 ret = reg_node(pRExC_state, HORIZWS);
7149 *flagp |= HASWIDTH|SIMPLE;
7150 goto finish_meta_pat;
7151 case 'H':
7152 ret = reg_node(pRExC_state, NHORIZWS);
7153 *flagp |= HASWIDTH|SIMPLE;
7154 goto finish_meta_pat;
ee9b8eae 7155 case 'v':
e1d1eefb
YO
7156 ret = reg_node(pRExC_state, VERTWS);
7157 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7158 goto finish_meta_pat;
7159 case 'V':
e1d1eefb
YO
7160 ret = reg_node(pRExC_state, NVERTWS);
7161 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7162 finish_meta_pat:
830247a4 7163 nextchar(pRExC_state);
fac92740 7164 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7165 break;
a14b48bc
LW
7166 case 'p':
7167 case 'P':
3568d838 7168 {
3dab1dad 7169 char* const oldregxend = RExC_end;
d008bc60 7170#ifdef DEBUGGING
ccb2c380 7171 char* parse_start = RExC_parse - 2;
d008bc60 7172#endif
a14b48bc 7173
830247a4 7174 if (RExC_parse[1] == '{') {
3568d838 7175 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7176 RExC_end = strchr(RExC_parse, '}');
7177 if (!RExC_end) {
3dab1dad 7178 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7179 RExC_parse += 2;
7180 RExC_end = oldregxend;
0da60cf5 7181 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7182 }
830247a4 7183 RExC_end++;
a14b48bc 7184 }
af6f566e 7185 else {
830247a4 7186 RExC_end = RExC_parse + 2;
af6f566e
HS
7187 if (RExC_end > oldregxend)
7188 RExC_end = oldregxend;
7189 }
830247a4 7190 RExC_parse--;
a14b48bc 7191
3dab1dad 7192 ret = regclass(pRExC_state,depth+1);
a14b48bc 7193
830247a4
IZ
7194 RExC_end = oldregxend;
7195 RExC_parse--;
ccb2c380
MP
7196
7197 Set_Node_Offset(ret, parse_start + 2);
7198 Set_Node_Cur_Length(ret);
830247a4 7199 nextchar(pRExC_state);
a14b48bc
LW
7200 *flagp |= HASWIDTH|SIMPLE;
7201 }
7202 break;
fc8cd66c 7203 case 'N':
afefe6bf 7204 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7205 multicharacter. join_exact() will join them up later on.
7206 Also this makes sure that things like /\N{BLAH}+/ and
7207 \N{BLAH} being multi char Just Happen. dmq*/
7208 ++RExC_parse;
afefe6bf 7209 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7210 break;
0a4db386 7211 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7212 parse_named_seq:
81714fb9
YO
7213 {
7214 char ch= RExC_parse[1];
1f1031fe
YO
7215 if (ch != '<' && ch != '\'' && ch != '{') {
7216 RExC_parse++;
7217 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7218 } else {
1f1031fe
YO
7219 /* this pretty much dupes the code for (?P=...) in reg(), if
7220 you change this make sure you change that */
81714fb9 7221 char* name_start = (RExC_parse += 2);
2eccd3b2 7222 U32 num = 0;
0a4db386
YO
7223 SV *sv_dat = reg_scan_name(pRExC_state,
7224 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7225 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7226 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7227 vFAIL2("Sequence %.3s... not terminated",parse_start);
7228
7229 if (!SIZE_ONLY) {
7230 num = add_data( pRExC_state, 1, "S" );
7231 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7232 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7233 }
7234
81714fb9
YO
7235 RExC_sawback = 1;
7236 ret = reganode(pRExC_state,
7237 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7238 num);
7239 *flagp |= HASWIDTH;
1f1031fe 7240
81714fb9
YO
7241 /* override incorrect value set in reganode MJD */
7242 Set_Node_Offset(ret, parse_start+1);
7243 Set_Node_Cur_Length(ret); /* MJD */
7244 nextchar(pRExC_state);
1f1031fe 7245
81714fb9
YO
7246 }
7247 break;
1f1031fe 7248 }
2bf803e2 7249 case 'g':
a0d0e21e
LW
7250 case '1': case '2': case '3': case '4':
7251 case '5': case '6': case '7': case '8': case '9':
7252 {
c74340f9 7253 I32 num;
2bf803e2
YO
7254 bool isg = *RExC_parse == 'g';
7255 bool isrel = 0;
7256 bool hasbrace = 0;
7257 if (isg) {
c74340f9 7258 RExC_parse++;
2bf803e2
YO
7259 if (*RExC_parse == '{') {
7260 RExC_parse++;
7261 hasbrace = 1;
7262 }
7263 if (*RExC_parse == '-') {
7264 RExC_parse++;
7265 isrel = 1;
7266 }
1f1031fe
YO
7267 if (hasbrace && !isDIGIT(*RExC_parse)) {
7268 if (isrel) RExC_parse--;
7269 RExC_parse -= 2;
7270 goto parse_named_seq;
7271 } }
c74340f9 7272 num = atoi(RExC_parse);
b72d83b2
RGS
7273 if (isg && num == 0)
7274 vFAIL("Reference to invalid group 0");
c74340f9 7275 if (isrel) {
5624f11d 7276 num = RExC_npar - num;
c74340f9
YO
7277 if (num < 1)
7278 vFAIL("Reference to nonexistent or unclosed group");
7279 }
2bf803e2 7280 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7281 goto defchar;
7282 else {
3dab1dad 7283 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7284 while (isDIGIT(*RExC_parse))
7285 RExC_parse++;
1f1031fe
YO
7286 if (parse_start == RExC_parse - 1)
7287 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7288 if (hasbrace) {
7289 if (*RExC_parse != '}')
7290 vFAIL("Unterminated \\g{...} pattern");
7291 RExC_parse++;
7292 }
c74340f9
YO
7293 if (!SIZE_ONLY) {
7294 if (num > (I32)RExC_rx->nparens)
7295 vFAIL("Reference to nonexistent group");
c74340f9 7296 }
830247a4 7297 RExC_sawback = 1;
eb160463
GS
7298 ret = reganode(pRExC_state,
7299 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7300 num);
a0d0e21e 7301 *flagp |= HASWIDTH;
2af232bd 7302
fac92740 7303 /* override incorrect value set in reganode MJD */
2af232bd 7304 Set_Node_Offset(ret, parse_start+1);
fac92740 7305 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7306 RExC_parse--;
7307 nextchar(pRExC_state);
a0d0e21e
LW
7308 }
7309 }
7310 break;
7311 case '\0':
830247a4 7312 if (RExC_parse >= RExC_end)
b45f050a 7313 FAIL("Trailing \\");
a0d0e21e
LW
7314 /* FALL THROUGH */
7315 default:
a0288114 7316 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7317 back into the quick-grab loop below */
45948336 7318 parse_start--;
a0d0e21e
LW
7319 goto defchar;
7320 }
7321 break;
4633a7c4
LW
7322
7323 case '#':
bbe252da 7324 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7325 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7326 goto tryagain;
7327 }
7328 /* FALL THROUGH */
7329
f9a79580
RGS
7330 default:
7331 outer_default:{
ba210ebe 7332 register STRLEN len;
58ae7d3f 7333 register UV ender;
a0d0e21e 7334 register char *p;
3dab1dad 7335 char *s;
80aecb99 7336 STRLEN foldlen;
89ebb4a3 7337 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7338
7339 parse_start = RExC_parse - 1;
a0d0e21e 7340
830247a4 7341 RExC_parse++;
a0d0e21e
LW
7342
7343 defchar:
58ae7d3f 7344 ender = 0;
eb160463
GS
7345 ret = reg_node(pRExC_state,
7346 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7347 s = STRING(ret);
830247a4
IZ
7348 for (len = 0, p = RExC_parse - 1;
7349 len < 127 && p < RExC_end;
a0d0e21e
LW
7350 len++)
7351 {
3dab1dad 7352 char * const oldp = p;
5b5a24f7 7353
bbe252da 7354 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7355 p = regwhite( pRExC_state, p );
f9a79580
RGS
7356 switch ((U8)*p) {
7357 case 0xDF:
7358 case 0xC3:
7359 case 0xCE:
56d400ed 7360 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7361 goto normal_default;
a0d0e21e
LW
7362 case '^':
7363 case '$':
7364 case '.':
7365 case '[':
7366 case '(':
7367 case ')':
7368 case '|':
7369 goto loopdone;
7370 case '\\':
ee9b8eae
YO
7371 /* Literal Escapes Switch
7372
7373 This switch is meant to handle escape sequences that
7374 resolve to a literal character.
7375
7376 Every escape sequence that represents something
7377 else, like an assertion or a char class, is handled
7378 in the switch marked 'Special Escapes' above in this
7379 routine, but also has an entry here as anything that
7380 isn't explicitly mentioned here will be treated as
7381 an unescaped equivalent literal.
7382 */
7383
a0a388a1 7384 switch ((U8)*++p) {
ee9b8eae 7385 /* These are all the special escapes. */
a0a388a1
YO
7386 case 0xDF:
7387 case 0xC3:
7388 case 0xCE:
7389 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7390 goto normal_default;
ee9b8eae
YO
7391 case 'A': /* Start assertion */
7392 case 'b': case 'B': /* Word-boundary assertion*/
7393 case 'C': /* Single char !DANGEROUS! */
7394 case 'd': case 'D': /* digit class */
7395 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7396 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7397 case 'k': case 'K': /* named backref, keep marker */
7398 case 'N': /* named char sequence */
38a44b82 7399 case 'p': case 'P': /* Unicode property */
e1d1eefb 7400 case 'R': /* LNBREAK */
ee9b8eae 7401 case 's': case 'S': /* space class */
e1d1eefb 7402 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7403 case 'w': case 'W': /* word class */
7404 case 'X': /* eXtended Unicode "combining character sequence" */
7405 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7406 --p;
7407 goto loopdone;
ee9b8eae
YO
7408
7409 /* Anything after here is an escape that resolves to a
7410 literal. (Except digits, which may or may not)
7411 */
a0d0e21e
LW
7412 case 'n':
7413 ender = '\n';
7414 p++;
a687059c 7415 break;
a0d0e21e
LW
7416 case 'r':
7417 ender = '\r';
7418 p++;
a687059c 7419 break;
a0d0e21e
LW
7420 case 't':
7421 ender = '\t';
7422 p++;
a687059c 7423 break;
a0d0e21e
LW
7424 case 'f':
7425 ender = '\f';
7426 p++;
a687059c 7427 break;
a0d0e21e 7428 case 'e':
c7f1f016 7429 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7430 p++;
a687059c 7431 break;
a0d0e21e 7432 case 'a':
c7f1f016 7433 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7434 p++;
a687059c 7435 break;
a0d0e21e 7436 case 'x':
a0ed51b3 7437 if (*++p == '{') {
1df70142 7438 char* const e = strchr(p, '}');
b81d288d 7439
b45f050a 7440 if (!e) {
830247a4 7441 RExC_parse = p + 1;
b45f050a
JF
7442 vFAIL("Missing right brace on \\x{}");
7443 }
de5f0749 7444 else {
a4c04bdc
NC
7445 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7446 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7447 STRLEN numlen = e - p - 1;
53305cf1 7448 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7449 if (ender > 0xff)
7450 RExC_utf8 = 1;
a0ed51b3
LW
7451 p = e + 1;
7452 }
a0ed51b3
LW
7453 }
7454 else {
a4c04bdc 7455 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7456 STRLEN numlen = 2;
53305cf1 7457 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7458 p += numlen;
7459 }
9e08bc66
TS
7460 if (PL_encoding && ender < 0x100)
7461 goto recode_encoding;
a687059c 7462 break;
a0d0e21e
LW
7463 case 'c':
7464 p++;
bbce6d69 7465 ender = UCHARAT(p++);
7466 ender = toCTRL(ender);
a687059c 7467 break;
a0d0e21e
LW
7468 case '0': case '1': case '2': case '3':case '4':
7469 case '5': case '6': case '7': case '8':case '9':
7470 if (*p == '0' ||
830247a4 7471 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7472 I32 flags = 0;
1df70142 7473 STRLEN numlen = 3;
53305cf1 7474 ender = grok_oct(p, &numlen, &flags, NULL);
609122bd
KW
7475
7476 /* An octal above 0xff is interpreted differently
7477 * depending on if the re is in utf8 or not. If it
7478 * is in utf8, the value will be itself, otherwise
7479 * it is interpreted as modulo 0x100. It has been
7480 * decided to discourage the use of octal above the
7481 * single-byte range. For now, warn only when
7482 * it ends up modulo */
7483 if (SIZE_ONLY && ender >= 0x100
668c081a
NC
7484 && ! UTF && ! PL_encoding) {
7485 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
609122bd 7486 }
a0d0e21e
LW
7487 p += numlen;
7488 }
7489 else {
7490 --p;
7491 goto loopdone;
a687059c 7492 }
9e08bc66
TS
7493 if (PL_encoding && ender < 0x100)
7494 goto recode_encoding;
7495 break;
7496 recode_encoding:
7497 {
7498 SV* enc = PL_encoding;
7499 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7500 if (!enc && SIZE_ONLY)
7501 ckWARNreg(p, "Invalid escape in the specified encoding");
9e08bc66
TS
7502 RExC_utf8 = 1;
7503 }
a687059c 7504 break;
a0d0e21e 7505 case '\0':
830247a4 7506 if (p >= RExC_end)
b45f050a 7507 FAIL("Trailing \\");
a687059c 7508 /* FALL THROUGH */
a0d0e21e 7509 default:
668c081a
NC
7510 if (!SIZE_ONLY&& isALPHA(*p))
7511 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7512 goto normal_default;
a0d0e21e
LW
7513 }
7514 break;
a687059c 7515 default:
a0ed51b3 7516 normal_default:
fd400ab9 7517 if (UTF8_IS_START(*p) && UTF) {
1df70142 7518 STRLEN numlen;
5e12f4fb 7519 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7520 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7521 p += numlen;
7522 }
7523 else
7524 ender = *p++;
a0d0e21e 7525 break;
a687059c 7526 }
bcdf7404
YO
7527 if ( RExC_flags & RXf_PMf_EXTENDED)
7528 p = regwhite( pRExC_state, p );
60a8b682
JH
7529 if (UTF && FOLD) {
7530 /* Prime the casefolded buffer. */
ac7e0132 7531 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7532 }
bcdf7404 7533 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7534 if (len)
7535 p = oldp;
16ea2a2e 7536 else if (UTF) {
80aecb99 7537 if (FOLD) {
60a8b682 7538 /* Emit all the Unicode characters. */
1df70142 7539 STRLEN numlen;
80aecb99
JH
7540 for (foldbuf = tmpbuf;
7541 foldlen;
7542 foldlen -= numlen) {
7543 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7544 if (numlen > 0) {
71207a34 7545 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7546 s += unilen;
7547 len += unilen;
7548 /* In EBCDIC the numlen
7549 * and unilen can differ. */
9dc45d57 7550 foldbuf += numlen;
47654450
JH
7551 if (numlen >= foldlen)
7552 break;
9dc45d57
JH
7553 }
7554 else
7555 break; /* "Can't happen." */
80aecb99
JH
7556 }
7557 }
7558 else {
71207a34 7559 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7560 if (unilen > 0) {
0ebc6274
JH
7561 s += unilen;
7562 len += unilen;
9dc45d57 7563 }
80aecb99 7564 }
a0ed51b3 7565 }
a0d0e21e
LW
7566 else {
7567 len++;
eb160463 7568 REGC((char)ender, s++);
a0d0e21e
LW
7569 }
7570 break;
a687059c 7571 }
16ea2a2e 7572 if (UTF) {
80aecb99 7573 if (FOLD) {
60a8b682 7574 /* Emit all the Unicode characters. */
1df70142 7575 STRLEN numlen;
80aecb99
JH
7576 for (foldbuf = tmpbuf;
7577 foldlen;
7578 foldlen -= numlen) {
7579 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7580 if (numlen > 0) {
71207a34 7581 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7582 len += unilen;
7583 s += unilen;
7584 /* In EBCDIC the numlen
7585 * and unilen can differ. */
9dc45d57 7586 foldbuf += numlen;
47654450
JH
7587 if (numlen >= foldlen)
7588 break;
9dc45d57
JH
7589 }
7590 else
7591 break;
80aecb99
JH
7592 }
7593 }
7594 else {
71207a34 7595 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7596 if (unilen > 0) {
0ebc6274
JH
7597 s += unilen;
7598 len += unilen;
9dc45d57 7599 }
80aecb99
JH
7600 }
7601 len--;
a0ed51b3
LW
7602 }
7603 else
eb160463 7604 REGC((char)ender, s++);
a0d0e21e
LW
7605 }
7606 loopdone:
830247a4 7607 RExC_parse = p - 1;
fac92740 7608 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7609 nextchar(pRExC_state);
793db0cb
JH
7610 {
7611 /* len is STRLEN which is unsigned, need to copy to signed */
7612 IV iv = len;
7613 if (iv < 0)
7614 vFAIL("Internal disaster");
7615 }
a0d0e21e
LW
7616 if (len > 0)
7617 *flagp |= HASWIDTH;
090f7165 7618 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7619 *flagp |= SIMPLE;
3dab1dad 7620
cd439c50 7621 if (SIZE_ONLY)
830247a4 7622 RExC_size += STR_SZ(len);
3dab1dad
YO
7623 else {
7624 STR_LEN(ret) = len;
830247a4 7625 RExC_emit += STR_SZ(len);
07be1b83 7626 }
3dab1dad 7627 }
a0d0e21e
LW
7628 break;
7629 }
a687059c 7630
a0d0e21e 7631 return(ret);
a687059c
LW
7632}
7633
873ef191 7634STATIC char *
bcdf7404 7635S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7636{
bcdf7404 7637 const char *e = RExC_end;
7918f24d
NC
7638
7639 PERL_ARGS_ASSERT_REGWHITE;
7640
5b5a24f7
CS
7641 while (p < e) {
7642 if (isSPACE(*p))
7643 ++p;
7644 else if (*p == '#') {
bcdf7404 7645 bool ended = 0;
5b5a24f7 7646 do {
bcdf7404
YO
7647 if (*p++ == '\n') {
7648 ended = 1;
7649 break;
7650 }
7651 } while (p < e);
7652 if (!ended)
7653 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7654 }
7655 else
7656 break;
7657 }
7658 return p;
7659}
7660
b8c5462f
JH
7661/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7662 Character classes ([:foo:]) can also be negated ([:^foo:]).
7663 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7664 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7665 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7666
7667#define POSIXCC_DONE(c) ((c) == ':')
7668#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7669#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7670
b8c5462f 7671STATIC I32
830247a4 7672S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7673{
97aff369 7674 dVAR;
936ed897 7675 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7676
7918f24d
NC
7677 PERL_ARGS_ASSERT_REGPPOSIXCC;
7678
830247a4 7679 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7680 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7681 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7682 const char c = UCHARAT(RExC_parse);
097eb12c 7683 char* const s = RExC_parse++;
b81d288d 7684
9a86a77b 7685 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7686 RExC_parse++;
7687 if (RExC_parse == RExC_end)
620e46c5 7688 /* Grandfather lone [:, [=, [. */
830247a4 7689 RExC_parse = s;
620e46c5 7690 else {
3dab1dad 7691 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7692 assert(*t == c);
7693
9a86a77b 7694 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7695 const char *posixcc = s + 1;
830247a4 7696 RExC_parse++; /* skip over the ending ] */
3dab1dad 7697
b8c5462f 7698 if (*s == ':') {
1df70142
AL
7699 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7700 const I32 skip = t - posixcc;
80916619
NC
7701
7702 /* Initially switch on the length of the name. */
7703 switch (skip) {
7704 case 4:
3dab1dad
YO
7705 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7706 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7707 break;
80916619
NC
7708 case 5:
7709 /* Names all of length 5. */
7710 /* alnum alpha ascii blank cntrl digit graph lower
7711 print punct space upper */
7712 /* Offset 4 gives the best switch position. */
7713 switch (posixcc[4]) {
7714 case 'a':
3dab1dad
YO
7715 if (memEQ(posixcc, "alph", 4)) /* alpha */
7716 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7717 break;
7718 case 'e':
3dab1dad
YO
7719 if (memEQ(posixcc, "spac", 4)) /* space */
7720 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7721 break;
7722 case 'h':
3dab1dad
YO
7723 if (memEQ(posixcc, "grap", 4)) /* graph */
7724 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7725 break;
7726 case 'i':
3dab1dad
YO
7727 if (memEQ(posixcc, "asci", 4)) /* ascii */
7728 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7729 break;
7730 case 'k':
3dab1dad
YO
7731 if (memEQ(posixcc, "blan", 4)) /* blank */
7732 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7733 break;
7734 case 'l':
3dab1dad
YO
7735 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7736 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7737 break;
7738 case 'm':
3dab1dad
YO
7739 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7740 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7741 break;
7742 case 'r':
3dab1dad
YO
7743 if (memEQ(posixcc, "lowe", 4)) /* lower */
7744 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7745 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7746 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7747 break;
7748 case 't':
3dab1dad
YO
7749 if (memEQ(posixcc, "digi", 4)) /* digit */
7750 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7751 else if (memEQ(posixcc, "prin", 4)) /* print */
7752 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7753 else if (memEQ(posixcc, "punc", 4)) /* punct */
7754 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7755 break;
b8c5462f
JH
7756 }
7757 break;
80916619 7758 case 6:
3dab1dad
YO
7759 if (memEQ(posixcc, "xdigit", 6))
7760 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7761 break;
7762 }
80916619
NC
7763
7764 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7765 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7766 t - s - 1, s + 1);
80916619
NC
7767 assert (posixcc[skip] == ':');
7768 assert (posixcc[skip+1] == ']');
b45f050a 7769 } else if (!SIZE_ONLY) {
b8c5462f 7770 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7771
830247a4 7772 /* adjust RExC_parse so the warning shows after
b45f050a 7773 the class closes */
9a86a77b 7774 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7775 RExC_parse++;
b45f050a
JF
7776 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7777 }
b8c5462f
JH
7778 } else {
7779 /* Maternal grandfather:
7780 * "[:" ending in ":" but not in ":]" */
830247a4 7781 RExC_parse = s;
767d463e 7782 }
620e46c5
JH
7783 }
7784 }
7785
b8c5462f
JH
7786 return namedclass;
7787}
7788
7789STATIC void
830247a4 7790S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7791{
97aff369 7792 dVAR;
7918f24d
NC
7793
7794 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7795
3dab1dad 7796 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7797 const char *s = RExC_parse;
7798 const char c = *s++;
b8c5462f 7799
3dab1dad 7800 while (isALNUM(*s))
b8c5462f
JH
7801 s++;
7802 if (*s && c == *s && s[1] == ']') {
668c081a
NC
7803 ckWARN3reg(s+2,
7804 "POSIX syntax [%c %c] belongs inside character classes",
7805 c, c);
b45f050a
JF
7806
7807 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7808 if (POSIXCC_NOTYET(c)) {
830247a4 7809 /* adjust RExC_parse so the error shows after
b45f050a 7810 the class closes */
9a86a77b 7811 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7812 NOOP;
b45f050a
JF
7813 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7814 }
b8c5462f
JH
7815 }
7816 }
620e46c5
JH
7817}
7818
7f6f358c 7819
89836f1f
YO
7820#define _C_C_T_(NAME,TEST,WORD) \
7821ANYOF_##NAME: \
7822 if (LOC) \
7823 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7824 else { \
7825 for (value = 0; value < 256; value++) \
7826 if (TEST) \
7827 ANYOF_BITMAP_SET(ret, value); \
7828 } \
7829 yesno = '+'; \
7830 what = WORD; \
7831 break; \
7832case ANYOF_N##NAME: \
7833 if (LOC) \
7834 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7835 else { \
7836 for (value = 0; value < 256; value++) \
7837 if (!TEST) \
7838 ANYOF_BITMAP_SET(ret, value); \
7839 } \
7840 yesno = '!'; \
7841 what = WORD; \
7842 break
7843
e1d1eefb
YO
7844#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7845ANYOF_##NAME: \
7846 for (value = 0; value < 256; value++) \
7847 if (TEST) \
7848 ANYOF_BITMAP_SET(ret, value); \
7849 yesno = '+'; \
7850 what = WORD; \
7851 break; \
7852case ANYOF_N##NAME: \
7853 for (value = 0; value < 256; value++) \
7854 if (!TEST) \
7855 ANYOF_BITMAP_SET(ret, value); \
7856 yesno = '!'; \
7857 what = WORD; \
7858 break
89836f1f 7859
da7fcca4
YO
7860/*
7861 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7862 so that it is possible to override the option here without having to
7863 rebuild the entire core. as we are required to do if we change regcomp.h
7864 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7865*/
7866#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7867#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7868#endif
7869
7870#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7871#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7872#else
7873#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7874#endif
7875
7f6f358c
YO
7876/*
7877 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7878 matches the pattern or if the pattern matches a single char only and
7879 that char is < 256 and we are case insensitive then we produce an
7880 EXACT node instead.
7f6f358c 7881*/
89836f1f 7882
76e3520e 7883STATIC regnode *
3dab1dad 7884S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7885{
97aff369 7886 dVAR;
9a86a77b 7887 register UV nextvalue;
3568d838 7888 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7889 register IV range = 0;
e1d1eefb 7890 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7891 register regnode *ret;
ba210ebe 7892 STRLEN numlen;
ffc61ed2 7893 IV namedclass;
cbbf8932 7894 char *rangebegin = NULL;
936ed897 7895 bool need_class = 0;
c445ea15 7896 SV *listsv = NULL;
ffc61ed2 7897 UV n;
9e55ce06 7898 bool optimize_invert = TRUE;
cbbf8932 7899 AV* unicode_alternate = NULL;
1b2d223b
JH
7900#ifdef EBCDIC
7901 UV literal_endpoint = 0;
7902#endif
7f6f358c 7903 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7904
3dab1dad 7905 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7906 case we need to change the emitted regop to an EXACT. */
07be1b83 7907 const char * orig_parse = RExC_parse;
72f13be8 7908 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7909
7910 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
7911#ifndef DEBUGGING
7912 PERL_UNUSED_ARG(depth);
7913#endif
72f13be8 7914
3dab1dad 7915 DEBUG_PARSE("clas");
7f6f358c
YO
7916
7917 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7918 ret = reganode(pRExC_state, ANYOF, 0);
7919
7920 if (!SIZE_ONLY)
7921 ANYOF_FLAGS(ret) = 0;
7922
9a86a77b 7923 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7924 RExC_naughty++;
7925 RExC_parse++;
7926 if (!SIZE_ONLY)
7927 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7928 }
a0d0e21e 7929
73060fc4 7930 if (SIZE_ONLY) {
830247a4 7931 RExC_size += ANYOF_SKIP;
73060fc4
JH
7932 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7933 }
936ed897 7934 else {
830247a4 7935 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7936 if (FOLD)
7937 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7938 if (LOC)
7939 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7940 ANYOF_BITMAP_ZERO(ret);
396482e1 7941 listsv = newSVpvs("# comment\n");
a0d0e21e 7942 }
b8c5462f 7943
9a86a77b
JH
7944 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7945
b938889d 7946 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7947 checkposixcc(pRExC_state);
b8c5462f 7948
f064b6ad
HS
7949 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7950 if (UCHARAT(RExC_parse) == ']')
7951 goto charclassloop;
ffc61ed2 7952
fc8cd66c 7953parseit:
9a86a77b 7954 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7955
7956 charclassloop:
7957
7958 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7959
73b437c8 7960 if (!range)
830247a4 7961 rangebegin = RExC_parse;
ffc61ed2 7962 if (UTF) {
5e12f4fb 7963 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7964 RExC_end - RExC_parse,
9f7f3913 7965 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7966 RExC_parse += numlen;
7967 }
7968 else
7969 value = UCHARAT(RExC_parse++);
7f6f358c 7970
9a86a77b
JH
7971 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7972 if (value == '[' && POSIXCC(nextvalue))
830247a4 7973 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7974 else if (value == '\\') {
ffc61ed2 7975 if (UTF) {
5e12f4fb 7976 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7977 RExC_end - RExC_parse,
9f7f3913 7978 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7979 RExC_parse += numlen;
7980 }
7981 else
7982 value = UCHARAT(RExC_parse++);
470c3474 7983 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7984 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7985 * be a problem later if we want switch on Unicode.
7986 * A similar issue a little bit later when switching on
7987 * namedclass. --jhi */
ffc61ed2 7988 switch ((I32)value) {
b8c5462f
JH
7989 case 'w': namedclass = ANYOF_ALNUM; break;
7990 case 'W': namedclass = ANYOF_NALNUM; break;
7991 case 's': namedclass = ANYOF_SPACE; break;
7992 case 'S': namedclass = ANYOF_NSPACE; break;
7993 case 'd': namedclass = ANYOF_DIGIT; break;
7994 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
7995 case 'v': namedclass = ANYOF_VERTWS; break;
7996 case 'V': namedclass = ANYOF_NVERTWS; break;
7997 case 'h': namedclass = ANYOF_HORIZWS; break;
7998 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
7999 case 'N': /* Handle \N{NAME} in class */
8000 {
8001 /* We only pay attention to the first char of
8002 multichar strings being returned. I kinda wonder
8003 if this makes sense as it does change the behaviour
8004 from earlier versions, OTOH that behaviour was broken
8005 as well. */
8006 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8007 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8008 goto parseit;
8009 }
8010 value= v;
8011 }
8012 break;
ffc61ed2
JH
8013 case 'p':
8014 case 'P':
3dab1dad
YO
8015 {
8016 char *e;
af6f566e 8017 if (RExC_parse >= RExC_end)
2a4859cd 8018 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8019 if (*RExC_parse == '{') {
1df70142 8020 const U8 c = (U8)value;
ffc61ed2
JH
8021 e = strchr(RExC_parse++, '}');
8022 if (!e)
0da60cf5 8023 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8024 while (isSPACE(UCHARAT(RExC_parse)))
8025 RExC_parse++;
8026 if (e == RExC_parse)
0da60cf5 8027 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8028 n = e - RExC_parse;
ab13f0c7
JH
8029 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8030 n--;
ffc61ed2
JH
8031 }
8032 else {
8033 e = RExC_parse;
8034 n = 1;
8035 }
8036 if (!SIZE_ONLY) {
ab13f0c7
JH
8037 if (UCHARAT(RExC_parse) == '^') {
8038 RExC_parse++;
8039 n--;
8040 value = value == 'p' ? 'P' : 'p'; /* toggle */
8041 while (isSPACE(UCHARAT(RExC_parse))) {
8042 RExC_parse++;
8043 n--;
8044 }
8045 }
097eb12c
AL
8046 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8047 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8048 }
8049 RExC_parse = e + 1;
8050 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8051 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8052 }
f81125e2 8053 break;
b8c5462f
JH
8054 case 'n': value = '\n'; break;
8055 case 'r': value = '\r'; break;
8056 case 't': value = '\t'; break;
8057 case 'f': value = '\f'; break;
8058 case 'b': value = '\b'; break;
c7f1f016
NIS
8059 case 'e': value = ASCII_TO_NATIVE('\033');break;
8060 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 8061 case 'x':
ffc61ed2 8062 if (*RExC_parse == '{') {
a4c04bdc
NC
8063 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8064 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8065 char * const e = strchr(RExC_parse++, '}');
b81d288d 8066 if (!e)
ffc61ed2 8067 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8068
8069 numlen = e - RExC_parse;
8070 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8071 RExC_parse = e + 1;
8072 }
8073 else {
a4c04bdc 8074 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8075 numlen = 2;
8076 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8077 RExC_parse += numlen;
8078 }
9e08bc66
TS
8079 if (PL_encoding && value < 0x100)
8080 goto recode_encoding;
b8c5462f
JH
8081 break;
8082 case 'c':
830247a4 8083 value = UCHARAT(RExC_parse++);
b8c5462f
JH
8084 value = toCTRL(value);
8085 break;
8086 case '0': case '1': case '2': case '3': case '4':
8087 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
8088 {
8089 I32 flags = 0;
8090 numlen = 3;
8091 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8092 RExC_parse += numlen;
8093 if (PL_encoding && value < 0x100)
8094 goto recode_encoding;
8095 break;
8096 }
8097 recode_encoding:
8098 {
8099 SV* enc = PL_encoding;
8100 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8101 if (!enc && SIZE_ONLY)
8102 ckWARNreg(RExC_parse,
8103 "Invalid escape in the specified encoding");
9e08bc66
TS
8104 break;
8105 }
1028017a 8106 default:
668c081a
NC
8107 if (!SIZE_ONLY && isALPHA(value))
8108 ckWARN2reg(RExC_parse,
8109 "Unrecognized escape \\%c in character class passed through",
8110 (int)value);
1028017a 8111 break;
b8c5462f 8112 }
ffc61ed2 8113 } /* end of \blah */
1b2d223b
JH
8114#ifdef EBCDIC
8115 else
8116 literal_endpoint++;
8117#endif
ffc61ed2
JH
8118
8119 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8120
8121 if (!SIZE_ONLY && !need_class)
936ed897 8122 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8123
936ed897 8124 need_class = 1;
ffc61ed2
JH
8125
8126 /* a bad range like a-\d, a-[:digit:] ? */
8127 if (range) {
73b437c8 8128 if (!SIZE_ONLY) {
668c081a
NC
8129 const int w =
8130 RExC_parse >= rangebegin ?
8131 RExC_parse - rangebegin : 0;
8132 ckWARN4reg(RExC_parse,
b45f050a 8133 "False [] range \"%*.*s\"",
097eb12c 8134 w, w, rangebegin);
668c081a 8135
3568d838
JH
8136 if (prevvalue < 256) {
8137 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8138 ANYOF_BITMAP_SET(ret, '-');
8139 }
8140 else {
8141 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8142 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8143 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8144 }
b8c5462f 8145 }
ffc61ed2
JH
8146
8147 range = 0; /* this was not a true range */
73b437c8 8148 }
ffc61ed2 8149
89836f1f
YO
8150
8151
73b437c8 8152 if (!SIZE_ONLY) {
c49a72a9
NC
8153 const char *what = NULL;
8154 char yesno = 0;
8155
3568d838
JH
8156 if (namedclass > OOB_NAMEDCLASS)
8157 optimize_invert = FALSE;
e2962f66
JH
8158 /* Possible truncation here but in some 64-bit environments
8159 * the compiler gets heartburn about switch on 64-bit values.
8160 * A similar issue a little earlier when switching on value.
98f323fa 8161 * --jhi */
e2962f66 8162 switch ((I32)namedclass) {
da7fcca4
YO
8163
8164 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8165 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8166 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8167 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8168 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8169 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8170 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8171 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8172 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8173 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8174#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8175 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8176 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8177#else
8178 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8179 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8180#endif
89836f1f 8181 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8182 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8183 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8184 case ANYOF_ASCII:
8185 if (LOC)
936ed897 8186 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8187 else {
c7f1f016 8188#ifndef EBCDIC
1ba5c669
JH
8189 for (value = 0; value < 128; value++)
8190 ANYOF_BITMAP_SET(ret, value);
8191#else /* EBCDIC */
ffbc6a93 8192 for (value = 0; value < 256; value++) {
3a3c4447
JH
8193 if (isASCII(value))
8194 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8195 }
1ba5c669 8196#endif /* EBCDIC */
73b437c8 8197 }
c49a72a9
NC
8198 yesno = '+';
8199 what = "ASCII";
73b437c8
JH
8200 break;
8201 case ANYOF_NASCII:
8202 if (LOC)
936ed897 8203 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8204 else {
c7f1f016 8205#ifndef EBCDIC
1ba5c669
JH
8206 for (value = 128; value < 256; value++)
8207 ANYOF_BITMAP_SET(ret, value);
8208#else /* EBCDIC */
ffbc6a93 8209 for (value = 0; value < 256; value++) {
3a3c4447
JH
8210 if (!isASCII(value))
8211 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8212 }
1ba5c669 8213#endif /* EBCDIC */
73b437c8 8214 }
c49a72a9
NC
8215 yesno = '!';
8216 what = "ASCII";
89836f1f 8217 break;
ffc61ed2
JH
8218 case ANYOF_DIGIT:
8219 if (LOC)
8220 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8221 else {
8222 /* consecutive digits assumed */
8223 for (value = '0'; value <= '9'; value++)
8224 ANYOF_BITMAP_SET(ret, value);
8225 }
c49a72a9 8226 yesno = '+';
da7fcca4 8227 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8228 break;
8229 case ANYOF_NDIGIT:
8230 if (LOC)
8231 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8232 else {
8233 /* consecutive digits assumed */
8234 for (value = 0; value < '0'; value++)
8235 ANYOF_BITMAP_SET(ret, value);
8236 for (value = '9' + 1; value < 256; value++)
8237 ANYOF_BITMAP_SET(ret, value);
8238 }
c49a72a9 8239 yesno = '!';
da7fcca4 8240 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8241 break;
f81125e2
JP
8242 case ANYOF_MAX:
8243 /* this is to handle \p and \P */
8244 break;
73b437c8 8245 default:
b45f050a 8246 vFAIL("Invalid [::] class");
73b437c8 8247 break;
b8c5462f 8248 }
c49a72a9
NC
8249 if (what) {
8250 /* Strings such as "+utf8::isWord\n" */
8251 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8252 }
b8c5462f 8253 if (LOC)
936ed897 8254 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8255 continue;
a0d0e21e 8256 }
ffc61ed2
JH
8257 } /* end of namedclass \blah */
8258
a0d0e21e 8259 if (range) {
eb160463 8260 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8261 const int w = RExC_parse - rangebegin;
8262 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8263 range = 0; /* not a valid range */
73b437c8 8264 }
a0d0e21e
LW
8265 }
8266 else {
3568d838 8267 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8268 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8269 RExC_parse[1] != ']') {
8270 RExC_parse++;
ffc61ed2
JH
8271
8272 /* a bad range like \w-, [:word:]- ? */
8273 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8274 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8275 const int w =
afd78fd5
JH
8276 RExC_parse >= rangebegin ?
8277 RExC_parse - rangebegin : 0;
830247a4 8278 vWARN4(RExC_parse,
b45f050a 8279 "False [] range \"%*.*s\"",
097eb12c 8280 w, w, rangebegin);
afd78fd5 8281 }
73b437c8 8282 if (!SIZE_ONLY)
936ed897 8283 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8284 } else
ffc61ed2
JH
8285 range = 1; /* yeah, it's a range! */
8286 continue; /* but do it the next time */
a0d0e21e 8287 }
a687059c 8288 }
ffc61ed2 8289
93733859 8290 /* now is the next time */
07be1b83 8291 /*stored += (value - prevvalue + 1);*/
ae5c130c 8292 if (!SIZE_ONLY) {
3568d838 8293 if (prevvalue < 256) {
1df70142 8294 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8295 IV i;
3568d838 8296#ifdef EBCDIC
1b2d223b
JH
8297 /* In EBCDIC [\x89-\x91] should include
8298 * the \x8e but [i-j] should not. */
8299 if (literal_endpoint == 2 &&
8300 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8301 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8302 {
3568d838
JH
8303 if (isLOWER(prevvalue)) {
8304 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8305 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8306 stored++;
ffc61ed2 8307 ANYOF_BITMAP_SET(ret, i);
2670d666 8308 }
ffc61ed2 8309 } else {
3568d838 8310 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8311 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8312 stored++;
ffc61ed2 8313 ANYOF_BITMAP_SET(ret, i);
2670d666 8314 }
ffc61ed2 8315 }
8ada0baa 8316 }
ffc61ed2 8317 else
8ada0baa 8318#endif
07be1b83
YO
8319 for (i = prevvalue; i <= ceilvalue; i++) {
8320 if (!ANYOF_BITMAP_TEST(ret,i)) {
8321 stored++;
8322 ANYOF_BITMAP_SET(ret, i);
8323 }
8324 }
3568d838 8325 }
a5961de5 8326 if (value > 255 || UTF) {
1df70142
AL
8327 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8328 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8329 stored+=2; /* can't optimize this class */
ffc61ed2 8330 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8331 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8332 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8333 prevnatvalue, natvalue);
8334 }
8335 else if (prevnatvalue == natvalue) {
8336 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8337 if (FOLD) {
89ebb4a3 8338 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8339 STRLEN foldlen;
1df70142 8340 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8341
e294cc5d
JH
8342#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8343 if (RExC_precomp[0] == ':' &&
8344 RExC_precomp[1] == '[' &&
8345 (f == 0xDF || f == 0x92)) {
8346 f = NATIVE_TO_UNI(f);
8347 }
8348#endif
c840d2a2
JH
8349 /* If folding and foldable and a single
8350 * character, insert also the folded version
8351 * to the charclass. */
9e55ce06 8352 if (f != value) {
e294cc5d
JH
8353#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8354 if ((RExC_precomp[0] == ':' &&
8355 RExC_precomp[1] == '[' &&
8356 (f == 0xA2 &&
8357 (value == 0xFB05 || value == 0xFB06))) ?
8358 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8359 foldlen == (STRLEN)UNISKIP(f) )
8360#else
eb160463 8361 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8362#endif
9e55ce06
JH
8363 Perl_sv_catpvf(aTHX_ listsv,
8364 "%04"UVxf"\n", f);
8365 else {
8366 /* Any multicharacter foldings
8367 * require the following transform:
8368 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8369 * where E folds into "pq" and F folds
8370 * into "rst", all other characters
8371 * fold to single characters. We save
8372 * away these multicharacter foldings,
8373 * to be later saved as part of the
8374 * additional "s" data. */
8375 SV *sv;
8376
8377 if (!unicode_alternate)
8378 unicode_alternate = newAV();
740cce10
NC
8379 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8380 TRUE);
9e55ce06
JH
8381 av_push(unicode_alternate, sv);
8382 }
8383 }
254ba52a 8384
60a8b682
JH
8385 /* If folding and the value is one of the Greek
8386 * sigmas insert a few more sigmas to make the
8387 * folding rules of the sigmas to work right.
8388 * Note that not all the possible combinations
8389 * are handled here: some of them are handled
9e55ce06
JH
8390 * by the standard folding rules, and some of
8391 * them (literal or EXACTF cases) are handled
8392 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8393 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8394 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8395 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8396 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8397 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8398 }
8399 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8400 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8401 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8402 }
8403 }
ffc61ed2 8404 }
1b2d223b
JH
8405#ifdef EBCDIC
8406 literal_endpoint = 0;
8407#endif
8ada0baa 8408 }
ffc61ed2
JH
8409
8410 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8411 }
ffc61ed2 8412
936ed897 8413 if (need_class) {
4f66b38d 8414 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8415 if (SIZE_ONLY)
830247a4 8416 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8417 else
830247a4 8418 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8419 }
ffc61ed2 8420
7f6f358c
YO
8421
8422 if (SIZE_ONLY)
8423 return ret;
8424 /****** !SIZE_ONLY AFTER HERE *********/
8425
3b57cd43 8426 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8427 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8428 ) {
8429 /* optimize single char class to an EXACT node
8430 but *only* when its not a UTF/high char */
07be1b83
YO
8431 const char * cur_parse= RExC_parse;
8432 RExC_emit = (regnode *)orig_emit;
8433 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8434 ret = reg_node(pRExC_state,
8435 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8436 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8437 *STRING(ret)= (char)value;
8438 STR_LEN(ret)= 1;
8439 RExC_emit += STR_SZ(1);
b023a8ee
MHM
8440 if (listsv) {
8441 SvREFCNT_dec(listsv);
8442 }
7f6f358c
YO
8443 return ret;
8444 }
ae5c130c 8445 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8446 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8447 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8448 ) {
a0ed51b3 8449 for (value = 0; value < 256; ++value) {
936ed897 8450 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8451 UV fold = PL_fold[value];
ffc61ed2
JH
8452
8453 if (fold != value)
8454 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8455 }
8456 }
936ed897 8457 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8458 }
ffc61ed2 8459
ae5c130c 8460 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8461 if (optimize_invert &&
ffc61ed2
JH
8462 /* If the only flag is inversion. */
8463 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8464 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8465 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8466 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8467 }
7f6f358c 8468 {
097eb12c 8469 AV * const av = newAV();
ffc61ed2 8470 SV *rv;
9e55ce06 8471 /* The 0th element stores the character class description
6a0407ee 8472 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8473 * to initialize the appropriate swash (which gets stored in
8474 * the 1st element), and also useful for dumping the regnode.
8475 * The 2nd element stores the multicharacter foldings,
6a0407ee 8476 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8477 av_store(av, 0, listsv);
8478 av_store(av, 1, NULL);
ad64d0ec
NC
8479 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8480 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8481 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8482 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8483 ARG_SET(ret, n);
a0ed51b3 8484 }
a0ed51b3
LW
8485 return ret;
8486}
89836f1f
YO
8487#undef _C_C_T_
8488
a0ed51b3 8489
bcdf7404
YO
8490/* reg_skipcomment()
8491
8492 Absorbs an /x style # comments from the input stream.
8493 Returns true if there is more text remaining in the stream.
8494 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8495 terminates the pattern without including a newline.
8496
8497 Note its the callers responsibility to ensure that we are
8498 actually in /x mode
8499
8500*/
8501
8502STATIC bool
8503S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8504{
8505 bool ended = 0;
7918f24d
NC
8506
8507 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8508
bcdf7404
YO
8509 while (RExC_parse < RExC_end)
8510 if (*RExC_parse++ == '\n') {
8511 ended = 1;
8512 break;
8513 }
8514 if (!ended) {
8515 /* we ran off the end of the pattern without ending
8516 the comment, so we have to add an \n when wrapping */
8517 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8518 return 0;
8519 } else
8520 return 1;
8521}
8522
8523/* nextchar()
8524
8525 Advance that parse position, and optionall absorbs
8526 "whitespace" from the inputstream.
8527
8528 Without /x "whitespace" means (?#...) style comments only,
8529 with /x this means (?#...) and # comments and whitespace proper.
8530
8531 Returns the RExC_parse point from BEFORE the scan occurs.
8532
8533 This is the /x friendly way of saying RExC_parse++.
8534*/
8535
76e3520e 8536STATIC char*
830247a4 8537S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8538{
097eb12c 8539 char* const retval = RExC_parse++;
a0d0e21e 8540
7918f24d
NC
8541 PERL_ARGS_ASSERT_NEXTCHAR;
8542
4633a7c4 8543 for (;;) {
830247a4
IZ
8544 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8545 RExC_parse[2] == '#') {
e994fd66
AE
8546 while (*RExC_parse != ')') {
8547 if (RExC_parse == RExC_end)
8548 FAIL("Sequence (?#... not terminated");
830247a4 8549 RExC_parse++;
e994fd66 8550 }
830247a4 8551 RExC_parse++;
4633a7c4
LW
8552 continue;
8553 }
bbe252da 8554 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8555 if (isSPACE(*RExC_parse)) {
8556 RExC_parse++;
748a9306
LW
8557 continue;
8558 }
830247a4 8559 else if (*RExC_parse == '#') {
bcdf7404
YO
8560 if ( reg_skipcomment( pRExC_state ) )
8561 continue;
748a9306 8562 }
748a9306 8563 }
4633a7c4 8564 return retval;
a0d0e21e 8565 }
a687059c
LW
8566}
8567
8568/*
c277df42 8569- reg_node - emit a node
a0d0e21e 8570*/
76e3520e 8571STATIC regnode * /* Location. */
830247a4 8572S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8573{
97aff369 8574 dVAR;
c277df42 8575 register regnode *ptr;
504618e9 8576 regnode * const ret = RExC_emit;
07be1b83 8577 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8578
7918f24d
NC
8579 PERL_ARGS_ASSERT_REG_NODE;
8580
c277df42 8581 if (SIZE_ONLY) {
830247a4
IZ
8582 SIZE_ALIGN(RExC_size);
8583 RExC_size += 1;
a0d0e21e
LW
8584 return(ret);
8585 }
3b57cd43
YO
8586 if (RExC_emit >= RExC_emit_bound)
8587 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8588
c277df42 8589 NODE_ALIGN_FILL(ret);
a0d0e21e 8590 ptr = ret;
c277df42 8591 FILL_ADVANCE_NODE(ptr, op);
7122b237 8592#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8593 if (RExC_offsets) { /* MJD */
07be1b83 8594 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8595 "reg_node", __LINE__,
13d6edb4 8596 PL_reg_name[op],
07be1b83
YO
8597 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8598 ? "Overwriting end of array!\n" : "OK",
8599 (UV)(RExC_emit - RExC_emit_start),
8600 (UV)(RExC_parse - RExC_start),
8601 (UV)RExC_offsets[0]));
ccb2c380 8602 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8603 }
7122b237 8604#endif
830247a4 8605 RExC_emit = ptr;
a0d0e21e 8606 return(ret);
a687059c
LW
8607}
8608
8609/*
a0d0e21e
LW
8610- reganode - emit a node with an argument
8611*/
76e3520e 8612STATIC regnode * /* Location. */
830247a4 8613S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8614{
97aff369 8615 dVAR;
c277df42 8616 register regnode *ptr;
504618e9 8617 regnode * const ret = RExC_emit;
07be1b83 8618 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8619
7918f24d
NC
8620 PERL_ARGS_ASSERT_REGANODE;
8621
c277df42 8622 if (SIZE_ONLY) {
830247a4
IZ
8623 SIZE_ALIGN(RExC_size);
8624 RExC_size += 2;
6bda09f9
YO
8625 /*
8626 We can't do this:
8627
8628 assert(2==regarglen[op]+1);
8629
8630 Anything larger than this has to allocate the extra amount.
8631 If we changed this to be:
8632
8633 RExC_size += (1 + regarglen[op]);
8634
8635 then it wouldn't matter. Its not clear what side effect
8636 might come from that so its not done so far.
8637 -- dmq
8638 */
a0d0e21e
LW
8639 return(ret);
8640 }
3b57cd43
YO
8641 if (RExC_emit >= RExC_emit_bound)
8642 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8643
c277df42 8644 NODE_ALIGN_FILL(ret);
a0d0e21e 8645 ptr = ret;
c277df42 8646 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8647#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8648 if (RExC_offsets) { /* MJD */
07be1b83 8649 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8650 "reganode",
ccb2c380 8651 __LINE__,
13d6edb4 8652 PL_reg_name[op],
07be1b83 8653 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8654 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8655 (UV)(RExC_emit - RExC_emit_start),
8656 (UV)(RExC_parse - RExC_start),
8657 (UV)RExC_offsets[0]));
ccb2c380 8658 Set_Cur_Node_Offset;
fac92740 8659 }
7122b237 8660#endif
830247a4 8661 RExC_emit = ptr;
a0d0e21e 8662 return(ret);
fe14fcc3
LW
8663}
8664
8665/*
cd439c50 8666- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8667*/
71207a34
AL
8668STATIC STRLEN
8669S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8670{
97aff369 8671 dVAR;
7918f24d
NC
8672
8673 PERL_ARGS_ASSERT_REGUNI;
8674
71207a34 8675 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8676}
8677
8678/*
a0d0e21e
LW
8679- reginsert - insert an operator in front of already-emitted operand
8680*
8681* Means relocating the operand.
8682*/
76e3520e 8683STATIC void
6bda09f9 8684S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8685{
97aff369 8686 dVAR;
c277df42
IZ
8687 register regnode *src;
8688 register regnode *dst;
8689 register regnode *place;
504618e9 8690 const int offset = regarglen[(U8)op];
6bda09f9 8691 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8692 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8693
8694 PERL_ARGS_ASSERT_REGINSERT;
def51078 8695 PERL_UNUSED_ARG(depth);
22c35a8c 8696/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8697 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8698 if (SIZE_ONLY) {
6bda09f9 8699 RExC_size += size;
a0d0e21e
LW
8700 return;
8701 }
a687059c 8702
830247a4 8703 src = RExC_emit;
6bda09f9 8704 RExC_emit += size;
830247a4 8705 dst = RExC_emit;
40d049e4 8706 if (RExC_open_parens) {
6bda09f9 8707 int paren;
3b57cd43 8708 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8709 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8710 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8711 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8712 RExC_open_parens[paren] += size;
8713 } else {
3b57cd43 8714 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8715 }
8716 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8717 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8718 RExC_close_parens[paren] += size;
8719 } else {
3b57cd43 8720 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8721 }
8722 }
6bda09f9 8723 }
40d049e4 8724
fac92740 8725 while (src > opnd) {
c277df42 8726 StructCopy(--src, --dst, regnode);
7122b237 8727#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8728 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8729 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8730 "reg_insert",
ccb2c380 8731 __LINE__,
13d6edb4 8732 PL_reg_name[op],
07be1b83
YO
8733 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8734 ? "Overwriting end of array!\n" : "OK",
8735 (UV)(src - RExC_emit_start),
8736 (UV)(dst - RExC_emit_start),
8737 (UV)RExC_offsets[0]));
ccb2c380
MP
8738 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8739 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8740 }
7122b237 8741#endif
fac92740
MJD
8742 }
8743
a0d0e21e
LW
8744
8745 place = opnd; /* Op node, where operand used to be. */
7122b237 8746#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8747 if (RExC_offsets) { /* MJD */
07be1b83 8748 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8749 "reginsert",
ccb2c380 8750 __LINE__,
13d6edb4 8751 PL_reg_name[op],
07be1b83 8752 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8753 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8754 (UV)(place - RExC_emit_start),
8755 (UV)(RExC_parse - RExC_start),
786e8c11 8756 (UV)RExC_offsets[0]));
ccb2c380 8757 Set_Node_Offset(place, RExC_parse);
45948336 8758 Set_Node_Length(place, 1);
fac92740 8759 }
7122b237 8760#endif
c277df42
IZ
8761 src = NEXTOPER(place);
8762 FILL_ADVANCE_NODE(place, op);
8763 Zero(src, offset, regnode);
a687059c
LW
8764}
8765
8766/*
c277df42 8767- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8768- SEE ALSO: regtail_study
a0d0e21e 8769*/
097eb12c 8770/* TODO: All three parms should be const */
76e3520e 8771STATIC void
3dab1dad 8772S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8773{
97aff369 8774 dVAR;
c277df42 8775 register regnode *scan;
72f13be8 8776 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8777
8778 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8779#ifndef DEBUGGING
8780 PERL_UNUSED_ARG(depth);
8781#endif
a0d0e21e 8782
c277df42 8783 if (SIZE_ONLY)
a0d0e21e
LW
8784 return;
8785
8786 /* Find last node. */
8787 scan = p;
8788 for (;;) {
504618e9 8789 regnode * const temp = regnext(scan);
3dab1dad
YO
8790 DEBUG_PARSE_r({
8791 SV * const mysv=sv_newmortal();
8792 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8793 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8794 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8795 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8796 (temp == NULL ? "->" : ""),
13d6edb4 8797 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8798 );
3dab1dad
YO
8799 });
8800 if (temp == NULL)
8801 break;
8802 scan = temp;
8803 }
8804
8805 if (reg_off_by_arg[OP(scan)]) {
8806 ARG_SET(scan, val - scan);
8807 }
8808 else {
8809 NEXT_OFF(scan) = val - scan;
8810 }
8811}
8812
07be1b83 8813#ifdef DEBUGGING
3dab1dad
YO
8814/*
8815- regtail_study - set the next-pointer at the end of a node chain of p to val.
8816- Look for optimizable sequences at the same time.
8817- currently only looks for EXACT chains.
07be1b83
YO
8818
8819This is expermental code. The idea is to use this routine to perform
8820in place optimizations on branches and groups as they are constructed,
8821with the long term intention of removing optimization from study_chunk so
8822that it is purely analytical.
8823
8824Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8825to control which is which.
8826
3dab1dad
YO
8827*/
8828/* TODO: All four parms should be const */
07be1b83 8829
3dab1dad
YO
8830STATIC U8
8831S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8832{
8833 dVAR;
8834 register regnode *scan;
07be1b83
YO
8835 U8 exact = PSEUDO;
8836#ifdef EXPERIMENTAL_INPLACESCAN
8837 I32 min = 0;
8838#endif
3dab1dad
YO
8839 GET_RE_DEBUG_FLAGS_DECL;
8840
7918f24d
NC
8841 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8842
07be1b83 8843
3dab1dad
YO
8844 if (SIZE_ONLY)
8845 return exact;
8846
8847 /* Find last node. */
8848
8849 scan = p;
8850 for (;;) {
8851 regnode * const temp = regnext(scan);
07be1b83
YO
8852#ifdef EXPERIMENTAL_INPLACESCAN
8853 if (PL_regkind[OP(scan)] == EXACT)
8854 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8855 return EXACT;
8856#endif
3dab1dad
YO
8857 if ( exact ) {
8858 switch (OP(scan)) {
8859 case EXACT:
8860 case EXACTF:
8861 case EXACTFL:
8862 if( exact == PSEUDO )
8863 exact= OP(scan);
07be1b83
YO
8864 else if ( exact != OP(scan) )
8865 exact= 0;
3dab1dad
YO
8866 case NOTHING:
8867 break;
8868 default:
8869 exact= 0;
8870 }
8871 }
8872 DEBUG_PARSE_r({
8873 SV * const mysv=sv_newmortal();
8874 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8875 regprop(RExC_rx, mysv, scan);
eaf3ca90 8876 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8877 SvPV_nolen_const(mysv),
eaf3ca90 8878 REG_NODE_NUM(scan),
13d6edb4 8879 PL_reg_name[exact]);
3dab1dad 8880 });
a0d0e21e
LW
8881 if (temp == NULL)
8882 break;
8883 scan = temp;
8884 }
07be1b83
YO
8885 DEBUG_PARSE_r({
8886 SV * const mysv_val=sv_newmortal();
8887 DEBUG_PARSE_MSG("");
8888 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8889 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8890 SvPV_nolen_const(mysv_val),
8891 (IV)REG_NODE_NUM(val),
8892 (IV)(val - scan)
07be1b83
YO
8893 );
8894 });
c277df42
IZ
8895 if (reg_off_by_arg[OP(scan)]) {
8896 ARG_SET(scan, val - scan);
a0ed51b3
LW
8897 }
8898 else {
c277df42
IZ
8899 NEXT_OFF(scan) = val - scan;
8900 }
3dab1dad
YO
8901
8902 return exact;
a687059c 8903}
07be1b83 8904#endif
a687059c
LW
8905
8906/*
a687059c
LW
8907 - regcurly - a little FSA that accepts {\d+,?\d*}
8908 */
79072805 8909STATIC I32
5f66b61c 8910S_regcurly(register const char *s)
a687059c 8911{
7918f24d
NC
8912 PERL_ARGS_ASSERT_REGCURLY;
8913
a687059c
LW
8914 if (*s++ != '{')
8915 return FALSE;
f0fcb552 8916 if (!isDIGIT(*s))
a687059c 8917 return FALSE;
f0fcb552 8918 while (isDIGIT(*s))
a687059c
LW
8919 s++;
8920 if (*s == ',')
8921 s++;
f0fcb552 8922 while (isDIGIT(*s))
a687059c
LW
8923 s++;
8924 if (*s != '}')
8925 return FALSE;
8926 return TRUE;
8927}
8928
a687059c
LW
8929
8930/*
fd181c75 8931 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8932 */
f7819f85 8933#ifdef DEBUGGING
c33269f7 8934static void
7918f24d
NC
8935S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8936{
f7819f85
A
8937 int bit;
8938 int set=0;
7918f24d 8939
f7819f85
A
8940 for (bit=0; bit<32; bit++) {
8941 if (flags & (1<<bit)) {
8942 if (!set++ && lead)
8943 PerlIO_printf(Perl_debug_log, "%s",lead);
8944 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8945 }
8946 }
8947 if (lead) {
8948 if (set)
8949 PerlIO_printf(Perl_debug_log, "\n");
8950 else
8951 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8952 }
8953}
8954#endif
8955
a687059c 8956void
097eb12c 8957Perl_regdump(pTHX_ const regexp *r)
a687059c 8958{
35ff7856 8959#ifdef DEBUGGING
97aff369 8960 dVAR;
c445ea15 8961 SV * const sv = sv_newmortal();
ab3bbdeb 8962 SV *dsv= sv_newmortal();
f8fc2ecf 8963 RXi_GET_DECL(r,ri);
f7819f85 8964 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8965
7918f24d
NC
8966 PERL_ARGS_ASSERT_REGDUMP;
8967
f8fc2ecf 8968 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8969
8970 /* Header fields of interest. */
ab3bbdeb
YO
8971 if (r->anchored_substr) {
8972 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8973 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8974 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8975 "anchored %s%s at %"IVdf" ",
8976 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8977 (IV)r->anchored_offset);
ab3bbdeb
YO
8978 } else if (r->anchored_utf8) {
8979 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8980 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8981 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8982 "anchored utf8 %s%s at %"IVdf" ",
8983 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8984 (IV)r->anchored_offset);
ab3bbdeb
YO
8985 }
8986 if (r->float_substr) {
8987 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8988 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8989 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8990 "floating %s%s at %"IVdf"..%"UVuf" ",
8991 s, RE_SV_TAIL(r->float_substr),
7b0972df 8992 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8993 } else if (r->float_utf8) {
8994 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8995 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8996 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8997 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8998 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8999 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9000 }
33b8afdf 9001 if (r->check_substr || r->check_utf8)
b81d288d 9002 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9003 (const char *)
9004 (r->check_substr == r->float_substr
9005 && r->check_utf8 == r->float_utf8
9006 ? "(checking floating" : "(checking anchored"));
bbe252da 9007 if (r->extflags & RXf_NOSCAN)
c277df42 9008 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9009 if (r->extflags & RXf_CHECK_ALL)
c277df42 9010 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9011 if (r->check_substr || r->check_utf8)
c277df42
IZ
9012 PerlIO_printf(Perl_debug_log, ") ");
9013
f8fc2ecf
YO
9014 if (ri->regstclass) {
9015 regprop(r, sv, ri->regstclass);
1de06328 9016 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9017 }
bbe252da 9018 if (r->extflags & RXf_ANCH) {
774d564b 9019 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9020 if (r->extflags & RXf_ANCH_BOL)
774d564b 9021 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9022 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9023 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9024 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9025 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9026 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9027 PerlIO_printf(Perl_debug_log, "(GPOS)");
9028 PerlIO_putc(Perl_debug_log, ' ');
9029 }
bbe252da 9030 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9031 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9032 if (r->intflags & PREGf_SKIP)
760ac839 9033 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9034 if (r->intflags & PREGf_IMPLICIT)
760ac839 9035 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9036 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9037 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9038 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9039 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9040 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9041#else
7918f24d 9042 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9043 PERL_UNUSED_CONTEXT;
65e66c80 9044 PERL_UNUSED_ARG(r);
17c3b450 9045#endif /* DEBUGGING */
a687059c
LW
9046}
9047
9048/*
a0d0e21e
LW
9049- regprop - printable representation of opcode
9050*/
3339dfd8
YO
9051#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9052STMT_START { \
9053 if (do_sep) { \
9054 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9055 if (flags & ANYOF_INVERT) \
9056 /*make sure the invert info is in each */ \
9057 sv_catpvs(sv, "^"); \
9058 do_sep = 0; \
9059 } \
9060} STMT_END
9061
46fc3d4c 9062void
32fc9b6a 9063Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9064{
35ff7856 9065#ifdef DEBUGGING
97aff369 9066 dVAR;
9b155405 9067 register int k;
f8fc2ecf 9068 RXi_GET_DECL(prog,progi);
1de06328 9069 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9070
7918f24d 9071 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9072
76f68e9b 9073 sv_setpvs(sv, "");
8aa23a47 9074
03363afd 9075 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9076 /* It would be nice to FAIL() here, but this may be called from
9077 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9078 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9079 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9080
3dab1dad 9081 k = PL_regkind[OP(o)];
9b155405 9082
2a782b5b 9083 if (k == EXACT) {
f92a2122 9084 sv_catpvs(sv, " ");
ab3bbdeb
YO
9085 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9086 * is a crude hack but it may be the best for now since
9087 * we have no flag "this EXACTish node was UTF-8"
9088 * --jhi */
f92a2122
NC
9089 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9090 PERL_PV_ESCAPE_UNI_DETECT |
9091 PERL_PV_PRETTY_ELLIPSES |
9092 PERL_PV_PRETTY_LTGT |
9093 PERL_PV_PRETTY_NOCLEAR
9094 );
bb263b4e 9095 } else if (k == TRIE) {
3dab1dad 9096 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9097 * progi->data isn't available here */
1de06328 9098 const char op = OP(o);
647f639f 9099 const U32 n = ARG(o);
1de06328 9100 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9101 (reg_ac_data *)progi->data->data[n] :
1de06328 9102 NULL;
3251b653
NC
9103 const reg_trie_data * const trie
9104 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9105
13d6edb4 9106 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9107 DEBUG_TRIE_COMPILE_r(
9108 Perl_sv_catpvf(aTHX_ sv,
9109 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9110 (UV)trie->startstate,
1e2e3d02 9111 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9112 (UV)trie->wordcount,
9113 (UV)trie->minlen,
9114 (UV)trie->maxlen,
9115 (UV)TRIE_CHARCOUNT(trie),
9116 (UV)trie->uniquecharcount
9117 )
9118 );
9119 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9120 int i;
9121 int rangestart = -1;
f46cb337 9122 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9123 sv_catpvs(sv, "[");
1de06328
YO
9124 for (i = 0; i <= 256; i++) {
9125 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9126 if (rangestart == -1)
9127 rangestart = i;
9128 } else if (rangestart != -1) {
9129 if (i <= rangestart + 3)
9130 for (; rangestart < i; rangestart++)
9131 put_byte(sv, rangestart);
9132 else {
9133 put_byte(sv, rangestart);
9134 sv_catpvs(sv, "-");
9135 put_byte(sv, i - 1);
9136 }
9137 rangestart = -1;
9138 }
9139 }
f3a2811a 9140 sv_catpvs(sv, "]");
1de06328
YO
9141 }
9142
a3621e74 9143 } else if (k == CURLY) {
cb434fcc 9144 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9145 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9146 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9147 }
2c2d71f5
JH
9148 else if (k == WHILEM && o->flags) /* Ordinal/of */
9149 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9150 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9151 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9152 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9153 if ( k != REF || OP(o) < NREF) {
502c6561 9154 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9155 SV **name= av_fetch(list, ARG(o), 0 );
9156 if (name)
9157 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9158 }
9159 else {
502c6561 9160 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9161 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9162 I32 *nums=(I32*)SvPVX(sv_dat);
9163 SV **name= av_fetch(list, nums[0], 0 );
9164 I32 n;
9165 if (name) {
9166 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9167 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9168 (n ? "," : ""), (IV)nums[n]);
9169 }
9170 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9171 }
1f1031fe 9172 }
ee9b8eae 9173 }
1f1031fe 9174 } else if (k == GOSUB)
6bda09f9 9175 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9176 else if (k == VERB) {
9177 if (!o->flags)
9178 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9179 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9180 } else if (k == LOGICAL)
04ebc1ab 9181 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9182 else if (k == FOLDCHAR)
df44d732 9183 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9184 else if (k == ANYOF) {
9185 int i, rangestart = -1;
2d03de9c 9186 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9187 int do_sep = 0;
0bd48802
AL
9188
9189 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9190 static const char * const anyofs[] = {
653099ff
GS
9191 "\\w",
9192 "\\W",
9193 "\\s",
9194 "\\S",
9195 "\\d",
9196 "\\D",
9197 "[:alnum:]",
9198 "[:^alnum:]",
9199 "[:alpha:]",
9200 "[:^alpha:]",
9201 "[:ascii:]",
9202 "[:^ascii:]",
24d786f4
YO
9203 "[:cntrl:]",
9204 "[:^cntrl:]",
653099ff
GS
9205 "[:graph:]",
9206 "[:^graph:]",
9207 "[:lower:]",
9208 "[:^lower:]",
9209 "[:print:]",
9210 "[:^print:]",
9211 "[:punct:]",
9212 "[:^punct:]",
9213 "[:upper:]",
aaa51d5e 9214 "[:^upper:]",
653099ff 9215 "[:xdigit:]",
aaa51d5e
JF
9216 "[:^xdigit:]",
9217 "[:space:]",
9218 "[:^space:]",
9219 "[:blank:]",
9220 "[:^blank:]"
653099ff
GS
9221 };
9222
19860706 9223 if (flags & ANYOF_LOCALE)
396482e1 9224 sv_catpvs(sv, "{loc}");
19860706 9225 if (flags & ANYOF_FOLD)
396482e1 9226 sv_catpvs(sv, "{i}");
653099ff 9227 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9228 if (flags & ANYOF_INVERT)
396482e1 9229 sv_catpvs(sv, "^");
3339dfd8
YO
9230
9231 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9232 for (i = 0; i <= 256; i++) {
9233 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9234 if (rangestart == -1)
9235 rangestart = i;
9236 } else if (rangestart != -1) {
9237 if (i <= rangestart + 3)
9238 for (; rangestart < i; rangestart++)
653099ff 9239 put_byte(sv, rangestart);
ffc61ed2
JH
9240 else {
9241 put_byte(sv, rangestart);
396482e1 9242 sv_catpvs(sv, "-");
ffc61ed2 9243 put_byte(sv, i - 1);
653099ff 9244 }
24d786f4 9245 do_sep = 1;
ffc61ed2 9246 rangestart = -1;
653099ff 9247 }
847a199f 9248 }
3339dfd8
YO
9249
9250 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9251 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9252 if (o->flags & ANYOF_CLASS)
bb7a0f54 9253 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9254 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9255 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9256 do_sep = 1;
9257 }
9258
3339dfd8
YO
9259 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9260
9261 /* output information about the unicode matching */
ffc61ed2 9262 if (flags & ANYOF_UNICODE)
396482e1 9263 sv_catpvs(sv, "{unicode}");
1aa99e6b 9264 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9265 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9266
9267 {
9268 SV *lv;
32fc9b6a 9269 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9270
ffc61ed2
JH
9271 if (lv) {
9272 if (sw) {
89ebb4a3 9273 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9274
ffc61ed2 9275 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9276 uvchr_to_utf8(s, i);
ffc61ed2 9277
3568d838 9278 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9279 if (rangestart == -1)
9280 rangestart = i;
9281 } else if (rangestart != -1) {
ffc61ed2
JH
9282 if (i <= rangestart + 3)
9283 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9284 const U8 * const e = uvchr_to_utf8(s,rangestart);
9285 U8 *p;
9286 for(p = s; p < e; p++)
ffc61ed2
JH
9287 put_byte(sv, *p);
9288 }
9289 else {
2d03de9c
AL
9290 const U8 *e = uvchr_to_utf8(s,rangestart);
9291 U8 *p;
9292 for (p = s; p < e; p++)
ffc61ed2 9293 put_byte(sv, *p);
396482e1 9294 sv_catpvs(sv, "-");
2d03de9c
AL
9295 e = uvchr_to_utf8(s, i-1);
9296 for (p = s; p < e; p++)
1df70142 9297 put_byte(sv, *p);
ffc61ed2
JH
9298 }
9299 rangestart = -1;
9300 }
19860706 9301 }
ffc61ed2 9302
396482e1 9303 sv_catpvs(sv, "..."); /* et cetera */
19860706 9304 }
fde631ed 9305
ffc61ed2 9306 {
2e0de35c 9307 char *s = savesvpv(lv);
c445ea15 9308 char * const origs = s;
b81d288d 9309
3dab1dad
YO
9310 while (*s && *s != '\n')
9311 s++;
b81d288d 9312
ffc61ed2 9313 if (*s == '\n') {
2d03de9c 9314 const char * const t = ++s;
ffc61ed2
JH
9315
9316 while (*s) {
9317 if (*s == '\n')
9318 *s = ' ';
9319 s++;
9320 }
9321 if (s[-1] == ' ')
9322 s[-1] = 0;
9323
9324 sv_catpv(sv, t);
fde631ed 9325 }
b81d288d 9326
ffc61ed2 9327 Safefree(origs);
fde631ed
JH
9328 }
9329 }
653099ff 9330 }
ffc61ed2 9331
653099ff
GS
9332 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9333 }
9b155405 9334 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9335 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9336#else
96a5add6 9337 PERL_UNUSED_CONTEXT;
65e66c80
SP
9338 PERL_UNUSED_ARG(sv);
9339 PERL_UNUSED_ARG(o);
f9049ba1 9340 PERL_UNUSED_ARG(prog);
17c3b450 9341#endif /* DEBUGGING */
35ff7856 9342}
a687059c 9343
cad2e5aa 9344SV *
288b8c02 9345Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9346{ /* Assume that RE_INTUIT is set */
97aff369 9347 dVAR;
288b8c02 9348 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9349 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9350
9351 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9352 PERL_UNUSED_CONTEXT;
9353
a3621e74 9354 DEBUG_COMPILE_r(
cfd0369c 9355 {
2d03de9c 9356 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9357 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9358
9359 if (!PL_colorset) reginitcolors();
9360 PerlIO_printf(Perl_debug_log,
a0288114 9361 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9362 PL_colors[4],
9363 prog->check_substr ? "" : "utf8 ",
9364 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9365 s,
9366 PL_colors[1],
9367 (strlen(s) > 60 ? "..." : ""));
9368 } );
9369
33b8afdf 9370 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9371}
9372
84da74a7 9373/*
f8149455 9374 pregfree()
84da74a7 9375
f8149455
YO
9376 handles refcounting and freeing the perl core regexp structure. When
9377 it is necessary to actually free the structure the first thing it
9378 does is call the 'free' method of the regexp_engine associated to to
9379 the regexp, allowing the handling of the void *pprivate; member
9380 first. (This routine is not overridable by extensions, which is why
9381 the extensions free is called first.)
9382
9383 See regdupe and regdupe_internal if you change anything here.
84da74a7 9384*/
f8149455 9385#ifndef PERL_IN_XSUB_RE
2b69d0c2 9386void
84679df5 9387Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9388{
288b8c02
NC
9389 SvREFCNT_dec(r);
9390}
9391
9392void
9393Perl_pregfree2(pTHX_ REGEXP *rx)
9394{
27da23d5 9395 dVAR;
288b8c02 9396 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9397 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9398
7918f24d
NC
9399 PERL_ARGS_ASSERT_PREGFREE2;
9400
28d8d7f4
YO
9401 if (r->mother_re) {
9402 ReREFCNT_dec(r->mother_re);
9403 } else {
288b8c02 9404 CALLREGFREE_PVT(rx); /* free the private data */
5daac39c
NC
9405 if (RXp_PAREN_NAMES(r))
9406 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9407 }
9408 if (r->substrs) {
9409 if (r->anchored_substr)
9410 SvREFCNT_dec(r->anchored_substr);
9411 if (r->anchored_utf8)
9412 SvREFCNT_dec(r->anchored_utf8);
9413 if (r->float_substr)
9414 SvREFCNT_dec(r->float_substr);
9415 if (r->float_utf8)
9416 SvREFCNT_dec(r->float_utf8);
9417 Safefree(r->substrs);
9418 }
288b8c02 9419 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9420#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 9421 if (r->saved_copy)
28d8d7f4 9422 SvREFCNT_dec(r->saved_copy);
ed252734 9423#endif
f0ab9afb 9424 Safefree(r->offs);
f8149455 9425}
28d8d7f4
YO
9426
9427/* reg_temp_copy()
9428
9429 This is a hacky workaround to the structural issue of match results
9430 being stored in the regexp structure which is in turn stored in
9431 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9432 could be PL_curpm in multiple contexts, and could require multiple
9433 result sets being associated with the pattern simultaneously, such
9434 as when doing a recursive match with (??{$qr})
9435
9436 The solution is to make a lightweight copy of the regexp structure
9437 when a qr// is returned from the code executed by (??{$qr}) this
9438 lightweight copy doesnt actually own any of its data except for
9439 the starp/end and the actual regexp structure itself.
9440
9441*/
9442
9443
84679df5 9444REGEXP *
7918f24d
NC
9445Perl_reg_temp_copy (pTHX_ REGEXP *rx)
9446{
d2f13c59 9447 REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02
NC
9448 struct regexp *ret = (struct regexp *)SvANY(ret_x);
9449 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9450 register const I32 npar = r->nparens+1;
7918f24d
NC
9451
9452 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9453
288b8c02 9454 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9455 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9456 by pointing directly at the buffer, but flagging that the allocated
9457 space in the copy is zero. As we've just done a struct copy, it's now
9458 a case of zero-ing that, rather than copying the current length. */
9459 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9460 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9461 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9462 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9463 SvLEN_set(ret_x, 0);
f0ab9afb
NC
9464 Newx(ret->offs, npar, regexp_paren_pair);
9465 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9466 if (r->substrs) {
28d8d7f4 9467 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9468 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9469
9470 SvREFCNT_inc_void(ret->anchored_substr);
9471 SvREFCNT_inc_void(ret->anchored_utf8);
9472 SvREFCNT_inc_void(ret->float_substr);
9473 SvREFCNT_inc_void(ret->float_utf8);
9474
9475 /* check_substr and check_utf8, if non-NULL, point to either their
9476 anchored or float namesakes, and don't hold a second reference. */
486913e4 9477 }
288b8c02 9478 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9479#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9480 ret->saved_copy = NULL;
28d8d7f4 9481#endif
288b8c02 9482 ret->mother_re = rx;
28d8d7f4 9483
288b8c02 9484 return ret_x;
28d8d7f4 9485}
f8149455
YO
9486#endif
9487
9488/* regfree_internal()
9489
9490 Free the private data in a regexp. This is overloadable by
9491 extensions. Perl takes care of the regexp structure in pregfree(),
9492 this covers the *pprivate pointer which technically perldoesnt
9493 know about, however of course we have to handle the
9494 regexp_internal structure when no extension is in use.
9495
9496 Note this is called before freeing anything in the regexp
9497 structure.
9498 */
9499
9500void
288b8c02 9501Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9502{
9503 dVAR;
288b8c02 9504 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9505 RXi_GET_DECL(r,ri);
9506 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9507
9508 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9509
f8149455
YO
9510 DEBUG_COMPILE_r({
9511 if (!PL_colorset)
9512 reginitcolors();
9513 {
9514 SV *dsv= sv_newmortal();
3c8556c3 9515 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9516 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9517 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9518 PL_colors[4],PL_colors[5],s);
9519 }
9520 });
7122b237
YO
9521#ifdef RE_TRACK_PATTERN_OFFSETS
9522 if (ri->u.offsets)
9523 Safefree(ri->u.offsets); /* 20010421 MJD */
9524#endif
f8fc2ecf
YO
9525 if (ri->data) {
9526 int n = ri->data->count;
f3548bdc
DM
9527 PAD* new_comppad = NULL;
9528 PAD* old_comppad;
4026c95a 9529 PADOFFSET refcnt;
dfad63ad 9530
c277df42 9531 while (--n >= 0) {
261faec3 9532 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9533 switch (ri->data->what[n]) {
c277df42 9534 case 's':
81714fb9 9535 case 'S':
55eed653 9536 case 'u':
ad64d0ec 9537 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9538 break;
653099ff 9539 case 'f':
f8fc2ecf 9540 Safefree(ri->data->data[n]);
653099ff 9541 break;
dfad63ad 9542 case 'p':
502c6561 9543 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9544 break;
c277df42 9545 case 'o':
dfad63ad 9546 if (new_comppad == NULL)
cea2e8a9 9547 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9548 PAD_SAVE_LOCAL(old_comppad,
9549 /* Watch out for global destruction's random ordering. */
c445ea15 9550 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9551 );
b34c0dd4 9552 OP_REFCNT_LOCK;
f8fc2ecf 9553 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9554 OP_REFCNT_UNLOCK;
9555 if (!refcnt)
f8fc2ecf 9556 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9557
f3548bdc 9558 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9559 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9560 new_comppad = NULL;
c277df42
IZ
9561 break;
9562 case 'n':
9e55ce06 9563 break;
07be1b83 9564 case 'T':
be8e71aa
YO
9565 { /* Aho Corasick add-on structure for a trie node.
9566 Used in stclass optimization only */
07be1b83 9567 U32 refcount;
f8fc2ecf 9568 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9569 OP_REFCNT_LOCK;
9570 refcount = --aho->refcount;
9571 OP_REFCNT_UNLOCK;
9572 if ( !refcount ) {
446bd890
NC
9573 PerlMemShared_free(aho->states);
9574 PerlMemShared_free(aho->fail);
446bd890
NC
9575 /* do this last!!!! */
9576 PerlMemShared_free(ri->data->data[n]);
9577 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9578 }
9579 }
9580 break;
a3621e74 9581 case 't':
07be1b83 9582 {
be8e71aa 9583 /* trie structure. */
07be1b83 9584 U32 refcount;
f8fc2ecf 9585 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9586 OP_REFCNT_LOCK;
9587 refcount = --trie->refcount;
9588 OP_REFCNT_UNLOCK;
9589 if ( !refcount ) {
446bd890 9590 PerlMemShared_free(trie->charmap);
446bd890
NC
9591 PerlMemShared_free(trie->states);
9592 PerlMemShared_free(trie->trans);
07be1b83 9593 if (trie->bitmap)
446bd890 9594 PerlMemShared_free(trie->bitmap);
07be1b83 9595 if (trie->wordlen)
446bd890 9596 PerlMemShared_free(trie->wordlen);
786e8c11 9597 if (trie->jump)
446bd890 9598 PerlMemShared_free(trie->jump);
786e8c11 9599 if (trie->nextword)
446bd890 9600 PerlMemShared_free(trie->nextword);
446bd890
NC
9601 /* do this last!!!! */
9602 PerlMemShared_free(ri->data->data[n]);
a3621e74 9603 }
07be1b83
YO
9604 }
9605 break;
c277df42 9606 default:
f8fc2ecf 9607 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9608 }
9609 }
f8fc2ecf
YO
9610 Safefree(ri->data->what);
9611 Safefree(ri->data);
a0d0e21e 9612 }
28d8d7f4 9613
f8fc2ecf 9614 Safefree(ri);
a687059c 9615}
c277df42 9616
84da74a7 9617#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
502c6561 9618#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
85fbaab2 9619#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
84da74a7
YO
9620#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9621
9622/*
32cd70f6 9623 re_dup - duplicate a regexp.
84da74a7 9624
8233f606
DM
9625 This routine is expected to clone a given regexp structure. It is only
9626 compiled under USE_ITHREADS.
32cd70f6 9627
f8149455
YO
9628 After all of the core data stored in struct regexp is duplicated
9629 the regexp_engine.dupe method is used to copy any private data
9630 stored in the *pprivate pointer. This allows extensions to handle
9631 any duplication it needs to do.
9632
9633 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9634*/
a3c0e9ca 9635#if defined(USE_ITHREADS)
f8149455 9636#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9637void
9638Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9639{
84da74a7 9640 dVAR;
a86a1ca7 9641 I32 npar;
288b8c02
NC
9642 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9643 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9644
7918f24d
NC
9645 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9646
84da74a7 9647 npar = r->nparens+1;
f0ab9afb
NC
9648 Newx(ret->offs, npar, regexp_paren_pair);
9649 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9650 if(ret->swap) {
28d8d7f4 9651 /* no need to copy these */
f0ab9afb 9652 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9653 }
84da74a7 9654
6057429f 9655 if (ret->substrs) {
32cd70f6
NC
9656 /* Do it this way to avoid reading from *r after the StructCopy().
9657 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9658 cache, it doesn't matter. */
66b1de87
NC
9659 const bool anchored = r->check_substr
9660 ? r->check_substr == r->anchored_substr
9661 : r->check_utf8 == r->anchored_utf8;
785a26d5 9662 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9663 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9664
32cd70f6
NC
9665 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9666 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9667 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9668 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9669
32cd70f6
NC
9670 /* check_substr and check_utf8, if non-NULL, point to either their
9671 anchored or float namesakes, and don't hold a second reference. */
9672
9673 if (ret->check_substr) {
9674 if (anchored) {
9675 assert(r->check_utf8 == r->anchored_utf8);
9676 ret->check_substr = ret->anchored_substr;
9677 ret->check_utf8 = ret->anchored_utf8;
9678 } else {
9679 assert(r->check_substr == r->float_substr);
9680 assert(r->check_utf8 == r->float_utf8);
9681 ret->check_substr = ret->float_substr;
9682 ret->check_utf8 = ret->float_utf8;
9683 }
66b1de87
NC
9684 } else if (ret->check_utf8) {
9685 if (anchored) {
9686 ret->check_utf8 = ret->anchored_utf8;
9687 } else {
9688 ret->check_utf8 = ret->float_utf8;
9689 }
32cd70f6 9690 }
6057429f 9691 }
f8149455 9692
5daac39c 9693 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9694
6057429f 9695 if (ret->pprivate)
288b8c02 9696 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9697
288b8c02 9698 if (RX_MATCH_COPIED(dstr))
6057429f 9699 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9700 else
9701 ret->subbeg = NULL;
9702#ifdef PERL_OLD_COPY_ON_WRITE
9703 ret->saved_copy = NULL;
9704#endif
6057429f
NC
9705
9706 ret->mother_re = NULL;
9707 ret->gofs = 0;
f8149455
YO
9708}
9709#endif /* PERL_IN_XSUB_RE */
9710
9711/*
9712 regdupe_internal()
9713
9714 This is the internal complement to regdupe() which is used to copy
9715 the structure pointed to by the *pprivate pointer in the regexp.
9716 This is the core version of the extension overridable cloning hook.
9717 The regexp structure being duplicated will be copied by perl prior
9718 to this and will be provided as the regexp *r argument, however
9719 with the /old/ structures pprivate pointer value. Thus this routine
9720 may override any copying normally done by perl.
9721
9722 It returns a pointer to the new regexp_internal structure.
9723*/
9724
9725void *
288b8c02 9726Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9727{
9728 dVAR;
288b8c02 9729 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9730 regexp_internal *reti;
9731 int len, npar;
9732 RXi_GET_DECL(r,ri);
7918f24d
NC
9733
9734 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9735
9736 npar = r->nparens+1;
7122b237 9737 len = ProgLen(ri);
f8149455 9738
45cf4570 9739 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9740 Copy(ri->program, reti->program, len+1, regnode);
9741
f8149455 9742
f8fc2ecf 9743 reti->regstclass = NULL;
bcdf7404 9744
f8fc2ecf 9745 if (ri->data) {
84da74a7 9746 struct reg_data *d;
f8fc2ecf 9747 const int count = ri->data->count;
84da74a7
YO
9748 int i;
9749
9750 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9751 char, struct reg_data);
9752 Newx(d->what, count, U8);
9753
9754 d->count = count;
9755 for (i = 0; i < count; i++) {
f8fc2ecf 9756 d->what[i] = ri->data->what[i];
84da74a7 9757 switch (d->what[i]) {
55eed653 9758 /* legal options are one of: sSfpontTu
84da74a7
YO
9759 see also regcomp.h and pregfree() */
9760 case 's':
81714fb9 9761 case 'S':
0536c0a7 9762 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9763 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9764 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9765 break;
84da74a7
YO
9766 case 'f':
9767 /* This is cheating. */
9768 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9769 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9770 struct regnode_charclass_class);
f8fc2ecf 9771 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9772 break;
9773 case 'o':
bbe252da
YO
9774 /* Compiled op trees are readonly and in shared memory,
9775 and can thus be shared without duplication. */
84da74a7 9776 OP_REFCNT_LOCK;
f8fc2ecf 9777 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9778 OP_REFCNT_UNLOCK;
9779 break;
23eab42c
NC
9780 case 'T':
9781 /* Trie stclasses are readonly and can thus be shared
9782 * without duplication. We free the stclass in pregfree
9783 * when the corresponding reg_ac_data struct is freed.
9784 */
9785 reti->regstclass= ri->regstclass;
9786 /* Fall through */
84da74a7 9787 case 't':
84da74a7 9788 OP_REFCNT_LOCK;
0536c0a7 9789 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9790 OP_REFCNT_UNLOCK;
0536c0a7
NC
9791 /* Fall through */
9792 case 'n':
9793 d->data[i] = ri->data->data[i];
84da74a7 9794 break;
84da74a7 9795 default:
f8fc2ecf 9796 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9797 }
9798 }
9799
f8fc2ecf 9800 reti->data = d;
84da74a7
YO
9801 }
9802 else
f8fc2ecf 9803 reti->data = NULL;
84da74a7 9804
cde0cee5
YO
9805 reti->name_list_idx = ri->name_list_idx;
9806
7122b237
YO
9807#ifdef RE_TRACK_PATTERN_OFFSETS
9808 if (ri->u.offsets) {
9809 Newx(reti->u.offsets, 2*len+1, U32);
9810 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9811 }
9812#else
9813 SetProgLen(reti,len);
9814#endif
9815
f8149455 9816 return (void*)reti;
84da74a7 9817}
f8149455
YO
9818
9819#endif /* USE_ITHREADS */
84da74a7 9820
f8149455 9821#ifndef PERL_IN_XSUB_RE
bcdf7404 9822
c277df42
IZ
9823/*
9824 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9825 */
9826regnode *
864dbfa3 9827Perl_regnext(pTHX_ register regnode *p)
c277df42 9828{
97aff369 9829 dVAR;
c277df42
IZ
9830 register I32 offset;
9831
f8fc2ecf 9832 if (!p)
c277df42
IZ
9833 return(NULL);
9834
9835 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9836 if (offset == 0)
9837 return(NULL);
9838
c277df42 9839 return(p+offset);
c277df42 9840}
76234dfb 9841#endif
c277df42 9842
01f988be 9843STATIC void
cea2e8a9 9844S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9845{
9846 va_list args;
9847 STRLEN l1 = strlen(pat1);
9848 STRLEN l2 = strlen(pat2);
9849 char buf[512];
06bf62c7 9850 SV *msv;
73d840c0 9851 const char *message;
c277df42 9852
7918f24d
NC
9853 PERL_ARGS_ASSERT_RE_CROAK2;
9854
c277df42
IZ
9855 if (l1 > 510)
9856 l1 = 510;
9857 if (l1 + l2 > 510)
9858 l2 = 510 - l1;
9859 Copy(pat1, buf, l1 , char);
9860 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9861 buf[l1 + l2] = '\n';
9862 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9863#ifdef I_STDARG
9864 /* ANSI variant takes additional second argument */
c277df42 9865 va_start(args, pat2);
8736538c
AS
9866#else
9867 va_start(args);
9868#endif
5a844595 9869 msv = vmess(buf, &args);
c277df42 9870 va_end(args);
cfd0369c 9871 message = SvPV_const(msv,l1);
c277df42
IZ
9872 if (l1 > 512)
9873 l1 = 512;
9874 Copy(message, buf, l1 , char);
197cf9b9 9875 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9876 Perl_croak(aTHX_ "%s", buf);
c277df42 9877}
a0ed51b3
LW
9878
9879/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9880
76234dfb 9881#ifndef PERL_IN_XSUB_RE
a0ed51b3 9882void
864dbfa3 9883Perl_save_re_context(pTHX)
b81d288d 9884{
97aff369 9885 dVAR;
1ade1aa1
NC
9886
9887 struct re_save_state *state;
9888
9889 SAVEVPTR(PL_curcop);
9890 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9891
9892 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9893 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9894 SSPUSHINT(SAVEt_RE_STATE);
9895
46ab3289 9896 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9897
a0ed51b3 9898 PL_reg_start_tmp = 0;
a0ed51b3 9899 PL_reg_start_tmpl = 0;
c445ea15 9900 PL_reg_oldsaved = NULL;
a5db57d6 9901 PL_reg_oldsavedlen = 0;
a5db57d6 9902 PL_reg_maxiter = 0;
a5db57d6 9903 PL_reg_leftiter = 0;
c445ea15 9904 PL_reg_poscache = NULL;
a5db57d6 9905 PL_reg_poscache_size = 0;
1ade1aa1
NC
9906#ifdef PERL_OLD_COPY_ON_WRITE
9907 PL_nrs = NULL;
9908#endif
ada6e8a9 9909
c445ea15
AL
9910 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9911 if (PL_curpm) {
9912 const REGEXP * const rx = PM_GETRE(PL_curpm);
9913 if (rx) {
1df70142 9914 U32 i;
07bc277f 9915 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9916 char digits[TYPE_CHARS(long)];
d9fad198 9917 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9918 GV *const *const gvp
9919 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9920
b37c2d43
AL
9921 if (gvp) {
9922 GV * const gv = *gvp;
9923 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9924 save_scalar(gv);
49f27e4b 9925 }
ada6e8a9
AMS
9926 }
9927 }
9928 }
a0ed51b3 9929}
76234dfb 9930#endif
51371543 9931
51371543 9932static void
acfe0abc 9933clear_re(pTHX_ void *r)
51371543 9934{
97aff369 9935 dVAR;
84679df5 9936 ReREFCNT_dec((REGEXP *)r);
51371543 9937}
ffbc6a93 9938
a28509cc
AL
9939#ifdef DEBUGGING
9940
9941STATIC void
9942S_put_byte(pTHX_ SV *sv, int c)
9943{
7918f24d
NC
9944 PERL_ARGS_ASSERT_PUT_BYTE;
9945
7fddd944
NC
9946 /* Our definition of isPRINT() ignores locales, so only bytes that are
9947 not part of UTF-8 are considered printable. I assume that the same
9948 holds for UTF-EBCDIC.
9949 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9950 which Wikipedia says:
9951
9952 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9953 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9954 identical, to the ASCII delete (DEL) or rubout control character.
9955 ) So the old condition can be simplified to !isPRINT(c) */
9956 if (!isPRINT(c))
a28509cc 9957 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9958 else {
88c9ea1e 9959 const char string = c;
5e7aa789
NC
9960 if (c == '-' || c == ']' || c == '\\' || c == '^')
9961 sv_catpvs(sv, "\\");
9962 sv_catpvn(sv, &string, 1);
9963 }
a28509cc
AL
9964}
9965
786e8c11 9966
3dab1dad
YO
9967#define CLEAR_OPTSTART \
9968 if (optstart) STMT_START { \
70685ca0 9969 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9970 optstart=NULL; \
9971 } STMT_END
9972
786e8c11 9973#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9974
b5a2f8d8
NC
9975STATIC const regnode *
9976S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9977 const regnode *last, const regnode *plast,
9978 SV* sv, I32 indent, U32 depth)
a28509cc 9979{
97aff369 9980 dVAR;
786e8c11 9981 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9982 register const regnode *next;
3dab1dad 9983 const regnode *optstart= NULL;
1f1031fe 9984
f8fc2ecf 9985 RXi_GET_DECL(r,ri);
3dab1dad 9986 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9987
9988 PERL_ARGS_ASSERT_DUMPUNTIL;
9989
786e8c11
YO
9990#ifdef DEBUG_DUMPUNTIL
9991 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9992 last ? last-start : 0,plast ? plast-start : 0);
9993#endif
9994
9995 if (plast && plast < last)
9996 last= plast;
9997
9998 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9999 /* While that wasn't END last time... */
a28509cc
AL
10000 NODE_ALIGN(node);
10001 op = OP(node);
de734bd5 10002 if (op == CLOSE || op == WHILEM)
786e8c11 10003 indent--;
b5a2f8d8 10004 next = regnext((regnode *)node);
1f1031fe 10005
a28509cc 10006 /* Where, what. */
8e11feef 10007 if (OP(node) == OPTIMIZED) {
e68ec53f 10008 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10009 optstart = node;
3dab1dad 10010 else
8e11feef 10011 goto after_print;
3dab1dad
YO
10012 } else
10013 CLEAR_OPTSTART;
1f1031fe 10014
32fc9b6a 10015 regprop(r, sv, node);
a28509cc 10016 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10017 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10018
10019 if (OP(node) != OPTIMIZED) {
10020 if (next == NULL) /* Next ptr. */
10021 PerlIO_printf(Perl_debug_log, " (0)");
10022 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10023 PerlIO_printf(Perl_debug_log, " (FAIL)");
10024 else
10025 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10026 (void)PerlIO_putc(Perl_debug_log, '\n');
10027 }
10028
a28509cc
AL
10029 after_print:
10030 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10031 assert(next);
10032 {
10033 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10034 ? regnext((regnode *)next)
10035 : next);
be8e71aa
YO
10036 if (last && nnode > last)
10037 nnode = last;
786e8c11 10038 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10039 }
a28509cc
AL
10040 }
10041 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10042 assert(next);
786e8c11 10043 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10044 }
10045 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10046 const regnode *this_trie = node;
1de06328 10047 const char op = OP(node);
647f639f 10048 const U32 n = ARG(node);
1de06328 10049 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10050 (reg_ac_data *)ri->data->data[n] :
1de06328 10051 NULL;
3251b653
NC
10052 const reg_trie_data * const trie =
10053 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10054#ifdef DEBUGGING
502c6561 10055 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10056#endif
786e8c11 10057 const regnode *nextbranch= NULL;
a28509cc 10058 I32 word_idx;
76f68e9b 10059 sv_setpvs(sv, "");
786e8c11 10060 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10061 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10062
10063 PerlIO_printf(Perl_debug_log, "%*s%s ",
10064 (int)(2*(indent+3)), "",
10065 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10066 PL_colors[0], PL_colors[1],
10067 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10068 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10069 PERL_PV_PRETTY_LTGT
786e8c11
YO
10070 )
10071 : "???"
10072 );
10073 if (trie->jump) {
40d049e4 10074 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10075 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10076 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10077 if (dist) {
10078 if (!nextbranch)
24b23f37 10079 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10080 DUMPUNTIL(this_trie + dist, nextbranch);
10081 }
786e8c11
YO
10082 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10083 nextbranch= regnext((regnode *)nextbranch);
10084 } else {
10085 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10086 }
786e8c11
YO
10087 }
10088 if (last && next > last)
10089 node= last;
10090 else
10091 node= next;
a28509cc 10092 }
786e8c11
YO
10093 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10094 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10095 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10096 }
10097 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10098 assert(next);
786e8c11 10099 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10100 }
10101 else if ( op == PLUS || op == STAR) {
786e8c11 10102 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10103 }
10104 else if (op == ANYOF) {
10105 /* arglen 1 + class block */
10106 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10107 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10108 node = NEXTOPER(node);
10109 }
10110 else if (PL_regkind[(U8)op] == EXACT) {
10111 /* Literal string, where present. */
10112 node += NODE_SZ_STR(node) - 1;
10113 node = NEXTOPER(node);
10114 }
10115 else {
10116 node = NEXTOPER(node);
10117 node += regarglen[(U8)op];
10118 }
10119 if (op == CURLYX || op == OPEN)
786e8c11 10120 indent++;
a28509cc 10121 }
3dab1dad 10122 CLEAR_OPTSTART;
786e8c11 10123#ifdef DEBUG_DUMPUNTIL
70685ca0 10124 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10125#endif
1de06328 10126 return node;
a28509cc
AL
10127}
10128
10129#endif /* DEBUGGING */
10130
241d1a3b
NC
10131/*
10132 * Local variables:
10133 * c-indentation-style: bsd
10134 * c-basic-offset: 4
10135 * indent-tabs-mode: t
10136 * End:
10137 *
37442d52
RGS
10138 * ex: set ts=8 sts=4 sw=4 noet:
10139 */