This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update the totals in perldelta with the last few commits
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
8951c461 713 cl->flags |= ANYOF_FOLD;
653099ff
GS
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
653099ff
GS
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
771 int i;
772
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
776 else
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
1aa99e6b 782
8951c461
KW
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
785
14ebb1a2
JH
786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
787 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
788 cl->flags &= ~ANYOF_UNICODE_ALL;
789 cl->flags |= ANYOF_UNICODE;
790 ARG_SET(cl, ARG(and_with));
791 }
14ebb1a2
JH
792 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
793 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 794 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
795 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
796 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 797 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
798}
799
800/* 'OR' a given class with another one. Can create false positives */
801/* We assume that cl is not inverted */
802STATIC void
097eb12c 803S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 804{
7918f24d
NC
805 PERL_ARGS_ASSERT_CL_OR;
806
653099ff
GS
807 if (or_with->flags & ANYOF_INVERT) {
808 /* We do not use
809 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
810 * <= (B1 | !B2) | (CL1 | !CL2)
811 * which is wasteful if CL2 is small, but we ignore CL2:
812 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
813 * XXXX Can we handle case-fold? Unclear:
814 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
815 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
816 */
817 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
818 && !(or_with->flags & ANYOF_FOLD)
819 && !(cl->flags & ANYOF_FOLD) ) {
820 int i;
821
822 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
823 cl->bitmap[i] |= ~or_with->bitmap[i];
824 } /* XXXX: logic is complicated otherwise */
825 else {
830247a4 826 cl_anything(pRExC_state, cl);
653099ff
GS
827 }
828 } else {
829 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
830 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 831 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
832 || (cl->flags & ANYOF_FOLD)) ) {
833 int i;
834
835 /* OR char bitmap and class bitmap separately */
836 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
837 cl->bitmap[i] |= or_with->bitmap[i];
838 if (or_with->flags & ANYOF_CLASS) {
839 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
840 cl->classflags[i] |= or_with->classflags[i];
841 cl->flags |= ANYOF_CLASS;
842 }
843 }
844 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 845 cl_anything(pRExC_state, cl);
653099ff
GS
846 }
847 }
848 if (or_with->flags & ANYOF_EOS)
849 cl->flags |= ANYOF_EOS;
1aa99e6b 850
8951c461
KW
851 if (or_with->flags & ANYOF_FOLD)
852 cl->flags |= ANYOF_FOLD;
853
1aa99e6b
IH
854 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
855 ARG(cl) != ARG(or_with)) {
856 cl->flags |= ANYOF_UNICODE_ALL;
857 cl->flags &= ~ANYOF_UNICODE;
858 }
859 if (or_with->flags & ANYOF_UNICODE_ALL) {
860 cl->flags |= ANYOF_UNICODE_ALL;
861 cl->flags &= ~ANYOF_UNICODE;
862 }
653099ff
GS
863}
864
a3621e74
YO
865#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
866#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
867#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
868#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
869
3dab1dad
YO
870
871#ifdef DEBUGGING
07be1b83 872/*
2b8b4781
NC
873 dump_trie(trie,widecharmap,revcharmap)
874 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
875 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
876
877 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
878 The _interim_ variants are used for debugging the interim
879 tables that are used to generate the final compressed
880 representation which is what dump_trie expects.
881
3dab1dad
YO
882 Part of the reason for their existance is to provide a form
883 of documentation as to how the different representations function.
07be1b83
YO
884
885*/
3dab1dad
YO
886
887/*
3dab1dad
YO
888 Dumps the final compressed table form of the trie to Perl_debug_log.
889 Used for debugging make_trie().
890*/
b9a59e08 891
3dab1dad 892STATIC void
2b8b4781
NC
893S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
894 AV *revcharmap, U32 depth)
3dab1dad
YO
895{
896 U32 state;
ab3bbdeb 897 SV *sv=sv_newmortal();
55eed653 898 int colwidth= widecharmap ? 6 : 4;
2e64971a 899 U16 word;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901
7918f24d 902 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 903
3dab1dad
YO
904 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
905 (int)depth * 2 + 2,"",
906 "Match","Base","Ofs" );
907
908 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 909 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 910 if ( tmp ) {
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*s",
912 colwidth,
ddc5bc0f 913 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
914 PL_colors[0], PL_colors[1],
915 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
916 PERL_PV_ESCAPE_FIRSTCHAR
917 )
918 );
3dab1dad
YO
919 }
920 }
921 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
922 (int)depth * 2 + 2,"");
923
924 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 925 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
926 PerlIO_printf( Perl_debug_log, "\n");
927
1e2e3d02 928 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 929 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
930
931 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
932
933 if ( trie->states[ state ].wordnum ) {
934 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
935 } else {
936 PerlIO_printf( Perl_debug_log, "%6s", "" );
937 }
938
939 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
940
941 if ( base ) {
942 U32 ofs = 0;
943
944 while( ( base + ofs < trie->uniquecharcount ) ||
945 ( base + ofs - trie->uniquecharcount < trie->lasttrans
946 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
947 ofs++;
948
949 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
950
951 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
952 if ( ( base + ofs >= trie->uniquecharcount ) &&
953 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
954 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
955 {
ab3bbdeb
YO
956 PerlIO_printf( Perl_debug_log, "%*"UVXf,
957 colwidth,
3dab1dad
YO
958 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
959 } else {
ab3bbdeb 960 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
961 }
962 }
963
964 PerlIO_printf( Perl_debug_log, "]");
965
966 }
967 PerlIO_printf( Perl_debug_log, "\n" );
968 }
2e64971a
DM
969 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
970 for (word=1; word <= trie->wordcount; word++) {
971 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
972 (int)word, (int)(trie->wordinfo[word].prev),
973 (int)(trie->wordinfo[word].len));
974 }
975 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
976}
977/*
3dab1dad
YO
978 Dumps a fully constructed but uncompressed trie in list form.
979 List tries normally only are used for construction when the number of
980 possible chars (trie->uniquecharcount) is very high.
981 Used for debugging make_trie().
982*/
983STATIC void
55eed653 984S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
985 HV *widecharmap, AV *revcharmap, U32 next_alloc,
986 U32 depth)
3dab1dad
YO
987{
988 U32 state;
ab3bbdeb 989 SV *sv=sv_newmortal();
55eed653 990 int colwidth= widecharmap ? 6 : 4;
3dab1dad 991 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
992
993 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
994
3dab1dad 995 /* print out the table precompression. */
ab3bbdeb
YO
996 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
997 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
998 "------:-----+-----------------\n" );
3dab1dad
YO
999
1000 for( state=1 ; state < next_alloc ; state ++ ) {
1001 U16 charid;
1002
ab3bbdeb 1003 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1004 (int)depth * 2 + 2,"", (UV)state );
1005 if ( ! trie->states[ state ].wordnum ) {
1006 PerlIO_printf( Perl_debug_log, "%5s| ","");
1007 } else {
1008 PerlIO_printf( Perl_debug_log, "W%4x| ",
1009 trie->states[ state ].wordnum
1010 );
1011 }
1012 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1013 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1014 if ( tmp ) {
1015 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1016 colwidth,
ddc5bc0f 1017 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1018 PL_colors[0], PL_colors[1],
1019 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1020 PERL_PV_ESCAPE_FIRSTCHAR
1021 ) ,
1e2e3d02
YO
1022 TRIE_LIST_ITEM(state,charid).forid,
1023 (UV)TRIE_LIST_ITEM(state,charid).newstate
1024 );
1025 if (!(charid % 10))
664e119d
RGS
1026 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1027 (int)((depth * 2) + 14), "");
1e2e3d02 1028 }
ab3bbdeb
YO
1029 }
1030 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1031 }
1032}
1033
1034/*
3dab1dad
YO
1035 Dumps a fully constructed but uncompressed trie in table form.
1036 This is the normal DFA style state transition table, with a few
1037 twists to facilitate compression later.
1038 Used for debugging make_trie().
1039*/
1040STATIC void
55eed653 1041S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1042 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1043 U32 depth)
3dab1dad
YO
1044{
1045 U32 state;
1046 U16 charid;
ab3bbdeb 1047 SV *sv=sv_newmortal();
55eed653 1048 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1049 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1050
1051 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1052
1053 /*
1054 print out the table precompression so that we can do a visual check
1055 that they are identical.
1056 */
1057
1058 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1059
1060 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1061 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1062 if ( tmp ) {
ab3bbdeb
YO
1063 PerlIO_printf( Perl_debug_log, "%*s",
1064 colwidth,
ddc5bc0f 1065 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1066 PL_colors[0], PL_colors[1],
1067 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1068 PERL_PV_ESCAPE_FIRSTCHAR
1069 )
1070 );
3dab1dad
YO
1071 }
1072 }
1073
1074 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1075
1076 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1077 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1078 }
1079
1080 PerlIO_printf( Perl_debug_log, "\n" );
1081
1082 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1083
1084 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1085 (int)depth * 2 + 2,"",
1086 (UV)TRIE_NODENUM( state ) );
1087
1088 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1089 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1090 if (v)
1091 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1092 else
1093 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1094 }
1095 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1096 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1097 } else {
1098 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1099 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100 }
1101 }
07be1b83 1102}
3dab1dad
YO
1103
1104#endif
1105
2e64971a 1106
786e8c11
YO
1107/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1108 startbranch: the first branch in the whole branch sequence
1109 first : start branch of sequence of branch-exact nodes.
1110 May be the same as startbranch
1111 last : Thing following the last branch.
1112 May be the same as tail.
1113 tail : item following the branch sequence
1114 count : words in the sequence
1115 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1116 depth : indent depth
3dab1dad 1117
786e8c11 1118Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1119
786e8c11
YO
1120A trie is an N'ary tree where the branches are determined by digital
1121decomposition of the key. IE, at the root node you look up the 1st character and
1122follow that branch repeat until you find the end of the branches. Nodes can be
1123marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1124
786e8c11 1125 /he|she|his|hers/
72f13be8 1126
786e8c11
YO
1127would convert into the following structure. Numbers represent states, letters
1128following numbers represent valid transitions on the letter from that state, if
1129the number is in square brackets it represents an accepting state, otherwise it
1130will be in parenthesis.
07be1b83 1131
786e8c11
YO
1132 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1133 | |
1134 | (2)
1135 | |
1136 (1) +-i->(6)-+-s->[7]
1137 |
1138 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1139
786e8c11
YO
1140 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1141
1142This shows that when matching against the string 'hers' we will begin at state 1
1143read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1144then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1145is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1146single traverse. We store a mapping from accepting to state to which word was
1147matched, and then when we have multiple possibilities we try to complete the
1148rest of the regex in the order in which they occured in the alternation.
1149
1150The only prior NFA like behaviour that would be changed by the TRIE support is
1151the silent ignoring of duplicate alternations which are of the form:
1152
1153 / (DUPE|DUPE) X? (?{ ... }) Y /x
1154
1155Thus EVAL blocks follwing a trie may be called a different number of times with
1156and without the optimisation. With the optimisations dupes will be silently
1157ignored. This inconsistant behaviour of EVAL type nodes is well established as
1158the following demonstrates:
1159
1160 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1161
1162which prints out 'word' three times, but
1163
1164 'words'=~/(word|word|word)(?{ print $1 })S/
1165
1166which doesnt print it out at all. This is due to other optimisations kicking in.
1167
1168Example of what happens on a structural level:
1169
1170The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1171
1172 1: CURLYM[1] {1,32767}(18)
1173 5: BRANCH(8)
1174 6: EXACT <ac>(16)
1175 8: BRANCH(11)
1176 9: EXACT <ad>(16)
1177 11: BRANCH(14)
1178 12: EXACT <ab>(16)
1179 16: SUCCEED(0)
1180 17: NOTHING(18)
1181 18: END(0)
1182
1183This would be optimizable with startbranch=5, first=5, last=16, tail=16
1184and should turn into:
1185
1186 1: CURLYM[1] {1,32767}(18)
1187 5: TRIE(16)
1188 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189 <ac>
1190 <ad>
1191 <ab>
1192 16: SUCCEED(0)
1193 17: NOTHING(18)
1194 18: END(0)
1195
1196Cases where tail != last would be like /(?foo|bar)baz/:
1197
1198 1: BRANCH(4)
1199 2: EXACT <foo>(8)
1200 4: BRANCH(7)
1201 5: EXACT <bar>(8)
1202 7: TAIL(8)
1203 8: EXACT <baz>(10)
1204 10: END(0)
1205
1206which would be optimizable with startbranch=1, first=1, last=7, tail=8
1207and would end up looking like:
1208
1209 1: TRIE(8)
1210 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1211 <foo>
1212 <bar>
1213 7: TAIL(8)
1214 8: EXACT <baz>(10)
1215 10: END(0)
1216
1217 d = uvuni_to_utf8_flags(d, uv, 0);
1218
1219is the recommended Unicode-aware way of saying
1220
1221 *(d++) = uv;
1222*/
1223
1e2e3d02 1224#define TRIE_STORE_REVCHAR \
786e8c11 1225 STMT_START { \
73031816
NC
1226 if (UTF) { \
1227 SV *zlopp = newSV(2); \
88c9ea1e
CB
1228 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1229 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1230 SvCUR_set(zlopp, kapow - flrbbbbb); \
1231 SvPOK_on(zlopp); \
1232 SvUTF8_on(zlopp); \
1233 av_push(revcharmap, zlopp); \
1234 } else { \
6bdeddd2 1235 char ooooff = (char)uvc; \
73031816
NC
1236 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1237 } \
1238 } STMT_END
786e8c11
YO
1239
1240#define TRIE_READ_CHAR STMT_START { \
1241 wordlen++; \
1242 if ( UTF ) { \
1243 if ( folder ) { \
1244 if ( foldlen > 0 ) { \
1245 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1246 foldlen -= len; \
1247 scan += len; \
1248 len = 0; \
1249 } else { \
1250 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1251 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1252 foldlen -= UNISKIP( uvc ); \
1253 scan = foldbuf + UNISKIP( uvc ); \
1254 } \
1255 } else { \
1256 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1257 } \
1258 } else { \
1259 uvc = (U32)*uc; \
1260 len = 1; \
1261 } \
1262} STMT_END
1263
1264
1265
1266#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1267 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1268 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1269 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1270 } \
1271 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1272 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1273 TRIE_LIST_CUR( state )++; \
1274} STMT_END
07be1b83 1275
786e8c11
YO
1276#define TRIE_LIST_NEW(state) STMT_START { \
1277 Newxz( trie->states[ state ].trans.list, \
1278 4, reg_trie_trans_le ); \
1279 TRIE_LIST_CUR( state ) = 1; \
1280 TRIE_LIST_LEN( state ) = 4; \
1281} STMT_END
07be1b83 1282
786e8c11
YO
1283#define TRIE_HANDLE_WORD(state) STMT_START { \
1284 U16 dupe= trie->states[ state ].wordnum; \
1285 regnode * const noper_next = regnext( noper ); \
1286 \
786e8c11
YO
1287 DEBUG_r({ \
1288 /* store the word for dumping */ \
1289 SV* tmp; \
1290 if (OP(noper) != NOTHING) \
740cce10 1291 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1292 else \
740cce10 1293 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1294 av_push( trie_words, tmp ); \
786e8c11
YO
1295 }); \
1296 \
1297 curword++; \
2e64971a
DM
1298 trie->wordinfo[curword].prev = 0; \
1299 trie->wordinfo[curword].len = wordlen; \
1300 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1301 \
1302 if ( noper_next < tail ) { \
1303 if (!trie->jump) \
c944940b 1304 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1305 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1306 if (!jumper) \
1307 jumper = noper_next; \
1308 if (!nextbranch) \
1309 nextbranch= regnext(cur); \
1310 } \
1311 \
1312 if ( dupe ) { \
2e64971a
DM
1313 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1314 /* chain, so that when the bits of chain are later */\
1315 /* linked together, the dups appear in the chain */\
1316 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1317 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1318 } else { \
1319 /* we haven't inserted this word yet. */ \
1320 trie->states[ state ].wordnum = curword; \
1321 } \
1322} STMT_END
07be1b83 1323
3dab1dad 1324
786e8c11
YO
1325#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1326 ( ( base + charid >= ucharcount \
1327 && base + charid < ubound \
1328 && state == trie->trans[ base - ucharcount + charid ].check \
1329 && trie->trans[ base - ucharcount + charid ].next ) \
1330 ? trie->trans[ base - ucharcount + charid ].next \
1331 : ( state==1 ? special : 0 ) \
1332 )
3dab1dad 1333
786e8c11
YO
1334#define MADE_TRIE 1
1335#define MADE_JUMP_TRIE 2
1336#define MADE_EXACT_TRIE 4
3dab1dad 1337
a3621e74 1338STATIC I32
786e8c11 1339S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1340{
27da23d5 1341 dVAR;
a3621e74
YO
1342 /* first pass, loop through and scan words */
1343 reg_trie_data *trie;
55eed653 1344 HV *widecharmap = NULL;
2b8b4781 1345 AV *revcharmap = newAV();
a3621e74 1346 regnode *cur;
9f7f3913 1347 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1348 STRLEN len = 0;
1349 UV uvc = 0;
1350 U16 curword = 0;
1351 U32 next_alloc = 0;
786e8c11
YO
1352 regnode *jumper = NULL;
1353 regnode *nextbranch = NULL;
7f69552c 1354 regnode *convert = NULL;
2e64971a 1355 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1356 /* we just use folder as a flag in utf8 */
e1ec3a88 1357 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1358 ? PL_fold
1359 : ( flags == EXACTFL
1360 ? PL_fold_locale
1361 : NULL
1362 )
1363 );
1364
2b8b4781
NC
1365#ifdef DEBUGGING
1366 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1367 AV *trie_words = NULL;
1368 /* along with revcharmap, this only used during construction but both are
1369 * useful during debugging so we store them in the struct when debugging.
8e11feef 1370 */
2b8b4781
NC
1371#else
1372 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1373 STRLEN trie_charcount=0;
3dab1dad 1374#endif
2b8b4781 1375 SV *re_trie_maxbuff;
a3621e74 1376 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1377
1378 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1379#ifndef DEBUGGING
1380 PERL_UNUSED_ARG(depth);
1381#endif
a3621e74 1382
c944940b 1383 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1384 trie->refcount = 1;
3dab1dad 1385 trie->startstate = 1;
786e8c11 1386 trie->wordcount = word_count;
f8fc2ecf 1387 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1388 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1389 if (!(UTF && folder))
c944940b 1390 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1391 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1392 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1393
a3621e74 1394 DEBUG_r({
2b8b4781 1395 trie_words = newAV();
a3621e74 1396 });
a3621e74 1397
0111c4fd 1398 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1399 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1400 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1401 }
3dab1dad
YO
1402 DEBUG_OPTIMISE_r({
1403 PerlIO_printf( Perl_debug_log,
786e8c11 1404 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1405 (int)depth * 2 + 2, "",
1406 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1407 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1408 (int)depth);
3dab1dad 1409 });
7f69552c
YO
1410
1411 /* Find the node we are going to overwrite */
1412 if ( first == startbranch && OP( last ) != BRANCH ) {
1413 /* whole branch chain */
1414 convert = first;
1415 } else {
1416 /* branch sub-chain */
1417 convert = NEXTOPER( first );
1418 }
1419
a3621e74
YO
1420 /* -- First loop and Setup --
1421
1422 We first traverse the branches and scan each word to determine if it
1423 contains widechars, and how many unique chars there are, this is
1424 important as we have to build a table with at least as many columns as we
1425 have unique chars.
1426
1427 We use an array of integers to represent the character codes 0..255
38a44b82 1428 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1429 native representation of the character value as the key and IV's for the
1430 coded index.
1431
1432 *TODO* If we keep track of how many times each character is used we can
1433 remap the columns so that the table compression later on is more
3b753521 1434 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1435 middle and the least common are on the outside. IMO this would be better
1436 than a most to least common mapping as theres a decent chance the most
1437 common letter will share a node with the least common, meaning the node
1438 will not be compressable. With a middle is most common approach the worst
1439 case is when we have the least common nodes twice.
1440
1441 */
1442
a3621e74 1443 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1444 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1445 const U8 *uc = (U8*)STRING( noper );
a28509cc 1446 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1447 STRLEN foldlen = 0;
1448 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1449 const U8 *scan = (U8*)NULL;
07be1b83 1450 U32 wordlen = 0; /* required init */
02daf0ab
YO
1451 STRLEN chars = 0;
1452 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1453
3dab1dad
YO
1454 if (OP(noper) == NOTHING) {
1455 trie->minlen= 0;
1456 continue;
1457 }
02daf0ab
YO
1458 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1459 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1460 regardless of encoding */
1461
a3621e74 1462 for ( ; uc < e ; uc += len ) {
3dab1dad 1463 TRIE_CHARCOUNT(trie)++;
a3621e74 1464 TRIE_READ_CHAR;
3dab1dad 1465 chars++;
a3621e74
YO
1466 if ( uvc < 256 ) {
1467 if ( !trie->charmap[ uvc ] ) {
1468 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1469 if ( folder )
1470 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1471 TRIE_STORE_REVCHAR;
a3621e74 1472 }
02daf0ab
YO
1473 if ( set_bit ) {
1474 /* store the codepoint in the bitmap, and if its ascii
1475 also store its folded equivelent. */
1476 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1477
1478 /* store the folded codepoint */
1479 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1480
1481 if ( !UTF ) {
1482 /* store first byte of utf8 representation of
1483 codepoints in the 127 < uvc < 256 range */
1484 if (127 < uvc && uvc < 192) {
1485 TRIE_BITMAP_SET(trie,194);
1486 } else if (191 < uvc ) {
1487 TRIE_BITMAP_SET(trie,195);
1488 /* && uvc < 256 -- we know uvc is < 256 already */
1489 }
1490 }
02daf0ab
YO
1491 set_bit = 0; /* We've done our bit :-) */
1492 }
a3621e74
YO
1493 } else {
1494 SV** svpp;
55eed653
NC
1495 if ( !widecharmap )
1496 widecharmap = newHV();
a3621e74 1497
55eed653 1498 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1499
1500 if ( !svpp )
e4584336 1501 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1502
1503 if ( !SvTRUE( *svpp ) ) {
1504 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1505 TRIE_STORE_REVCHAR;
a3621e74
YO
1506 }
1507 }
1508 }
3dab1dad
YO
1509 if( cur == first ) {
1510 trie->minlen=chars;
1511 trie->maxlen=chars;
1512 } else if (chars < trie->minlen) {
1513 trie->minlen=chars;
1514 } else if (chars > trie->maxlen) {
1515 trie->maxlen=chars;
1516 }
1517
a3621e74
YO
1518 } /* end first pass */
1519 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1520 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1521 (int)depth * 2 + 2,"",
55eed653 1522 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1523 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1524 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1525 );
a3621e74
YO
1526
1527 /*
1528 We now know what we are dealing with in terms of unique chars and
1529 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1530 representation using a flat table will take. If it's over a reasonable
1531 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1532 conservative but potentially much slower representation using an array
1533 of lists.
1534
1535 At the end we convert both representations into the same compressed
1536 form that will be used in regexec.c for matching with. The latter
1537 is a form that cannot be used to construct with but has memory
1538 properties similar to the list form and access properties similar
1539 to the table form making it both suitable for fast searches and
1540 small enough that its feasable to store for the duration of a program.
1541
1542 See the comment in the code where the compressed table is produced
1543 inplace from the flat tabe representation for an explanation of how
1544 the compression works.
1545
1546 */
1547
1548
2e64971a
DM
1549 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1550 prev_states[1] = 0;
1551
3dab1dad 1552 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1553 /*
1554 Second Pass -- Array Of Lists Representation
1555
1556 Each state will be represented by a list of charid:state records
1557 (reg_trie_trans_le) the first such element holds the CUR and LEN
1558 points of the allocated array. (See defines above).
1559
1560 We build the initial structure using the lists, and then convert
1561 it into the compressed table form which allows faster lookups
1562 (but cant be modified once converted).
a3621e74
YO
1563 */
1564
a3621e74
YO
1565 STRLEN transcount = 1;
1566
1e2e3d02
YO
1567 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1568 "%*sCompiling trie using list compiler\n",
1569 (int)depth * 2 + 2, ""));
446bd890 1570
c944940b
JH
1571 trie->states = (reg_trie_state *)
1572 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1573 sizeof(reg_trie_state) );
a3621e74
YO
1574 TRIE_LIST_NEW(1);
1575 next_alloc = 2;
1576
1577 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1578
c445ea15
AL
1579 regnode * const noper = NEXTOPER( cur );
1580 U8 *uc = (U8*)STRING( noper );
1581 const U8 * const e = uc + STR_LEN( noper );
1582 U32 state = 1; /* required init */
1583 U16 charid = 0; /* sanity init */
1584 U8 *scan = (U8*)NULL; /* sanity init */
1585 STRLEN foldlen = 0; /* required init */
07be1b83 1586 U32 wordlen = 0; /* required init */
c445ea15
AL
1587 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1588
3dab1dad 1589 if (OP(noper) != NOTHING) {
786e8c11 1590 for ( ; uc < e ; uc += len ) {
c445ea15 1591
786e8c11 1592 TRIE_READ_CHAR;
c445ea15 1593
786e8c11
YO
1594 if ( uvc < 256 ) {
1595 charid = trie->charmap[ uvc ];
c445ea15 1596 } else {
55eed653 1597 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1598 if ( !svpp ) {
1599 charid = 0;
1600 } else {
1601 charid=(U16)SvIV( *svpp );
1602 }
c445ea15 1603 }
786e8c11
YO
1604 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1605 if ( charid ) {
a3621e74 1606
786e8c11
YO
1607 U16 check;
1608 U32 newstate = 0;
a3621e74 1609
786e8c11
YO
1610 charid--;
1611 if ( !trie->states[ state ].trans.list ) {
1612 TRIE_LIST_NEW( state );
c445ea15 1613 }
786e8c11
YO
1614 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1615 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1616 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1617 break;
1618 }
1619 }
1620 if ( ! newstate ) {
1621 newstate = next_alloc++;
2e64971a 1622 prev_states[newstate] = state;
786e8c11
YO
1623 TRIE_LIST_PUSH( state, charid, newstate );
1624 transcount++;
1625 }
1626 state = newstate;
1627 } else {
1628 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1629 }
a28509cc 1630 }
c445ea15 1631 }
3dab1dad 1632 TRIE_HANDLE_WORD(state);
a3621e74
YO
1633
1634 } /* end second pass */
1635
1e2e3d02
YO
1636 /* next alloc is the NEXT state to be allocated */
1637 trie->statecount = next_alloc;
c944940b
JH
1638 trie->states = (reg_trie_state *)
1639 PerlMemShared_realloc( trie->states,
1640 next_alloc
1641 * sizeof(reg_trie_state) );
a3621e74 1642
3dab1dad 1643 /* and now dump it out before we compress it */
2b8b4781
NC
1644 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1645 revcharmap, next_alloc,
1646 depth+1)
1e2e3d02 1647 );
a3621e74 1648
c944940b
JH
1649 trie->trans = (reg_trie_trans *)
1650 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1651 {
1652 U32 state;
a3621e74
YO
1653 U32 tp = 0;
1654 U32 zp = 0;
1655
1656
1657 for( state=1 ; state < next_alloc ; state ++ ) {
1658 U32 base=0;
1659
1660 /*
1661 DEBUG_TRIE_COMPILE_MORE_r(
1662 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1663 );
1664 */
1665
1666 if (trie->states[state].trans.list) {
1667 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1668 U16 maxid=minid;
a28509cc 1669 U16 idx;
a3621e74
YO
1670
1671 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1672 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1673 if ( forid < minid ) {
1674 minid=forid;
1675 } else if ( forid > maxid ) {
1676 maxid=forid;
1677 }
a3621e74
YO
1678 }
1679 if ( transcount < tp + maxid - minid + 1) {
1680 transcount *= 2;
c944940b
JH
1681 trie->trans = (reg_trie_trans *)
1682 PerlMemShared_realloc( trie->trans,
446bd890
NC
1683 transcount
1684 * sizeof(reg_trie_trans) );
a3621e74
YO
1685 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1686 }
1687 base = trie->uniquecharcount + tp - minid;
1688 if ( maxid == minid ) {
1689 U32 set = 0;
1690 for ( ; zp < tp ; zp++ ) {
1691 if ( ! trie->trans[ zp ].next ) {
1692 base = trie->uniquecharcount + zp - minid;
1693 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ zp ].check = state;
1695 set = 1;
1696 break;
1697 }
1698 }
1699 if ( !set ) {
1700 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1701 trie->trans[ tp ].check = state;
1702 tp++;
1703 zp = tp;
1704 }
1705 } else {
1706 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1707 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1708 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1709 trie->trans[ tid ].check = state;
1710 }
1711 tp += ( maxid - minid + 1 );
1712 }
1713 Safefree(trie->states[ state ].trans.list);
1714 }
1715 /*
1716 DEBUG_TRIE_COMPILE_MORE_r(
1717 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1718 );
1719 */
1720 trie->states[ state ].trans.base=base;
1721 }
cc601c31 1722 trie->lasttrans = tp + 1;
a3621e74
YO
1723 }
1724 } else {
1725 /*
1726 Second Pass -- Flat Table Representation.
1727
1728 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1729 We know that we will need Charcount+1 trans at most to store the data
1730 (one row per char at worst case) So we preallocate both structures
1731 assuming worst case.
1732
1733 We then construct the trie using only the .next slots of the entry
1734 structs.
1735
3b753521 1736 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1737 make compression both faster and easier by keeping track of how many non
1738 zero fields are in the node.
1739
1740 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1741 transition.
1742
1743 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1744 number representing the first entry of the node, and state as a
1745 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1746 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1747 are 2 entrys per node. eg:
1748
1749 A B A B
1750 1. 2 4 1. 3 7
1751 2. 0 3 3. 0 5
1752 3. 0 0 5. 0 0
1753 4. 0 0 7. 0 0
1754
1755 The table is internally in the right hand, idx form. However as we also
1756 have to deal with the states array which is indexed by nodenum we have to
1757 use TRIE_NODENUM() to convert.
1758
1759 */
1e2e3d02
YO
1760 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761 "%*sCompiling trie using table compiler\n",
1762 (int)depth * 2 + 2, ""));
3dab1dad 1763
c944940b
JH
1764 trie->trans = (reg_trie_trans *)
1765 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1766 * trie->uniquecharcount + 1,
1767 sizeof(reg_trie_trans) );
1768 trie->states = (reg_trie_state *)
1769 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1770 sizeof(reg_trie_state) );
a3621e74
YO
1771 next_alloc = trie->uniquecharcount + 1;
1772
3dab1dad 1773
a3621e74
YO
1774 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1775
c445ea15 1776 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1777 const U8 *uc = (U8*)STRING( noper );
1778 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1779
1780 U32 state = 1; /* required init */
1781
1782 U16 charid = 0; /* sanity init */
1783 U32 accept_state = 0; /* sanity init */
1784 U8 *scan = (U8*)NULL; /* sanity init */
1785
1786 STRLEN foldlen = 0; /* required init */
07be1b83 1787 U32 wordlen = 0; /* required init */
a3621e74
YO
1788 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1789
3dab1dad 1790 if ( OP(noper) != NOTHING ) {
786e8c11 1791 for ( ; uc < e ; uc += len ) {
a3621e74 1792
786e8c11 1793 TRIE_READ_CHAR;
a3621e74 1794
786e8c11
YO
1795 if ( uvc < 256 ) {
1796 charid = trie->charmap[ uvc ];
1797 } else {
55eed653 1798 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1799 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1800 }
786e8c11
YO
1801 if ( charid ) {
1802 charid--;
1803 if ( !trie->trans[ state + charid ].next ) {
1804 trie->trans[ state + charid ].next = next_alloc;
1805 trie->trans[ state ].check++;
2e64971a
DM
1806 prev_states[TRIE_NODENUM(next_alloc)]
1807 = TRIE_NODENUM(state);
786e8c11
YO
1808 next_alloc += trie->uniquecharcount;
1809 }
1810 state = trie->trans[ state + charid ].next;
1811 } else {
1812 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1813 }
1814 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1815 }
a3621e74 1816 }
3dab1dad
YO
1817 accept_state = TRIE_NODENUM( state );
1818 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1819
1820 } /* end second pass */
1821
3dab1dad 1822 /* and now dump it out before we compress it */
2b8b4781
NC
1823 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1824 revcharmap,
1825 next_alloc, depth+1));
a3621e74 1826
a3621e74
YO
1827 {
1828 /*
1829 * Inplace compress the table.*
1830
1831 For sparse data sets the table constructed by the trie algorithm will
1832 be mostly 0/FAIL transitions or to put it another way mostly empty.
1833 (Note that leaf nodes will not contain any transitions.)
1834
1835 This algorithm compresses the tables by eliminating most such
1836 transitions, at the cost of a modest bit of extra work during lookup:
1837
1838 - Each states[] entry contains a .base field which indicates the
1839 index in the state[] array wheres its transition data is stored.
1840
3b753521 1841 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1842
1843 - If .base is nonzero then charid is added to it to find an entry in
1844 the trans array.
1845
1846 -If trans[states[state].base+charid].check!=state then the
1847 transition is taken to be a 0/Fail transition. Thus if there are fail
1848 transitions at the front of the node then the .base offset will point
1849 somewhere inside the previous nodes data (or maybe even into a node
1850 even earlier), but the .check field determines if the transition is
1851 valid.
1852
786e8c11 1853 XXX - wrong maybe?
a3621e74 1854 The following process inplace converts the table to the compressed
3b753521 1855 table: We first do not compress the root node 1,and mark all its
a3621e74 1856 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1857 allows us to do a DFA construction from the compressed table later,
1858 and ensures that any .base pointers we calculate later are greater
1859 than 0.
a3621e74
YO
1860
1861 - We set 'pos' to indicate the first entry of the second node.
1862
1863 - We then iterate over the columns of the node, finding the first and
1864 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1865 and set the .check pointers accordingly, and advance pos
1866 appropriately and repreat for the next node. Note that when we copy
1867 the next pointers we have to convert them from the original
1868 NODEIDX form to NODENUM form as the former is not valid post
1869 compression.
1870
1871 - If a node has no transitions used we mark its base as 0 and do not
1872 advance the pos pointer.
1873
1874 - If a node only has one transition we use a second pointer into the
1875 structure to fill in allocated fail transitions from other states.
1876 This pointer is independent of the main pointer and scans forward
1877 looking for null transitions that are allocated to a state. When it
1878 finds one it writes the single transition into the "hole". If the
786e8c11 1879 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1880
1881 - Once compressed we can Renew/realloc the structures to release the
1882 excess space.
1883
1884 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1885 specifically Fig 3.47 and the associated pseudocode.
1886
1887 demq
1888 */
a3b680e6 1889 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1890 U32 state, charid;
a3621e74 1891 U32 pos = 0, zp=0;
1e2e3d02 1892 trie->statecount = laststate;
a3621e74
YO
1893
1894 for ( state = 1 ; state < laststate ; state++ ) {
1895 U8 flag = 0;
a28509cc
AL
1896 const U32 stateidx = TRIE_NODEIDX( state );
1897 const U32 o_used = trie->trans[ stateidx ].check;
1898 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1899 trie->trans[ stateidx ].check = 0;
1900
1901 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1902 if ( flag || trie->trans[ stateidx + charid ].next ) {
1903 if ( trie->trans[ stateidx + charid ].next ) {
1904 if (o_used == 1) {
1905 for ( ; zp < pos ; zp++ ) {
1906 if ( ! trie->trans[ zp ].next ) {
1907 break;
1908 }
1909 }
1910 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1911 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1912 trie->trans[ zp ].check = state;
1913 if ( ++zp > pos ) pos = zp;
1914 break;
1915 }
1916 used--;
1917 }
1918 if ( !flag ) {
1919 flag = 1;
1920 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1921 }
1922 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1923 trie->trans[ pos ].check = state;
1924 pos++;
1925 }
1926 }
1927 }
cc601c31 1928 trie->lasttrans = pos + 1;
c944940b
JH
1929 trie->states = (reg_trie_state *)
1930 PerlMemShared_realloc( trie->states, laststate
1931 * sizeof(reg_trie_state) );
a3621e74 1932 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1933 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1934 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1935 (int)depth * 2 + 2,"",
1936 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1937 (IV)next_alloc,
1938 (IV)pos,
a3621e74
YO
1939 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1940 );
1941
1942 } /* end table compress */
1943 }
1e2e3d02
YO
1944 DEBUG_TRIE_COMPILE_MORE_r(
1945 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1946 (int)depth * 2 + 2, "",
1947 (UV)trie->statecount,
1948 (UV)trie->lasttrans)
1949 );
cc601c31 1950 /* resize the trans array to remove unused space */
c944940b
JH
1951 trie->trans = (reg_trie_trans *)
1952 PerlMemShared_realloc( trie->trans, trie->lasttrans
1953 * sizeof(reg_trie_trans) );
a3621e74 1954
3b753521 1955 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1956 U8 nodetype =(U8)(flags & 0xFF);
1957 char *str=NULL;
786e8c11 1958
07be1b83 1959#ifdef DEBUGGING
e62cc96a 1960 regnode *optimize = NULL;
7122b237
YO
1961#ifdef RE_TRACK_PATTERN_OFFSETS
1962
b57a0404
JH
1963 U32 mjd_offset = 0;
1964 U32 mjd_nodelen = 0;
7122b237
YO
1965#endif /* RE_TRACK_PATTERN_OFFSETS */
1966#endif /* DEBUGGING */
a3621e74 1967 /*
3dab1dad
YO
1968 This means we convert either the first branch or the first Exact,
1969 depending on whether the thing following (in 'last') is a branch
1970 or not and whther first is the startbranch (ie is it a sub part of
1971 the alternation or is it the whole thing.)
3b753521 1972 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1973 the whole branch sequence, including the first.
a3621e74 1974 */
3dab1dad 1975 /* Find the node we are going to overwrite */
7f69552c 1976 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1977 /* branch sub-chain */
3dab1dad 1978 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1979#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1980 DEBUG_r({
1981 mjd_offset= Node_Offset((convert));
1982 mjd_nodelen= Node_Length((convert));
1983 });
7122b237 1984#endif
7f69552c 1985 /* whole branch chain */
7122b237
YO
1986 }
1987#ifdef RE_TRACK_PATTERN_OFFSETS
1988 else {
7f69552c
YO
1989 DEBUG_r({
1990 const regnode *nop = NEXTOPER( convert );
1991 mjd_offset= Node_Offset((nop));
1992 mjd_nodelen= Node_Length((nop));
1993 });
07be1b83
YO
1994 }
1995 DEBUG_OPTIMISE_r(
1996 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1997 (int)depth * 2 + 2, "",
786e8c11 1998 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1999 );
7122b237 2000#endif
3dab1dad
YO
2001 /* But first we check to see if there is a common prefix we can
2002 split out as an EXACT and put in front of the TRIE node. */
2003 trie->startstate= 1;
55eed653 2004 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2005 U32 state;
1e2e3d02 2006 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2007 U32 ofs = 0;
8e11feef
RGS
2008 I32 idx = -1;
2009 U32 count = 0;
2010 const U32 base = trie->states[ state ].trans.base;
a3621e74 2011
3dab1dad 2012 if ( trie->states[state].wordnum )
8e11feef 2013 count = 1;
a3621e74 2014
8e11feef 2015 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2016 if ( ( base + ofs >= trie->uniquecharcount ) &&
2017 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2018 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2019 {
3dab1dad 2020 if ( ++count > 1 ) {
2b8b4781 2021 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2022 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2023 if ( state == 1 ) break;
3dab1dad
YO
2024 if ( count == 2 ) {
2025 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2026 DEBUG_OPTIMISE_r(
8e11feef
RGS
2027 PerlIO_printf(Perl_debug_log,
2028 "%*sNew Start State=%"UVuf" Class: [",
2029 (int)depth * 2 + 2, "",
786e8c11 2030 (UV)state));
be8e71aa 2031 if (idx >= 0) {
2b8b4781 2032 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2033 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2034
3dab1dad 2035 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2036 if ( folder )
2037 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2038 DEBUG_OPTIMISE_r(
f1f66076 2039 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2040 );
8e11feef
RGS
2041 }
2042 }
2043 TRIE_BITMAP_SET(trie,*ch);
2044 if ( folder )
2045 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2046 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2047 }
2048 idx = ofs;
2049 }
3dab1dad
YO
2050 }
2051 if ( count == 1 ) {
2b8b4781 2052 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2053 STRLEN len;
2054 char *ch = SvPV( *tmp, len );
de734bd5
A
2055 DEBUG_OPTIMISE_r({
2056 SV *sv=sv_newmortal();
8e11feef
RGS
2057 PerlIO_printf( Perl_debug_log,
2058 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2059 (int)depth * 2 + 2, "",
de734bd5
A
2060 (UV)state, (UV)idx,
2061 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2062 PL_colors[0], PL_colors[1],
2063 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2064 PERL_PV_ESCAPE_FIRSTCHAR
2065 )
2066 );
2067 });
3dab1dad
YO
2068 if ( state==1 ) {
2069 OP( convert ) = nodetype;
2070 str=STRING(convert);
2071 STR_LEN(convert)=0;
2072 }
c490c714
YO
2073 STR_LEN(convert) += len;
2074 while (len--)
de734bd5 2075 *str++ = *ch++;
8e11feef 2076 } else {
f9049ba1 2077#ifdef DEBUGGING
8e11feef
RGS
2078 if (state>1)
2079 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2080#endif
8e11feef
RGS
2081 break;
2082 }
2083 }
2e64971a 2084 trie->prefixlen = (state-1);
3dab1dad 2085 if (str) {
8e11feef 2086 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2087 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2088 trie->startstate = state;
07be1b83
YO
2089 trie->minlen -= (state - 1);
2090 trie->maxlen -= (state - 1);
33809eae
JH
2091#ifdef DEBUGGING
2092 /* At least the UNICOS C compiler choked on this
2093 * being argument to DEBUG_r(), so let's just have
2094 * it right here. */
2095 if (
2096#ifdef PERL_EXT_RE_BUILD
2097 1
2098#else
2099 DEBUG_r_TEST
2100#endif
2101 ) {
2102 regnode *fix = convert;
2103 U32 word = trie->wordcount;
2104 mjd_nodelen++;
2105 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2106 while( ++fix < n ) {
2107 Set_Node_Offset_Length(fix, 0, 0);
2108 }
2109 while (word--) {
2110 SV ** const tmp = av_fetch( trie_words, word, 0 );
2111 if (tmp) {
2112 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2113 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2114 else
2115 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116 }
2117 }
2118 }
2119#endif
8e11feef
RGS
2120 if (trie->maxlen) {
2121 convert = n;
2122 } else {
3dab1dad 2123 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2124 DEBUG_r(optimize= n);
3dab1dad
YO
2125 }
2126 }
2127 }
a5ca303d
YO
2128 if (!jumper)
2129 jumper = last;
3dab1dad 2130 if ( trie->maxlen ) {
8e11feef
RGS
2131 NEXT_OFF( convert ) = (U16)(tail - convert);
2132 ARG_SET( convert, data_slot );
786e8c11
YO
2133 /* Store the offset to the first unabsorbed branch in
2134 jump[0], which is otherwise unused by the jump logic.
2135 We use this when dumping a trie and during optimisation. */
2136 if (trie->jump)
7f69552c 2137 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2138
6c48061a
YO
2139 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2140 * and there is a bitmap
2141 * and the first "jump target" node we found leaves enough room
2142 * then convert the TRIE node into a TRIEC node, with the bitmap
2143 * embedded inline in the opcode - this is hypothetically faster.
2144 */
2145 if ( !trie->states[trie->startstate].wordnum
2146 && trie->bitmap
2147 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2148 {
2149 OP( convert ) = TRIEC;
2150 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2151 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2152 trie->bitmap= NULL;
2153 } else
2154 OP( convert ) = TRIE;
a3621e74 2155
3dab1dad
YO
2156 /* store the type in the flags */
2157 convert->flags = nodetype;
a5ca303d
YO
2158 DEBUG_r({
2159 optimize = convert
2160 + NODE_STEP_REGNODE
2161 + regarglen[ OP( convert ) ];
2162 });
2163 /* XXX We really should free up the resource in trie now,
2164 as we won't use them - (which resources?) dmq */
3dab1dad 2165 }
a3621e74 2166 /* needed for dumping*/
e62cc96a 2167 DEBUG_r(if (optimize) {
07be1b83 2168 regnode *opt = convert;
bcdf7404 2169
e62cc96a 2170 while ( ++opt < optimize) {
07be1b83
YO
2171 Set_Node_Offset_Length(opt,0,0);
2172 }
786e8c11
YO
2173 /*
2174 Try to clean up some of the debris left after the
2175 optimisation.
a3621e74 2176 */
786e8c11 2177 while( optimize < jumper ) {
07be1b83 2178 mjd_nodelen += Node_Length((optimize));
a3621e74 2179 OP( optimize ) = OPTIMIZED;
07be1b83 2180 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2181 optimize++;
2182 }
07be1b83 2183 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2184 });
2185 } /* end node insert */
2e64971a
DM
2186
2187 /* Finish populating the prev field of the wordinfo array. Walk back
2188 * from each accept state until we find another accept state, and if
2189 * so, point the first word's .prev field at the second word. If the
2190 * second already has a .prev field set, stop now. This will be the
2191 * case either if we've already processed that word's accept state,
3b753521
FN
2192 * or that state had multiple words, and the overspill words were
2193 * already linked up earlier.
2e64971a
DM
2194 */
2195 {
2196 U16 word;
2197 U32 state;
2198 U16 prev;
2199
2200 for (word=1; word <= trie->wordcount; word++) {
2201 prev = 0;
2202 if (trie->wordinfo[word].prev)
2203 continue;
2204 state = trie->wordinfo[word].accept;
2205 while (state) {
2206 state = prev_states[state];
2207 if (!state)
2208 break;
2209 prev = trie->states[state].wordnum;
2210 if (prev)
2211 break;
2212 }
2213 trie->wordinfo[word].prev = prev;
2214 }
2215 Safefree(prev_states);
2216 }
2217
2218
2219 /* and now dump out the compressed format */
2220 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2221
55eed653 2222 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2223#ifdef DEBUGGING
2224 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2225 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2226#else
2227 SvREFCNT_dec(revcharmap);
07be1b83 2228#endif
786e8c11
YO
2229 return trie->jump
2230 ? MADE_JUMP_TRIE
2231 : trie->startstate>1
2232 ? MADE_EXACT_TRIE
2233 : MADE_TRIE;
2234}
2235
2236STATIC void
2237S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2238{
3b753521 2239/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2240
2241 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2242 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2243 ISBN 0-201-10088-6
2244
2245 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2246 suffix of the current state's 'word' that is also a proper prefix of another word in our
2247 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2248 the DFA not to have to restart after its tried and failed a word at a given point, it
2249 simply continues as though it had been matching the other word in the first place.
2250 Consider
2251 'abcdgu'=~/abcdefg|cdgu/
2252 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2253 fail, which would bring us to the state representing 'd' in the second word where we would
2254 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2255 */
2256 /* add a fail transition */
3251b653
NC
2257 const U32 trie_offset = ARG(source);
2258 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2259 U32 *q;
2260 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2261 const U32 numstates = trie->statecount;
786e8c11
YO
2262 const U32 ubound = trie->lasttrans + ucharcount;
2263 U32 q_read = 0;
2264 U32 q_write = 0;
2265 U32 charid;
2266 U32 base = trie->states[ 1 ].trans.base;
2267 U32 *fail;
2268 reg_ac_data *aho;
2269 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2270 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2271
2272 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2273#ifndef DEBUGGING
2274 PERL_UNUSED_ARG(depth);
2275#endif
2276
2277
2278 ARG_SET( stclass, data_slot );
c944940b 2279 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2280 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2281 aho->trie=trie_offset;
446bd890
NC
2282 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2283 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2284 Newxz( q, numstates, U32);
c944940b 2285 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2286 aho->refcount = 1;
2287 fail = aho->fail;
2288 /* initialize fail[0..1] to be 1 so that we always have
2289 a valid final fail state */
2290 fail[ 0 ] = fail[ 1 ] = 1;
2291
2292 for ( charid = 0; charid < ucharcount ; charid++ ) {
2293 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2294 if ( newstate ) {
2295 q[ q_write ] = newstate;
2296 /* set to point at the root */
2297 fail[ q[ q_write++ ] ]=1;
2298 }
2299 }
2300 while ( q_read < q_write) {
2301 const U32 cur = q[ q_read++ % numstates ];
2302 base = trie->states[ cur ].trans.base;
2303
2304 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2305 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2306 if (ch_state) {
2307 U32 fail_state = cur;
2308 U32 fail_base;
2309 do {
2310 fail_state = fail[ fail_state ];
2311 fail_base = aho->states[ fail_state ].trans.base;
2312 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2313
2314 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2315 fail[ ch_state ] = fail_state;
2316 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2317 {
2318 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2319 }
2320 q[ q_write++ % numstates] = ch_state;
2321 }
2322 }
2323 }
2324 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2325 when we fail in state 1, this allows us to use the
2326 charclass scan to find a valid start char. This is based on the principle
2327 that theres a good chance the string being searched contains lots of stuff
2328 that cant be a start char.
2329 */
2330 fail[ 0 ] = fail[ 1 ] = 0;
2331 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2332 PerlIO_printf(Perl_debug_log,
2333 "%*sStclass Failtable (%"UVuf" states): 0",
2334 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2335 );
786e8c11
YO
2336 for( q_read=1; q_read<numstates; q_read++ ) {
2337 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2338 }
2339 PerlIO_printf(Perl_debug_log, "\n");
2340 });
2341 Safefree(q);
2342 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2343}
2344
786e8c11 2345
a3621e74 2346/*
5d1c421c
JH
2347 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2348 * These need to be revisited when a newer toolchain becomes available.
2349 */
2350#if defined(__sparc64__) && defined(__GNUC__)
2351# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2352# undef SPARC64_GCC_WORKAROUND
2353# define SPARC64_GCC_WORKAROUND 1
2354# endif
2355#endif
2356
07be1b83 2357#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2358 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2359 SV * const mysv=sv_newmortal(); \
2360 regnode *Next = regnext(scan); \
2361 regprop(RExC_rx, mysv, scan); \
7f69552c 2362 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2363 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2364 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2365 }});
07be1b83 2366
1de06328
YO
2367
2368
2369
2370
07be1b83
YO
2371#define JOIN_EXACT(scan,min,flags) \
2372 if (PL_regkind[OP(scan)] == EXACT) \
2373 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2374
be8e71aa 2375STATIC U32
07be1b83
YO
2376S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2377 /* Merge several consecutive EXACTish nodes into one. */
2378 regnode *n = regnext(scan);
2379 U32 stringok = 1;
2380 regnode *next = scan + NODE_SZ_STR(scan);
2381 U32 merged = 0;
2382 U32 stopnow = 0;
2383#ifdef DEBUGGING
2384 regnode *stop = scan;
72f13be8 2385 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2386#else
d47053eb
RGS
2387 PERL_UNUSED_ARG(depth);
2388#endif
7918f24d
NC
2389
2390 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2391#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2392 PERL_UNUSED_ARG(flags);
2393 PERL_UNUSED_ARG(val);
07be1b83 2394#endif
07be1b83
YO
2395 DEBUG_PEEP("join",scan,depth);
2396
2397 /* Skip NOTHING, merge EXACT*. */
2398 while (n &&
2399 ( PL_regkind[OP(n)] == NOTHING ||
2400 (stringok && (OP(n) == OP(scan))))
2401 && NEXT_OFF(n)
2402 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2403
2404 if (OP(n) == TAIL || n > next)
2405 stringok = 0;
2406 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2407 DEBUG_PEEP("skip:",n,depth);
2408 NEXT_OFF(scan) += NEXT_OFF(n);
2409 next = n + NODE_STEP_REGNODE;
2410#ifdef DEBUGGING
2411 if (stringok)
2412 stop = n;
2413#endif
2414 n = regnext(n);
2415 }
2416 else if (stringok) {
786e8c11 2417 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2418 regnode * const nnext = regnext(n);
2419
2420 DEBUG_PEEP("merg",n,depth);
2421
2422 merged++;
2423 if (oldl + STR_LEN(n) > U8_MAX)
2424 break;
2425 NEXT_OFF(scan) += NEXT_OFF(n);
2426 STR_LEN(scan) += STR_LEN(n);
2427 next = n + NODE_SZ_STR(n);
2428 /* Now we can overwrite *n : */
2429 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2430#ifdef DEBUGGING
2431 stop = next - 1;
2432#endif
2433 n = nnext;
2434 if (stopnow) break;
2435 }
2436
d47053eb
RGS
2437#ifdef EXPERIMENTAL_INPLACESCAN
2438 if (flags && !NEXT_OFF(n)) {
2439 DEBUG_PEEP("atch", val, depth);
2440 if (reg_off_by_arg[OP(n)]) {
2441 ARG_SET(n, val - n);
2442 }
2443 else {
2444 NEXT_OFF(n) = val - n;
2445 }
2446 stopnow = 1;
2447 }
07be1b83
YO
2448#endif
2449 }
2450
2451 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2452 /*
2453 Two problematic code points in Unicode casefolding of EXACT nodes:
2454
2455 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2456 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2457
2458 which casefold to
2459
2460 Unicode UTF-8
2461
2462 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2463 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2464
2465 This means that in case-insensitive matching (or "loose matching",
2466 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2467 length of the above casefolded versions) can match a target string
2468 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2469 This would rather mess up the minimum length computation.
2470
2471 What we'll do is to look for the tail four bytes, and then peek
2472 at the preceding two bytes to see whether we need to decrease
2473 the minimum length by four (six minus two).
2474
2475 Thanks to the design of UTF-8, there cannot be false matches:
2476 A sequence of valid UTF-8 bytes cannot be a subsequence of
2477 another valid sequence of UTF-8 bytes.
2478
2479 */
2480 char * const s0 = STRING(scan), *s, *t;
2481 char * const s1 = s0 + STR_LEN(scan) - 1;
2482 char * const s2 = s1 - 4;
e294cc5d
JH
2483#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2484 const char t0[] = "\xaf\x49\xaf\x42";
2485#else
07be1b83 2486 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2487#endif
07be1b83
YO
2488 const char * const t1 = t0 + 3;
2489
2490 for (s = s0 + 2;
2491 s < s2 && (t = ninstr(s, s1, t0, t1));
2492 s = t + 4) {
e294cc5d
JH
2493#ifdef EBCDIC
2494 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2495 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2496#else
07be1b83
YO
2497 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2498 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2499#endif
07be1b83
YO
2500 *min -= 4;
2501 }
2502 }
2503
2504#ifdef DEBUGGING
2505 /* Allow dumping */
2506 n = scan + NODE_SZ_STR(scan);
2507 while (n <= stop) {
2508 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2509 OP(n) = OPTIMIZED;
2510 NEXT_OFF(n) = 0;
2511 }
2512 n++;
2513 }
2514#endif
2515 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2516 return stopnow;
2517}
2518
653099ff
GS
2519/* REx optimizer. Converts nodes into quickier variants "in place".
2520 Finds fixed substrings. */
2521
a0288114 2522/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2523 to the position after last scanned or to NULL. */
2524
40d049e4
YO
2525#define INIT_AND_WITHP \
2526 assert(!and_withp); \
2527 Newx(and_withp,1,struct regnode_charclass_class); \
2528 SAVEFREEPV(and_withp)
07be1b83 2529
b515a41d
YO
2530/* this is a chain of data about sub patterns we are processing that
2531 need to be handled seperately/specially in study_chunk. Its so
2532 we can simulate recursion without losing state. */
2533struct scan_frame;
2534typedef struct scan_frame {
2535 regnode *last; /* last node to process in this frame */
2536 regnode *next; /* next node to process when last is reached */
2537 struct scan_frame *prev; /*previous frame*/
2538 I32 stop; /* what stopparen do we use */
2539} scan_frame;
2540
304ee84b
YO
2541
2542#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2543
e1d1eefb
YO
2544#define CASE_SYNST_FNC(nAmE) \
2545case nAmE: \
2546 if (flags & SCF_DO_STCLASS_AND) { \
2547 for (value = 0; value < 256; value++) \
2548 if (!is_ ## nAmE ## _cp(value)) \
2549 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2550 } \
2551 else { \
2552 for (value = 0; value < 256; value++) \
2553 if (is_ ## nAmE ## _cp(value)) \
2554 ANYOF_BITMAP_SET(data->start_class, value); \
2555 } \
2556 break; \
2557case N ## nAmE: \
2558 if (flags & SCF_DO_STCLASS_AND) { \
2559 for (value = 0; value < 256; value++) \
2560 if (is_ ## nAmE ## _cp(value)) \
2561 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2562 } \
2563 else { \
2564 for (value = 0; value < 256; value++) \
2565 if (!is_ ## nAmE ## _cp(value)) \
2566 ANYOF_BITMAP_SET(data->start_class, value); \
2567 } \
2568 break
2569
2570
2571
76e3520e 2572STATIC I32
40d049e4 2573S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2574 I32 *minlenp, I32 *deltap,
40d049e4
YO
2575 regnode *last,
2576 scan_data_t *data,
2577 I32 stopparen,
2578 U8* recursed,
2579 struct regnode_charclass_class *and_withp,
2580 U32 flags, U32 depth)
c277df42
IZ
2581 /* scanp: Start here (read-write). */
2582 /* deltap: Write maxlen-minlen here. */
2583 /* last: Stop before this one. */
40d049e4
YO
2584 /* data: string data about the pattern */
2585 /* stopparen: treat close N as END */
2586 /* recursed: which subroutines have we recursed into */
2587 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2588{
97aff369 2589 dVAR;
c277df42
IZ
2590 I32 min = 0, pars = 0, code;
2591 regnode *scan = *scanp, *next;
2592 I32 delta = 0;
2593 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2594 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2595 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2596 scan_data_t data_fake;
a3621e74 2597 SV *re_trie_maxbuff = NULL;
786e8c11 2598 regnode *first_non_open = scan;
e2e6a0f1 2599 I32 stopmin = I32_MAX;
8aa23a47 2600 scan_frame *frame = NULL;
a3621e74 2601 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2602
7918f24d
NC
2603 PERL_ARGS_ASSERT_STUDY_CHUNK;
2604
13a24bad 2605#ifdef DEBUGGING
40d049e4 2606 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2607#endif
40d049e4 2608
786e8c11 2609 if ( depth == 0 ) {
40d049e4 2610 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2611 first_non_open=regnext(first_non_open);
2612 }
2613
b81d288d 2614
8aa23a47
YO
2615 fake_study_recurse:
2616 while ( scan && OP(scan) != END && scan < last ){
2617 /* Peephole optimizer: */
304ee84b 2618 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2619 DEBUG_PEEP("Peep",scan,depth);
2620 JOIN_EXACT(scan,&min,0);
2621
2622 /* Follow the next-chain of the current node and optimize
2623 away all the NOTHINGs from it. */
2624 if (OP(scan) != CURLYX) {
2625 const int max = (reg_off_by_arg[OP(scan)]
2626 ? I32_MAX
2627 /* I32 may be smaller than U16 on CRAYs! */
2628 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2629 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2630 int noff;
2631 regnode *n = scan;
2632
2633 /* Skip NOTHING and LONGJMP. */
2634 while ((n = regnext(n))
2635 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2636 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2637 && off + noff < max)
2638 off += noff;
2639 if (reg_off_by_arg[OP(scan)])
2640 ARG(scan) = off;
2641 else
2642 NEXT_OFF(scan) = off;
2643 }
a3621e74 2644
c277df42 2645
8aa23a47
YO
2646
2647 /* The principal pseudo-switch. Cannot be a switch, since we
2648 look into several different things. */
2649 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2650 || OP(scan) == IFTHEN) {
2651 next = regnext(scan);
2652 code = OP(scan);
2653 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2654
2655 if (OP(next) == code || code == IFTHEN) {
2656 /* NOTE - There is similar code to this block below for handling
2657 TRIE nodes on a re-study. If you change stuff here check there
2658 too. */
2659 I32 max1 = 0, min1 = I32_MAX, num = 0;
2660 struct regnode_charclass_class accum;
2661 regnode * const startbranch=scan;
2662
2663 if (flags & SCF_DO_SUBSTR)
304ee84b 2664 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2665 if (flags & SCF_DO_STCLASS)
2666 cl_init_zero(pRExC_state, &accum);
2667
2668 while (OP(scan) == code) {
2669 I32 deltanext, minnext, f = 0, fake;
2670 struct regnode_charclass_class this_class;
2671
2672 num++;
2673 data_fake.flags = 0;
2674 if (data) {
2675 data_fake.whilem_c = data->whilem_c;
2676 data_fake.last_closep = data->last_closep;
2677 }
2678 else
2679 data_fake.last_closep = &fake;
58e23c8d
YO
2680
2681 data_fake.pos_delta = delta;
8aa23a47
YO
2682 next = regnext(scan);
2683 scan = NEXTOPER(scan);
2684 if (code != BRANCH)
c277df42 2685 scan = NEXTOPER(scan);
8aa23a47
YO
2686 if (flags & SCF_DO_STCLASS) {
2687 cl_init(pRExC_state, &this_class);
2688 data_fake.start_class = &this_class;
2689 f = SCF_DO_STCLASS_AND;
58e23c8d 2690 }
8aa23a47
YO
2691 if (flags & SCF_WHILEM_VISITED_POS)
2692 f |= SCF_WHILEM_VISITED_POS;
2693
2694 /* we suppose the run is continuous, last=next...*/
2695 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2696 next, &data_fake,
2697 stopparen, recursed, NULL, f,depth+1);
2698 if (min1 > minnext)
2699 min1 = minnext;
2700 if (max1 < minnext + deltanext)
2701 max1 = minnext + deltanext;
2702 if (deltanext == I32_MAX)
2703 is_inf = is_inf_internal = 1;
2704 scan = next;
2705 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2706 pars++;
2707 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2708 if ( stopmin > minnext)
2709 stopmin = min + min1;
2710 flags &= ~SCF_DO_SUBSTR;
2711 if (data)
2712 data->flags |= SCF_SEEN_ACCEPT;
2713 }
2714 if (data) {
2715 if (data_fake.flags & SF_HAS_EVAL)
2716 data->flags |= SF_HAS_EVAL;
2717 data->whilem_c = data_fake.whilem_c;
3dab1dad 2718 }
8aa23a47
YO
2719 if (flags & SCF_DO_STCLASS)
2720 cl_or(pRExC_state, &accum, &this_class);
2721 }
2722 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2723 min1 = 0;
2724 if (flags & SCF_DO_SUBSTR) {
2725 data->pos_min += min1;
2726 data->pos_delta += max1 - min1;
2727 if (max1 != min1 || is_inf)
2728 data->longest = &(data->longest_float);
2729 }
2730 min += min1;
2731 delta += max1 - min1;
2732 if (flags & SCF_DO_STCLASS_OR) {
2733 cl_or(pRExC_state, data->start_class, &accum);
2734 if (min1) {
2735 cl_and(data->start_class, and_withp);
2736 flags &= ~SCF_DO_STCLASS;
653099ff 2737 }
8aa23a47
YO
2738 }
2739 else if (flags & SCF_DO_STCLASS_AND) {
2740 if (min1) {
2741 cl_and(data->start_class, &accum);
2742 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2743 }
8aa23a47
YO
2744 else {
2745 /* Switch to OR mode: cache the old value of
2746 * data->start_class */
2747 INIT_AND_WITHP;
2748 StructCopy(data->start_class, and_withp,
2749 struct regnode_charclass_class);
2750 flags &= ~SCF_DO_STCLASS_AND;
2751 StructCopy(&accum, data->start_class,
2752 struct regnode_charclass_class);
2753 flags |= SCF_DO_STCLASS_OR;
2754 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2755 }
8aa23a47 2756 }
a3621e74 2757
8aa23a47
YO
2758 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2759 /* demq.
a3621e74 2760
8aa23a47
YO
2761 Assuming this was/is a branch we are dealing with: 'scan' now
2762 points at the item that follows the branch sequence, whatever
2763 it is. We now start at the beginning of the sequence and look
2764 for subsequences of
a3621e74 2765
8aa23a47
YO
2766 BRANCH->EXACT=>x1
2767 BRANCH->EXACT=>x2
2768 tail
a3621e74 2769
8aa23a47 2770 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2771
8aa23a47
YO
2772 If we can find such a subseqence we need to turn the first
2773 element into a trie and then add the subsequent branch exact
2774 strings to the trie.
a3621e74 2775
8aa23a47 2776 We have two cases
a3621e74 2777
3b753521 2778 1. patterns where the whole set of branches can be converted.
a3621e74 2779
8aa23a47 2780 2. patterns where only a subset can be converted.
a3621e74 2781
8aa23a47
YO
2782 In case 1 we can replace the whole set with a single regop
2783 for the trie. In case 2 we need to keep the start and end
3b753521 2784 branches so
a3621e74 2785
8aa23a47
YO
2786 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2787 becomes BRANCH TRIE; BRANCH X;
786e8c11 2788
8aa23a47
YO
2789 There is an additional case, that being where there is a
2790 common prefix, which gets split out into an EXACT like node
2791 preceding the TRIE node.
a3621e74 2792
8aa23a47
YO
2793 If x(1..n)==tail then we can do a simple trie, if not we make
2794 a "jump" trie, such that when we match the appropriate word
2795 we "jump" to the appopriate tail node. Essentailly we turn
2796 a nested if into a case structure of sorts.
b515a41d 2797
8aa23a47
YO
2798 */
2799
2800 int made=0;
2801 if (!re_trie_maxbuff) {
2802 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2803 if (!SvIOK(re_trie_maxbuff))
2804 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2805 }
2806 if ( SvIV(re_trie_maxbuff)>=0 ) {
2807 regnode *cur;
2808 regnode *first = (regnode *)NULL;
2809 regnode *last = (regnode *)NULL;
2810 regnode *tail = scan;
2811 U8 optype = 0;
2812 U32 count=0;
a3621e74
YO
2813
2814#ifdef DEBUGGING
8aa23a47 2815 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2816#endif
8aa23a47
YO
2817 /* var tail is used because there may be a TAIL
2818 regop in the way. Ie, the exacts will point to the
2819 thing following the TAIL, but the last branch will
2820 point at the TAIL. So we advance tail. If we
2821 have nested (?:) we may have to move through several
2822 tails.
2823 */
2824
2825 while ( OP( tail ) == TAIL ) {
2826 /* this is the TAIL generated by (?:) */
2827 tail = regnext( tail );
2828 }
a3621e74 2829
8aa23a47
YO
2830
2831 DEBUG_OPTIMISE_r({
2832 regprop(RExC_rx, mysv, tail );
2833 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2834 (int)depth * 2 + 2, "",
2835 "Looking for TRIE'able sequences. Tail node is: ",
2836 SvPV_nolen_const( mysv )
2837 );
2838 });
2839
2840 /*
2841
2842 step through the branches, cur represents each
2843 branch, noper is the first thing to be matched
2844 as part of that branch and noper_next is the
2845 regnext() of that node. if noper is an EXACT
2846 and noper_next is the same as scan (our current
2847 position in the regex) then the EXACT branch is
2848 a possible optimization target. Once we have
2849 two or more consequetive such branches we can
2850 create a trie of the EXACT's contents and stich
2851 it in place. If the sequence represents all of
2852 the branches we eliminate the whole thing and
2853 replace it with a single TRIE. If it is a
2854 subsequence then we need to stitch it in. This
2855 means the first branch has to remain, and needs
2856 to be repointed at the item on the branch chain
2857 following the last branch optimized. This could
2858 be either a BRANCH, in which case the
2859 subsequence is internal, or it could be the
2860 item following the branch sequence in which
2861 case the subsequence is at the end.
2862
2863 */
2864
2865 /* dont use tail as the end marker for this traverse */
2866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2867 regnode * const noper = NEXTOPER( cur );
b515a41d 2868#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2869 regnode * const noper_next = regnext( noper );
b515a41d
YO
2870#endif
2871
8aa23a47
YO
2872 DEBUG_OPTIMISE_r({
2873 regprop(RExC_rx, mysv, cur);
2874 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2875 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2876
2877 regprop(RExC_rx, mysv, noper);
2878 PerlIO_printf( Perl_debug_log, " -> %s",
2879 SvPV_nolen_const(mysv));
2880
2881 if ( noper_next ) {
2882 regprop(RExC_rx, mysv, noper_next );
2883 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2884 SvPV_nolen_const(mysv));
2885 }
2886 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2887 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2888 });
2889 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2890 : PL_regkind[ OP( noper ) ] == EXACT )
2891 || OP(noper) == NOTHING )
786e8c11 2892#ifdef NOJUMPTRIE
8aa23a47 2893 && noper_next == tail
786e8c11 2894#endif
8aa23a47
YO
2895 && count < U16_MAX)
2896 {
2897 count++;
2898 if ( !first || optype == NOTHING ) {
2899 if (!first) first = cur;
2900 optype = OP( noper );
2901 } else {
2902 last = cur;
2903 }
2904 } else {
a0a388a1 2905/*
0abd0d78
YO
2906 Currently we do not believe that the trie logic can
2907 handle case insensitive matching properly when the
2908 pattern is not unicode (thus forcing unicode semantics).
2909
2910 If/when this is fixed the following define can be swapped
2911 in below to fully enable trie logic.
2912
a0a388a1 2913#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2914
2915*/
2916#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2917
a0a388a1 2918 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2919 make_trie( pRExC_state,
2920 startbranch, first, cur, tail, count,
2921 optype, depth+1 );
2922 }
2923 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2924#ifdef NOJUMPTRIE
8aa23a47 2925 && noper_next == tail
786e8c11 2926#endif
8aa23a47
YO
2927 ){
2928 count = 1;
2929 first = cur;
2930 optype = OP( noper );
2931 } else {
2932 count = 0;
2933 first = NULL;
2934 optype = 0;
2935 }
2936 last = NULL;
2937 }
2938 }
2939 DEBUG_OPTIMISE_r({
2940 regprop(RExC_rx, mysv, cur);
2941 PerlIO_printf( Perl_debug_log,
2942 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2943 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2944
2945 });
a0a388a1
YO
2946
2947 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2948 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2949#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2950 if ( ((made == MADE_EXACT_TRIE &&
2951 startbranch == first)
2952 || ( first_non_open == first )) &&
2953 depth==0 ) {
2954 flags |= SCF_TRIE_RESTUDY;
2955 if ( startbranch == first
2956 && scan == tail )
2957 {
2958 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2959 }
2960 }
3dab1dad 2961#endif
8aa23a47
YO
2962 }
2963 }
2964
2965 } /* do trie */
2966
653099ff 2967 }
8aa23a47
YO
2968 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2969 scan = NEXTOPER(NEXTOPER(scan));
2970 } else /* single branch is optimized. */
2971 scan = NEXTOPER(scan);
2972 continue;
2973 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2974 scan_frame *newframe = NULL;
2975 I32 paren;
2976 regnode *start;
2977 regnode *end;
2978
2979 if (OP(scan) != SUSPEND) {
2980 /* set the pointer */
2981 if (OP(scan) == GOSUB) {
2982 paren = ARG(scan);
2983 RExC_recurse[ARG2L(scan)] = scan;
2984 start = RExC_open_parens[paren-1];
2985 end = RExC_close_parens[paren-1];
2986 } else {
2987 paren = 0;
f8fc2ecf 2988 start = RExC_rxi->program + 1;
8aa23a47
YO
2989 end = RExC_opend;
2990 }
2991 if (!recursed) {
2992 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2993 SAVEFREEPV(recursed);
2994 }
2995 if (!PAREN_TEST(recursed,paren+1)) {
2996 PAREN_SET(recursed,paren+1);
2997 Newx(newframe,1,scan_frame);
2998 } else {
2999 if (flags & SCF_DO_SUBSTR) {
304ee84b 3000 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3001 data->longest = &(data->longest_float);
3002 }
3003 is_inf = is_inf_internal = 1;
3004 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3005 cl_anything(pRExC_state, data->start_class);
3006 flags &= ~SCF_DO_STCLASS;
3007 }
3008 } else {
3009 Newx(newframe,1,scan_frame);
3010 paren = stopparen;
3011 start = scan+2;
3012 end = regnext(scan);
3013 }
3014 if (newframe) {
3015 assert(start);
3016 assert(end);
3017 SAVEFREEPV(newframe);
3018 newframe->next = regnext(scan);
3019 newframe->last = last;
3020 newframe->stop = stopparen;
3021 newframe->prev = frame;
3022
3023 frame = newframe;
3024 scan = start;
3025 stopparen = paren;
3026 last = end;
3027
3028 continue;
3029 }
3030 }
3031 else if (OP(scan) == EXACT) {
3032 I32 l = STR_LEN(scan);
3033 UV uc;
3034 if (UTF) {
3035 const U8 * const s = (U8*)STRING(scan);
3036 l = utf8_length(s, s + l);
3037 uc = utf8_to_uvchr(s, NULL);
3038 } else {
3039 uc = *((U8*)STRING(scan));
3040 }
3041 min += l;
3042 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3043 /* The code below prefers earlier match for fixed
3044 offset, later match for variable offset. */
3045 if (data->last_end == -1) { /* Update the start info. */
3046 data->last_start_min = data->pos_min;
3047 data->last_start_max = is_inf
3048 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3049 }
8aa23a47
YO
3050 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3051 if (UTF)
3052 SvUTF8_on(data->last_found);
3053 {
3054 SV * const sv = data->last_found;
3055 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3056 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3057 if (mg && mg->mg_len >= 0)
3058 mg->mg_len += utf8_length((U8*)STRING(scan),
3059 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3060 }
8aa23a47
YO
3061 data->last_end = data->pos_min + l;
3062 data->pos_min += l; /* As in the first entry. */
3063 data->flags &= ~SF_BEFORE_EOL;
3064 }
3065 if (flags & SCF_DO_STCLASS_AND) {
3066 /* Check whether it is compatible with what we know already! */
3067 int compat = 1;
3068
3069 if (uc >= 0x100 ||
3070 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3071 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3072 && (!(data->start_class->flags & ANYOF_FOLD)
3073 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3074 )
3075 compat = 0;
3076 ANYOF_CLASS_ZERO(data->start_class);
3077 ANYOF_BITMAP_ZERO(data->start_class);
3078 if (compat)
3079 ANYOF_BITMAP_SET(data->start_class, uc);
3080 data->start_class->flags &= ~ANYOF_EOS;
3081 if (uc < 0x100)
3082 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3083 }
3084 else if (flags & SCF_DO_STCLASS_OR) {
3085 /* false positive possible if the class is case-folded */
3086 if (uc < 0x100)
3087 ANYOF_BITMAP_SET(data->start_class, uc);
3088 else
3089 data->start_class->flags |= ANYOF_UNICODE_ALL;
3090 data->start_class->flags &= ~ANYOF_EOS;
3091 cl_and(data->start_class, and_withp);
3092 }
3093 flags &= ~SCF_DO_STCLASS;
3094 }
3095 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3096 I32 l = STR_LEN(scan);
3097 UV uc = *((U8*)STRING(scan));
3098
3099 /* Search for fixed substrings supports EXACT only. */
3100 if (flags & SCF_DO_SUBSTR) {
3101 assert(data);
304ee84b 3102 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3103 }
3104 if (UTF) {
3105 const U8 * const s = (U8 *)STRING(scan);
3106 l = utf8_length(s, s + l);
3107 uc = utf8_to_uvchr(s, NULL);
3108 }
3109 min += l;
3110 if (flags & SCF_DO_SUBSTR)
3111 data->pos_min += l;
3112 if (flags & SCF_DO_STCLASS_AND) {
3113 /* Check whether it is compatible with what we know already! */
3114 int compat = 1;
3115
3116 if (uc >= 0x100 ||
3117 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3118 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3119 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3120 compat = 0;
3121 ANYOF_CLASS_ZERO(data->start_class);
3122 ANYOF_BITMAP_ZERO(data->start_class);
3123 if (compat) {
3124 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3125 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3126 data->start_class->flags |= ANYOF_FOLD;
3127 if (OP(scan) == EXACTFL)
3128 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3129 }
8aa23a47
YO
3130 }
3131 else if (flags & SCF_DO_STCLASS_OR) {
3132 if (data->start_class->flags & ANYOF_FOLD) {
3133 /* false positive possible if the class is case-folded.
3134 Assume that the locale settings are the same... */
1aa99e6b
IH
3135 if (uc < 0x100)
3136 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3137 data->start_class->flags &= ~ANYOF_EOS;
3138 }
8aa23a47 3139 cl_and(data->start_class, and_withp);
653099ff 3140 }
8aa23a47
YO
3141 flags &= ~SCF_DO_STCLASS;
3142 }
e52fc539 3143 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3144 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3145 I32 f = flags, pos_before = 0;
3146 regnode * const oscan = scan;
3147 struct regnode_charclass_class this_class;
3148 struct regnode_charclass_class *oclass = NULL;
3149 I32 next_is_eval = 0;
3150
3151 switch (PL_regkind[OP(scan)]) {
3152 case WHILEM: /* End of (?:...)* . */
3153 scan = NEXTOPER(scan);
3154 goto finish;
3155 case PLUS:
3156 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3157 next = NEXTOPER(scan);
3158 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3159 mincount = 1;
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3163 goto do_curly;
3164 }
3165 }
3166 if (flags & SCF_DO_SUBSTR)
3167 data->pos_min++;
3168 min++;
3169 /* Fall through. */
3170 case STAR:
3171 if (flags & SCF_DO_STCLASS) {
3172 mincount = 0;
3173 maxcount = REG_INFTY;
3174 next = regnext(scan);
3175 scan = NEXTOPER(scan);
3176 goto do_curly;
3177 }
3178 is_inf = is_inf_internal = 1;
3179 scan = regnext(scan);
c277df42 3180 if (flags & SCF_DO_SUBSTR) {
304ee84b 3181 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3182 data->longest = &(data->longest_float);
c277df42 3183 }
8aa23a47
YO
3184 goto optimize_curly_tail;
3185 case CURLY:
3186 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3187 && (scan->flags == stopparen))
3188 {
3189 mincount = 1;
3190 maxcount = 1;
3191 } else {
3192 mincount = ARG1(scan);
3193 maxcount = ARG2(scan);
653099ff 3194 }
8aa23a47
YO
3195 next = regnext(scan);
3196 if (OP(scan) == CURLYX) {
3197 I32 lp = (data ? *(data->last_closep) : 0);
3198 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3199 }
8aa23a47
YO
3200 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3201 next_is_eval = (OP(scan) == EVAL);
3202 do_curly:
3203 if (flags & SCF_DO_SUBSTR) {
304ee84b 3204 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3205 pos_before = data->pos_min;
b45f050a 3206 }
8aa23a47
YO
3207 if (data) {
3208 fl = data->flags;
3209 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3210 if (is_inf)
3211 data->flags |= SF_IS_INF;
3212 }
3213 if (flags & SCF_DO_STCLASS) {
3214 cl_init(pRExC_state, &this_class);
3215 oclass = data->start_class;
3216 data->start_class = &this_class;
3217 f |= SCF_DO_STCLASS_AND;
3218 f &= ~SCF_DO_STCLASS_OR;
3219 }
3220 /* These are the cases when once a subexpression
3221 fails at a particular position, it cannot succeed
3222 even after backtracking at the enclosing scope.
b9a59e08 3223
8aa23a47
YO
3224 XXXX what if minimal match and we are at the
3225 initial run of {n,m}? */
3226 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3227 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3228
8aa23a47
YO
3229 /* This will finish on WHILEM, setting scan, or on NULL: */
3230 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3231 last, data, stopparen, recursed, NULL,
3232 (mincount == 0
3233 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3234
8aa23a47
YO
3235 if (flags & SCF_DO_STCLASS)
3236 data->start_class = oclass;
3237 if (mincount == 0 || minnext == 0) {
3238 if (flags & SCF_DO_STCLASS_OR) {
3239 cl_or(pRExC_state, data->start_class, &this_class);
3240 }
3241 else if (flags & SCF_DO_STCLASS_AND) {
3242 /* Switch to OR mode: cache the old value of
3243 * data->start_class */
3244 INIT_AND_WITHP;
3245 StructCopy(data->start_class, and_withp,
3246 struct regnode_charclass_class);
3247 flags &= ~SCF_DO_STCLASS_AND;
3248 StructCopy(&this_class, data->start_class,
3249 struct regnode_charclass_class);
3250 flags |= SCF_DO_STCLASS_OR;
3251 data->start_class->flags |= ANYOF_EOS;
3252 }
3253 } else { /* Non-zero len */
3254 if (flags & SCF_DO_STCLASS_OR) {
3255 cl_or(pRExC_state, data->start_class, &this_class);
3256 cl_and(data->start_class, and_withp);
3257 }
3258 else if (flags & SCF_DO_STCLASS_AND)
3259 cl_and(data->start_class, &this_class);
3260 flags &= ~SCF_DO_STCLASS;
3261 }
3262 if (!scan) /* It was not CURLYX, but CURLY. */
3263 scan = next;
3264 if ( /* ? quantifier ok, except for (?{ ... }) */
3265 (next_is_eval || !(mincount == 0 && maxcount == 1))
3266 && (minnext == 0) && (deltanext == 0)
3267 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3268 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3269 {
668c081a
NC
3270 ckWARNreg(RExC_parse,
3271 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3272 }
3273
3274 min += minnext * mincount;
3275 is_inf_internal |= ((maxcount == REG_INFTY
3276 && (minnext + deltanext) > 0)
3277 || deltanext == I32_MAX);
3278 is_inf |= is_inf_internal;
3279 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3280
3281 /* Try powerful optimization CURLYX => CURLYN. */
3282 if ( OP(oscan) == CURLYX && data
3283 && data->flags & SF_IN_PAR
3284 && !(data->flags & SF_HAS_EVAL)
3285 && !deltanext && minnext == 1 ) {
3286 /* Try to optimize to CURLYN. */
3287 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3288 regnode * const nxt1 = nxt;
497b47a8 3289#ifdef DEBUGGING
8aa23a47 3290 regnode *nxt2;
497b47a8 3291#endif
c277df42 3292
8aa23a47
YO
3293 /* Skip open. */
3294 nxt = regnext(nxt);
e52fc539 3295 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3296 && !(PL_regkind[OP(nxt)] == EXACT
3297 && STR_LEN(nxt) == 1))
3298 goto nogo;
497b47a8 3299#ifdef DEBUGGING
8aa23a47 3300 nxt2 = nxt;
497b47a8 3301#endif
8aa23a47
YO
3302 nxt = regnext(nxt);
3303 if (OP(nxt) != CLOSE)
3304 goto nogo;
3305 if (RExC_open_parens) {
3306 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3307 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3308 }
3309 /* Now we know that nxt2 is the only contents: */
3310 oscan->flags = (U8)ARG(nxt);
3311 OP(oscan) = CURLYN;
3312 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3313
c277df42 3314#ifdef DEBUGGING
8aa23a47 3315 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3316 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3317 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3318 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3319 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3320 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3321#endif
8aa23a47
YO
3322 }
3323 nogo:
3324
3325 /* Try optimization CURLYX => CURLYM. */
3326 if ( OP(oscan) == CURLYX && data
3327 && !(data->flags & SF_HAS_PAR)
3328 && !(data->flags & SF_HAS_EVAL)
3329 && !deltanext /* atom is fixed width */
3330 && minnext != 0 /* CURLYM can't handle zero width */
3331 ) {
3332 /* XXXX How to optimize if data == 0? */
3333 /* Optimize to a simpler form. */
3334 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3335 regnode *nxt2;
3336
3337 OP(oscan) = CURLYM;
3338 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3339 && (OP(nxt2) != WHILEM))
3340 nxt = nxt2;
3341 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3342 /* Need to optimize away parenths. */
b3c0965f 3343 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3344 /* Set the parenth number. */
3345 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3346
8aa23a47
YO
3347 oscan->flags = (U8)ARG(nxt);
3348 if (RExC_open_parens) {
3349 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3350 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3351 }
8aa23a47
YO
3352 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3353 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3354
c277df42 3355#ifdef DEBUGGING
8aa23a47
YO
3356 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3357 OP(nxt + 1) = OPTIMIZED; /* was count. */
3358 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3359 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3360#endif
c277df42 3361#if 0
8aa23a47
YO
3362 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3363 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3364 if (nnxt == nxt) {
3365 if (reg_off_by_arg[OP(nxt1)])
3366 ARG_SET(nxt1, nxt2 - nxt1);
3367 else if (nxt2 - nxt1 < U16_MAX)
3368 NEXT_OFF(nxt1) = nxt2 - nxt1;
3369 else
3370 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3371 }
8aa23a47 3372 nxt1 = nnxt;
c277df42 3373 }
5d1c421c 3374#endif
8aa23a47
YO
3375 /* Optimize again: */
3376 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3377 NULL, stopparen, recursed, NULL, 0,depth+1);
3378 }
3379 else
3380 oscan->flags = 0;
3381 }
3382 else if ((OP(oscan) == CURLYX)
3383 && (flags & SCF_WHILEM_VISITED_POS)
3384 /* See the comment on a similar expression above.
3b753521 3385 However, this time it's not a subexpression
8aa23a47
YO
3386 we care about, but the expression itself. */
3387 && (maxcount == REG_INFTY)
3388 && data && ++data->whilem_c < 16) {
3389 /* This stays as CURLYX, we can put the count/of pair. */
3390 /* Find WHILEM (as in regexec.c) */
3391 regnode *nxt = oscan + NEXT_OFF(oscan);
3392
3393 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3394 nxt += ARG(nxt);
3395 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3396 | (RExC_whilem_seen << 4)); /* On WHILEM */
3397 }
3398 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3399 pars++;
3400 if (flags & SCF_DO_SUBSTR) {
3401 SV *last_str = NULL;
3402 int counted = mincount != 0;
a0ed51b3 3403
8aa23a47
YO
3404 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3405#if defined(SPARC64_GCC_WORKAROUND)
3406 I32 b = 0;
3407 STRLEN l = 0;
3408 const char *s = NULL;
3409 I32 old = 0;
b515a41d 3410
8aa23a47
YO
3411 if (pos_before >= data->last_start_min)
3412 b = pos_before;
3413 else
3414 b = data->last_start_min;
b515a41d 3415
8aa23a47
YO
3416 l = 0;
3417 s = SvPV_const(data->last_found, l);
3418 old = b - data->last_start_min;
3419
3420#else
3421 I32 b = pos_before >= data->last_start_min
3422 ? pos_before : data->last_start_min;
3423 STRLEN l;
3424 const char * const s = SvPV_const(data->last_found, l);
3425 I32 old = b - data->last_start_min;
3426#endif
3427
3428 if (UTF)
3429 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3430 l -= old;
3431 /* Get the added string: */
740cce10 3432 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3433 if (deltanext == 0 && pos_before == b) {
3434 /* What was added is a constant string */
3435 if (mincount > 1) {
3436 SvGROW(last_str, (mincount * l) + 1);
3437 repeatcpy(SvPVX(last_str) + l,
3438 SvPVX_const(last_str), l, mincount - 1);
3439 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3440 /* Add additional parts. */
3441 SvCUR_set(data->last_found,
3442 SvCUR(data->last_found) - l);
3443 sv_catsv(data->last_found, last_str);
3444 {
3445 SV * sv = data->last_found;
3446 MAGIC *mg =
3447 SvUTF8(sv) && SvMAGICAL(sv) ?
3448 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3449 if (mg && mg->mg_len >= 0)
bd94e887 3450 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3451 }
8aa23a47 3452 data->last_end += l * (mincount - 1);
b515a41d 3453 }
8aa23a47
YO
3454 } else {
3455 /* start offset must point into the last copy */
3456 data->last_start_min += minnext * (mincount - 1);
3457 data->last_start_max += is_inf ? I32_MAX
3458 : (maxcount - 1) * (minnext + data->pos_delta);
3459 }
c277df42 3460 }
8aa23a47
YO
3461 /* It is counted once already... */
3462 data->pos_min += minnext * (mincount - counted);
3463 data->pos_delta += - counted * deltanext +
3464 (minnext + deltanext) * maxcount - minnext * mincount;
3465 if (mincount != maxcount) {
3466 /* Cannot extend fixed substrings found inside
3467 the group. */
304ee84b 3468 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3469 if (mincount && last_str) {
3470 SV * const sv = data->last_found;
3471 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3472 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3473
3474 if (mg)
3475 mg->mg_len = -1;
3476 sv_setsv(sv, last_str);
3477 data->last_end = data->pos_min;
3478 data->last_start_min =
3479 data->pos_min - CHR_SVLEN(last_str);
3480 data->last_start_max = is_inf
3481 ? I32_MAX
3482 : data->pos_min + data->pos_delta
3483 - CHR_SVLEN(last_str);
3484 }
3485 data->longest = &(data->longest_float);
3486 }
3487 SvREFCNT_dec(last_str);
c277df42 3488 }
8aa23a47
YO
3489 if (data && (fl & SF_HAS_EVAL))
3490 data->flags |= SF_HAS_EVAL;
3491 optimize_curly_tail:
3492 if (OP(oscan) != CURLYX) {
3493 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3494 && NEXT_OFF(next))
3495 NEXT_OFF(oscan) += NEXT_OFF(next);
3496 }
3497 continue;
3498 default: /* REF and CLUMP only? */
3499 if (flags & SCF_DO_SUBSTR) {
304ee84b 3500 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3501 data->longest = &(data->longest_float);
3502 }
3503 is_inf = is_inf_internal = 1;
3504 if (flags & SCF_DO_STCLASS_OR)
3505 cl_anything(pRExC_state, data->start_class);
3506 flags &= ~SCF_DO_STCLASS;
3507 break;
c277df42 3508 }
8aa23a47 3509 }
e1d1eefb
YO
3510 else if (OP(scan) == LNBREAK) {
3511 if (flags & SCF_DO_STCLASS) {
3512 int value = 0;
3513 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3514 if (flags & SCF_DO_STCLASS_AND) {
3515 for (value = 0; value < 256; value++)
e64b1bd1 3516 if (!is_VERTWS_cp(value))
b9a59e08
KW
3517 ANYOF_BITMAP_CLEAR(data->start_class, value);
3518 }
3519 else {
e1d1eefb 3520 for (value = 0; value < 256; value++)
e64b1bd1 3521 if (is_VERTWS_cp(value))
b9a59e08
KW
3522 ANYOF_BITMAP_SET(data->start_class, value);
3523 }
e1d1eefb
YO
3524 if (flags & SCF_DO_STCLASS_OR)
3525 cl_and(data->start_class, and_withp);
3526 flags &= ~SCF_DO_STCLASS;
3527 }
3528 min += 1;
f9a79580 3529 delta += 1;
e1d1eefb
YO
3530 if (flags & SCF_DO_SUBSTR) {
3531 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3532 data->pos_min += 1;
f9a79580 3533 data->pos_delta += 1;
e1d1eefb
YO
3534 data->longest = &(data->longest_float);
3535 }
e1d1eefb 3536 }
f9a79580
RGS
3537 else if (OP(scan) == FOLDCHAR) {
3538 int d = ARG(scan)==0xDF ? 1 : 2;
3539 flags &= ~SCF_DO_STCLASS;
3540 min += 1;
3541 delta += d;
3542 if (flags & SCF_DO_SUBSTR) {
3543 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3544 data->pos_min += 1;
3545 data->pos_delta += d;
3546 data->longest = &(data->longest_float);
3547 }
3548 }
e52fc539 3549 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3550 int value = 0;
653099ff 3551
8aa23a47 3552 if (flags & SCF_DO_SUBSTR) {
304ee84b 3553 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3554 data->pos_min++;
3555 }
3556 min++;
3557 if (flags & SCF_DO_STCLASS) {
3558 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3559
8aa23a47
YO
3560 /* Some of the logic below assumes that switching
3561 locale on will only add false positives. */
3562 switch (PL_regkind[OP(scan)]) {
3563 case SANY:
3564 default:
3565 do_default:
3566 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3567 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3568 cl_anything(pRExC_state, data->start_class);
3569 break;
3570 case REG_ANY:
3571 if (OP(scan) == SANY)
3572 goto do_default;
3573 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3574 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3575 || (data->start_class->flags & ANYOF_CLASS));
3576 cl_anything(pRExC_state, data->start_class);
653099ff 3577 }
8aa23a47
YO
3578 if (flags & SCF_DO_STCLASS_AND || !value)
3579 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3580 break;
3581 case ANYOF:
3582 if (flags & SCF_DO_STCLASS_AND)
3583 cl_and(data->start_class,
3584 (struct regnode_charclass_class*)scan);
653099ff 3585 else
8aa23a47
YO
3586 cl_or(pRExC_state, data->start_class,
3587 (struct regnode_charclass_class*)scan);
3588 break;
3589 case ALNUM:
3590 if (flags & SCF_DO_STCLASS_AND) {
3591 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3593 if (FLAGS(scan) & USE_UNI) {
3594 for (value = 0; value < 256; value++) {
3595 if (!isWORDCHAR_L1(value)) {
3596 ANYOF_BITMAP_CLEAR(data->start_class, value);
3597 }
3598 }
3599 } else {
3600 for (value = 0; value < 256; value++) {
3601 if (!isALNUM(value)) {
3602 ANYOF_BITMAP_CLEAR(data->start_class, value);
3603 }
3604 }
3605 }
8aa23a47 3606 }
653099ff 3607 }
8aa23a47
YO
3608 else {
3609 if (data->start_class->flags & ANYOF_LOCALE)
3610 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3611 else if (FLAGS(scan) & USE_UNI) {
3612 for (value = 0; value < 256; value++) {
3613 if (isWORDCHAR_L1(value)) {
3614 ANYOF_BITMAP_SET(data->start_class, value);
3615 }
3616 }
3617 } else {
3618 for (value = 0; value < 256; value++) {
3619 if (isALNUM(value)) {
3620 ANYOF_BITMAP_SET(data->start_class, value);
3621 }
3622 }
3623 }
8aa23a47
YO
3624 }
3625 break;
3626 case ALNUML:
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (data->start_class->flags & ANYOF_LOCALE)
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3630 }
3631 else {
3632 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3633 data->start_class->flags |= ANYOF_LOCALE;
3634 }
3635 break;
3636 case NALNUM:
3637 if (flags & SCF_DO_STCLASS_AND) {
3638 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3639 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3640 if (FLAGS(scan) & USE_UNI) {
3641 for (value = 0; value < 256; value++) {
3642 if (isWORDCHAR_L1(value)) {
3643 ANYOF_BITMAP_CLEAR(data->start_class, value);
3644 }
3645 }
3646 } else {
3647 for (value = 0; value < 256; value++) {
3648 if (isALNUM(value)) {
3649 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 }
3651 }
3652 }
653099ff
GS
3653 }
3654 }
8aa23a47
YO
3655 else {
3656 if (data->start_class->flags & ANYOF_LOCALE)
3657 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3658 else {
3659 for (value = 0; value < 256; value++)
3660 if (!isALNUM(value))
b9a59e08 3661 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3662 }
653099ff 3663 }
8aa23a47
YO
3664 break;
3665 case NALNUML:
3666 if (flags & SCF_DO_STCLASS_AND) {
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3669 }
8aa23a47
YO
3670 else {
3671 data->start_class->flags |= ANYOF_LOCALE;
3672 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3673 }
3674 break;
3675 case SPACE:
3676 if (flags & SCF_DO_STCLASS_AND) {
3677 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3678 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3679 if (FLAGS(scan) & USE_UNI) {
3680 for (value = 0; value < 256; value++) {
3681 if (!isSPACE_L1(value)) {
3682 ANYOF_BITMAP_CLEAR(data->start_class, value);
3683 }
3684 }
3685 } else {
3686 for (value = 0; value < 256; value++) {
3687 if (!isSPACE(value)) {
3688 ANYOF_BITMAP_CLEAR(data->start_class, value);
3689 }
3690 }
3691 }
653099ff
GS
3692 }
3693 }
8aa23a47 3694 else {
a12cf05f 3695 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3696 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3697 }
3698 else if (FLAGS(scan) & USE_UNI) {
3699 for (value = 0; value < 256; value++) {
3700 if (isSPACE_L1(value)) {
3701 ANYOF_BITMAP_SET(data->start_class, value);
3702 }
3703 }
3704 } else {
3705 for (value = 0; value < 256; value++) {
3706 if (isSPACE(value)) {
3707 ANYOF_BITMAP_SET(data->start_class, value);
3708 }
3709 }
8aa23a47 3710 }
653099ff 3711 }
8aa23a47
YO
3712 break;
3713 case SPACEL:
3714 if (flags & SCF_DO_STCLASS_AND) {
3715 if (data->start_class->flags & ANYOF_LOCALE)
3716 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3717 }
3718 else {
3719 data->start_class->flags |= ANYOF_LOCALE;
3720 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3721 }
3722 break;
3723 case NSPACE:
3724 if (flags & SCF_DO_STCLASS_AND) {
3725 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3726 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3727 if (FLAGS(scan) & USE_UNI) {
3728 for (value = 0; value < 256; value++) {
3729 if (isSPACE_L1(value)) {
3730 ANYOF_BITMAP_CLEAR(data->start_class, value);
3731 }
3732 }
3733 } else {
3734 for (value = 0; value < 256; value++) {
3735 if (isSPACE(value)) {
3736 ANYOF_BITMAP_CLEAR(data->start_class, value);
3737 }
3738 }
3739 }
653099ff 3740 }
8aa23a47
YO
3741 }
3742 else {
3743 if (data->start_class->flags & ANYOF_LOCALE)
3744 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3745 else if (FLAGS(scan) & USE_UNI) {
3746 for (value = 0; value < 256; value++) {
3747 if (!isSPACE_L1(value)) {
3748 ANYOF_BITMAP_SET(data->start_class, value);
3749 }
3750 }
3751 }
3752 else {
3753 for (value = 0; value < 256; value++) {
3754 if (!isSPACE(value)) {
3755 ANYOF_BITMAP_SET(data->start_class, value);
3756 }
3757 }
3758 }
653099ff 3759 }
8aa23a47
YO
3760 break;
3761 case NSPACEL:
3762 if (flags & SCF_DO_STCLASS_AND) {
3763 if (data->start_class->flags & ANYOF_LOCALE) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3765 for (value = 0; value < 256; value++)
3766 if (!isSPACE(value))
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3768 }
653099ff 3769 }
8aa23a47
YO
3770 else {
3771 data->start_class->flags |= ANYOF_LOCALE;
3772 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3773 }
3774 break;
3775 case DIGIT:
3776 if (flags & SCF_DO_STCLASS_AND) {
3777 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3778 for (value = 0; value < 256; value++)
3779 if (!isDIGIT(value))
3780 ANYOF_BITMAP_CLEAR(data->start_class, value);
3781 }
3782 else {
3783 if (data->start_class->flags & ANYOF_LOCALE)
3784 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3785 else {
3786 for (value = 0; value < 256; value++)
3787 if (isDIGIT(value))
b9a59e08 3788 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3789 }
3790 }
3791 break;
3792 case NDIGIT:
3793 if (flags & SCF_DO_STCLASS_AND) {
3794 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3795 for (value = 0; value < 256; value++)
3796 if (isDIGIT(value))
3797 ANYOF_BITMAP_CLEAR(data->start_class, value);
3798 }
3799 else {
3800 if (data->start_class->flags & ANYOF_LOCALE)
3801 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3802 else {
3803 for (value = 0; value < 256; value++)
3804 if (!isDIGIT(value))
b9a59e08 3805 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3806 }
3807 }
8aa23a47 3808 break;
e1d1eefb
YO
3809 CASE_SYNST_FNC(VERTWS);
3810 CASE_SYNST_FNC(HORIZWS);
3811
8aa23a47
YO
3812 }
3813 if (flags & SCF_DO_STCLASS_OR)
3814 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3816 }
3817 }
3818 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3819 data->flags |= (OP(scan) == MEOL
3820 ? SF_BEFORE_MEOL
3821 : SF_BEFORE_SEOL);
3822 }
3823 else if ( PL_regkind[OP(scan)] == BRANCHJ
3824 /* Lookbehind, or need to calculate parens/evals/stclass: */
3825 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3826 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3827 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3828 || OP(scan) == UNLESSM )
3829 {
3830 /* Negative Lookahead/lookbehind
3831 In this case we can't do fixed string optimisation.
3832 */
1de06328 3833
8aa23a47
YO
3834 I32 deltanext, minnext, fake = 0;
3835 regnode *nscan;
3836 struct regnode_charclass_class intrnl;
3837 int f = 0;
1de06328 3838
8aa23a47
YO
3839 data_fake.flags = 0;
3840 if (data) {
3841 data_fake.whilem_c = data->whilem_c;
3842 data_fake.last_closep = data->last_closep;
c277df42 3843 }
8aa23a47
YO
3844 else
3845 data_fake.last_closep = &fake;
58e23c8d 3846 data_fake.pos_delta = delta;
8aa23a47
YO
3847 if ( flags & SCF_DO_STCLASS && !scan->flags
3848 && OP(scan) == IFMATCH ) { /* Lookahead */
3849 cl_init(pRExC_state, &intrnl);
3850 data_fake.start_class = &intrnl;
3851 f |= SCF_DO_STCLASS_AND;
3852 }
3853 if (flags & SCF_WHILEM_VISITED_POS)
3854 f |= SCF_WHILEM_VISITED_POS;
3855 next = regnext(scan);
3856 nscan = NEXTOPER(NEXTOPER(scan));
3857 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3858 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3859 if (scan->flags) {
3860 if (deltanext) {
58e23c8d 3861 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3862 }
3863 else if (minnext > (I32)U8_MAX) {
58e23c8d 3864 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3865 }
3866 scan->flags = (U8)minnext;
3867 }
3868 if (data) {
3869 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3870 pars++;
3871 if (data_fake.flags & SF_HAS_EVAL)
3872 data->flags |= SF_HAS_EVAL;
3873 data->whilem_c = data_fake.whilem_c;
3874 }
3875 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3876 if (flags & SCF_DO_STCLASS_OR) {
3877 /* OR before, AND after: ideally we would recurse with
3878 * data_fake to get the AND applied by study of the
3879 * remainder of the pattern, and then derecurse;
3880 * *** HACK *** for now just treat as "no information".
3881 * See [perl #56690].
3882 */
3883 cl_init(pRExC_state, data->start_class);
3884 } else {
3885 /* AND before and after: combine and continue */
3886 const int was = (data->start_class->flags & ANYOF_EOS);
3887
3888 cl_and(data->start_class, &intrnl);
3889 if (was)
3890 data->start_class->flags |= ANYOF_EOS;
3891 }
8aa23a47 3892 }
cb434fcc 3893 }
8aa23a47
YO
3894#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3895 else {
3896 /* Positive Lookahead/lookbehind
3897 In this case we can do fixed string optimisation,
3898 but we must be careful about it. Note in the case of
3899 lookbehind the positions will be offset by the minimum
3900 length of the pattern, something we won't know about
3901 until after the recurse.
3902 */
3903 I32 deltanext, fake = 0;
3904 regnode *nscan;
3905 struct regnode_charclass_class intrnl;
3906 int f = 0;
3907 /* We use SAVEFREEPV so that when the full compile
3908 is finished perl will clean up the allocated
3b753521 3909 minlens when it's all done. This way we don't
8aa23a47
YO
3910 have to worry about freeing them when we know
3911 they wont be used, which would be a pain.
3912 */
3913 I32 *minnextp;
3914 Newx( minnextp, 1, I32 );
3915 SAVEFREEPV(minnextp);
3916
3917 if (data) {
3918 StructCopy(data, &data_fake, scan_data_t);
3919 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3920 f |= SCF_DO_SUBSTR;
3921 if (scan->flags)
304ee84b 3922 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3923 data_fake.last_found=newSVsv(data->last_found);
3924 }
3925 }
3926 else
3927 data_fake.last_closep = &fake;
3928 data_fake.flags = 0;
58e23c8d 3929 data_fake.pos_delta = delta;
8aa23a47
YO
3930 if (is_inf)
3931 data_fake.flags |= SF_IS_INF;
3932 if ( flags & SCF_DO_STCLASS && !scan->flags
3933 && OP(scan) == IFMATCH ) { /* Lookahead */
3934 cl_init(pRExC_state, &intrnl);
3935 data_fake.start_class = &intrnl;
3936 f |= SCF_DO_STCLASS_AND;
3937 }
3938 if (flags & SCF_WHILEM_VISITED_POS)
3939 f |= SCF_WHILEM_VISITED_POS;
3940 next = regnext(scan);
3941 nscan = NEXTOPER(NEXTOPER(scan));
3942
3943 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3944 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3945 if (scan->flags) {
3946 if (deltanext) {
58e23c8d 3947 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3948 }
3949 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3950 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3951 }
3952 scan->flags = (U8)*minnextp;
3953 }
3954
3955 *minnextp += min;
3956
3957 if (f & SCF_DO_STCLASS_AND) {
3958 const int was = (data->start_class->flags & ANYOF_EOS);
3959
3960 cl_and(data->start_class, &intrnl);
3961 if (was)
3962 data->start_class->flags |= ANYOF_EOS;
3963 }
3964 if (data) {
3965 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3966 pars++;
3967 if (data_fake.flags & SF_HAS_EVAL)
3968 data->flags |= SF_HAS_EVAL;
3969 data->whilem_c = data_fake.whilem_c;
3970 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3971 if (RExC_rx->minlen<*minnextp)
3972 RExC_rx->minlen=*minnextp;
304ee84b 3973 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3974 SvREFCNT_dec(data_fake.last_found);
3975
3976 if ( data_fake.minlen_fixed != minlenp )
3977 {
3978 data->offset_fixed= data_fake.offset_fixed;
3979 data->minlen_fixed= data_fake.minlen_fixed;
3980 data->lookbehind_fixed+= scan->flags;
3981 }
3982 if ( data_fake.minlen_float != minlenp )
3983 {
3984 data->minlen_float= data_fake.minlen_float;
3985 data->offset_float_min=data_fake.offset_float_min;
3986 data->offset_float_max=data_fake.offset_float_max;
3987 data->lookbehind_float+= scan->flags;
3988 }
3989 }
3990 }
3991
3992
40d049e4 3993 }
8aa23a47
YO
3994#endif
3995 }
3996 else if (OP(scan) == OPEN) {
3997 if (stopparen != (I32)ARG(scan))
3998 pars++;
3999 }
4000 else if (OP(scan) == CLOSE) {
4001 if (stopparen == (I32)ARG(scan)) {
4002 break;
4003 }
4004 if ((I32)ARG(scan) == is_par) {
4005 next = regnext(scan);
b515a41d 4006
8aa23a47
YO
4007 if ( next && (OP(next) != WHILEM) && next < last)
4008 is_par = 0; /* Disable optimization */
40d049e4 4009 }
8aa23a47
YO
4010 if (data)
4011 *(data->last_closep) = ARG(scan);
4012 }
4013 else if (OP(scan) == EVAL) {
c277df42
IZ
4014 if (data)
4015 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4016 }
4017 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4018 if (flags & SCF_DO_SUBSTR) {
304ee84b 4019 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4020 flags &= ~SCF_DO_SUBSTR;
40d049e4 4021 }
8aa23a47
YO
4022 if (data && OP(scan)==ACCEPT) {
4023 data->flags |= SCF_SEEN_ACCEPT;
4024 if (stopmin > min)
4025 stopmin = min;
e2e6a0f1 4026 }
8aa23a47
YO
4027 }
4028 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4029 {
0f5d15d6 4030 if (flags & SCF_DO_SUBSTR) {
304ee84b 4031 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4032 data->longest = &(data->longest_float);
4033 }
4034 is_inf = is_inf_internal = 1;
653099ff 4035 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4036 cl_anything(pRExC_state, data->start_class);
96776eda 4037 flags &= ~SCF_DO_STCLASS;
8aa23a47 4038 }
58e23c8d 4039 else if (OP(scan) == GPOS) {
bbe252da 4040 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4041 !(delta || is_inf || (data && data->pos_delta)))
4042 {
bbe252da
YO
4043 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4044 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4045 if (RExC_rx->gofs < (U32)min)
4046 RExC_rx->gofs = min;
4047 } else {
bbe252da 4048 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4049 RExC_rx->gofs = 0;
4050 }
4051 }
786e8c11 4052#ifdef TRIE_STUDY_OPT
40d049e4 4053#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4054 else if (PL_regkind[OP(scan)] == TRIE) {
4055 /* NOTE - There is similar code to this block above for handling
4056 BRANCH nodes on the initial study. If you change stuff here
4057 check there too. */
4058 regnode *trie_node= scan;
4059 regnode *tail= regnext(scan);
f8fc2ecf 4060 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4061 I32 max1 = 0, min1 = I32_MAX;
4062 struct regnode_charclass_class accum;
4063
4064 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4065 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4066 if (flags & SCF_DO_STCLASS)
4067 cl_init_zero(pRExC_state, &accum);
4068
4069 if (!trie->jump) {
4070 min1= trie->minlen;
4071 max1= trie->maxlen;
4072 } else {
4073 const regnode *nextbranch= NULL;
4074 U32 word;
4075
4076 for ( word=1 ; word <= trie->wordcount ; word++)
4077 {
4078 I32 deltanext=0, minnext=0, f = 0, fake;
4079 struct regnode_charclass_class this_class;
4080
4081 data_fake.flags = 0;
4082 if (data) {
4083 data_fake.whilem_c = data->whilem_c;
4084 data_fake.last_closep = data->last_closep;
4085 }
4086 else
4087 data_fake.last_closep = &fake;
58e23c8d 4088 data_fake.pos_delta = delta;
8aa23a47
YO
4089 if (flags & SCF_DO_STCLASS) {
4090 cl_init(pRExC_state, &this_class);
4091 data_fake.start_class = &this_class;
4092 f = SCF_DO_STCLASS_AND;
4093 }
4094 if (flags & SCF_WHILEM_VISITED_POS)
4095 f |= SCF_WHILEM_VISITED_POS;
4096
4097 if (trie->jump[word]) {
4098 if (!nextbranch)
4099 nextbranch = trie_node + trie->jump[0];
4100 scan= trie_node + trie->jump[word];
4101 /* We go from the jump point to the branch that follows
4102 it. Note this means we need the vestigal unused branches
4103 even though they arent otherwise used.
4104 */
4105 minnext = study_chunk(pRExC_state, &scan, minlenp,
4106 &deltanext, (regnode *)nextbranch, &data_fake,
4107 stopparen, recursed, NULL, f,depth+1);
4108 }
4109 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4110 nextbranch= regnext((regnode*)nextbranch);
4111
4112 if (min1 > (I32)(minnext + trie->minlen))
4113 min1 = minnext + trie->minlen;
4114 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4115 max1 = minnext + deltanext + trie->maxlen;
4116 if (deltanext == I32_MAX)
4117 is_inf = is_inf_internal = 1;
4118
4119 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4120 pars++;
4121 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4122 if ( stopmin > min + min1)
4123 stopmin = min + min1;
4124 flags &= ~SCF_DO_SUBSTR;
4125 if (data)
4126 data->flags |= SCF_SEEN_ACCEPT;
4127 }
4128 if (data) {
4129 if (data_fake.flags & SF_HAS_EVAL)
4130 data->flags |= SF_HAS_EVAL;
4131 data->whilem_c = data_fake.whilem_c;
4132 }
4133 if (flags & SCF_DO_STCLASS)
4134 cl_or(pRExC_state, &accum, &this_class);
4135 }
4136 }
4137 if (flags & SCF_DO_SUBSTR) {
4138 data->pos_min += min1;
4139 data->pos_delta += max1 - min1;
4140 if (max1 != min1 || is_inf)
4141 data->longest = &(data->longest_float);
4142 }
4143 min += min1;
4144 delta += max1 - min1;
4145 if (flags & SCF_DO_STCLASS_OR) {
4146 cl_or(pRExC_state, data->start_class, &accum);
4147 if (min1) {
4148 cl_and(data->start_class, and_withp);
4149 flags &= ~SCF_DO_STCLASS;
4150 }
4151 }
4152 else if (flags & SCF_DO_STCLASS_AND) {
4153 if (min1) {
4154 cl_and(data->start_class, &accum);
4155 flags &= ~SCF_DO_STCLASS;
4156 }
4157 else {
4158 /* Switch to OR mode: cache the old value of
4159 * data->start_class */
4160 INIT_AND_WITHP;
4161 StructCopy(data->start_class, and_withp,
4162 struct regnode_charclass_class);
4163 flags &= ~SCF_DO_STCLASS_AND;
4164 StructCopy(&accum, data->start_class,
4165 struct regnode_charclass_class);
4166 flags |= SCF_DO_STCLASS_OR;
4167 data->start_class->flags |= ANYOF_EOS;
4168 }
4169 }
4170 scan= tail;
4171 continue;
4172 }
786e8c11 4173#else
8aa23a47 4174 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4175 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4176 U8*bang=NULL;
4177
4178 min += trie->minlen;
4179 delta += (trie->maxlen - trie->minlen);
4180 flags &= ~SCF_DO_STCLASS; /* xxx */
4181 if (flags & SCF_DO_SUBSTR) {
304ee84b 4182 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4183 data->pos_min += trie->minlen;
4184 data->pos_delta += (trie->maxlen - trie->minlen);
4185 if (trie->maxlen != trie->minlen)
4186 data->longest = &(data->longest_float);
4187 }
4188 if (trie->jump) /* no more substrings -- for now /grr*/
4189 flags &= ~SCF_DO_SUBSTR;
b515a41d 4190 }
8aa23a47
YO
4191#endif /* old or new */
4192#endif /* TRIE_STUDY_OPT */
e1d1eefb 4193
8aa23a47
YO
4194 /* Else: zero-length, ignore. */
4195 scan = regnext(scan);
4196 }
4197 if (frame) {
4198 last = frame->last;
4199 scan = frame->next;
4200 stopparen = frame->stop;
4201 frame = frame->prev;
4202 goto fake_study_recurse;
c277df42
IZ
4203 }
4204
4205 finish:
8aa23a47 4206 assert(!frame);
304ee84b 4207 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4208
c277df42 4209 *scanp = scan;
aca2d497 4210 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4211 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4212 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4213 if (is_par > (I32)U8_MAX)
c277df42
IZ
4214 is_par = 0;
4215 if (is_par && pars==1 && data) {
4216 data->flags |= SF_IN_PAR;
4217 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4218 }
4219 else if (pars && data) {
c277df42
IZ
4220 data->flags |= SF_HAS_PAR;
4221 data->flags &= ~SF_IN_PAR;
4222 }
653099ff 4223 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4224 cl_and(data->start_class, and_withp);
786e8c11
YO
4225 if (flags & SCF_TRIE_RESTUDY)
4226 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4227
304ee84b 4228 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4229
e2e6a0f1 4230 return min < stopmin ? min : stopmin;
c277df42
IZ
4231}
4232
2eccd3b2
NC
4233STATIC U32
4234S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4235{
4a4e7719
NC
4236 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4237
7918f24d
NC
4238 PERL_ARGS_ASSERT_ADD_DATA;
4239
4a4e7719
NC
4240 Renewc(RExC_rxi->data,
4241 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4242 char, struct reg_data);
4243 if(count)
f8fc2ecf 4244 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4245 else
f8fc2ecf 4246 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4247 RExC_rxi->data->count = count + n;
4248 Copy(s, RExC_rxi->data->what + count, n, U8);
4249 return count;
c277df42
IZ
4250}
4251
f8149455 4252/*XXX: todo make this not included in a non debugging perl */
76234dfb 4253#ifndef PERL_IN_XSUB_RE
d88dccdf 4254void
864dbfa3 4255Perl_reginitcolors(pTHX)
d88dccdf 4256{
97aff369 4257 dVAR;
1df70142 4258 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4259 if (s) {
1df70142
AL
4260 char *t = savepv(s);
4261 int i = 0;
4262 PL_colors[0] = t;
d88dccdf 4263 while (++i < 6) {
1df70142
AL
4264 t = strchr(t, '\t');
4265 if (t) {
4266 *t = '\0';
4267 PL_colors[i] = ++t;
d88dccdf
IZ
4268 }
4269 else
1df70142 4270 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4271 }
4272 } else {
1df70142 4273 int i = 0;
b81d288d 4274 while (i < 6)
06b5626a 4275 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4276 }
4277 PL_colorset = 1;
4278}
76234dfb 4279#endif
8615cb43 4280
07be1b83 4281
786e8c11
YO
4282#ifdef TRIE_STUDY_OPT
4283#define CHECK_RESTUDY_GOTO \
4284 if ( \
4285 (data.flags & SCF_TRIE_RESTUDY) \
4286 && ! restudied++ \
4287 ) goto reStudy
4288#else
4289#define CHECK_RESTUDY_GOTO
4290#endif
f9f4320a 4291
a687059c 4292/*
e50aee73 4293 - pregcomp - compile a regular expression into internal code
a687059c
LW
4294 *
4295 * We can't allocate space until we know how big the compiled form will be,
4296 * but we can't compile it (and thus know how big it is) until we've got a
4297 * place to put the code. So we cheat: we compile it twice, once with code
4298 * generation turned off and size counting turned on, and once "for real".
4299 * This also means that we don't allocate space until we are sure that the
4300 * thing really will compile successfully, and we never have to move the
4301 * code and thus invalidate pointers into it. (Note that it has to be in
4302 * one piece because free() must be able to free it all.) [NB: not true in perl]
4303 *
4304 * Beware that the optimization-preparation code in here knows about some
4305 * of the structure of the compiled regexp. [I'll say.]
4306 */
b9b4dddf
YO
4307
4308
4309
f9f4320a 4310#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4311#define RE_ENGINE_PTR &PL_core_reg_engine
4312#else
f9f4320a
YO
4313extern const struct regexp_engine my_reg_engine;
4314#define RE_ENGINE_PTR &my_reg_engine
4315#endif
6d5c990f
RGS
4316
4317#ifndef PERL_IN_XSUB_RE
3ab4a224 4318REGEXP *
1593ad57 4319Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4320{
97aff369 4321 dVAR;
6d5c990f 4322 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4323
4324 PERL_ARGS_ASSERT_PREGCOMP;
4325
f9f4320a
YO
4326 /* Dispatch a request to compile a regexp to correct
4327 regexp engine. */
f9f4320a
YO
4328 if (table) {
4329 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4330 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4331 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4332 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4333 DEBUG_COMPILE_r({
8d8756e7 4334 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4335 SvIV(*ptr));
4336 });
3ab4a224 4337 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4338 }
b9b4dddf 4339 }
3ab4a224 4340 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4341}
6d5c990f 4342#endif
2a5d9b1d 4343
3ab4a224 4344REGEXP *
1593ad57 4345Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4346{
4347 dVAR;
288b8c02
NC
4348 REGEXP *rx;
4349 struct regexp *r;
f8fc2ecf 4350 register regexp_internal *ri;
3ab4a224 4351 STRLEN plen;
5d51ce98
KW
4352 char *exp;
4353 char* xend;
c277df42 4354 regnode *scan;
a0d0e21e 4355 I32 flags;
a0d0e21e 4356 I32 minlen = 0;
e7f38d0f
YO
4357
4358 /* these are all flags - maybe they should be turned
4359 * into a single int with different bit masks */
4360 I32 sawlookahead = 0;
a0d0e21e
LW
4361 I32 sawplus = 0;
4362 I32 sawopen = 0;
e7f38d0f 4363
bbd61b5f
KW
4364 U8 jump_ret = 0;
4365 dJMPENV;
2c2d71f5 4366 scan_data_t data;
830247a4 4367 RExC_state_t RExC_state;
be8e71aa 4368 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4369#ifdef TRIE_STUDY_OPT
5d51ce98 4370 int restudied;
07be1b83
YO
4371 RExC_state_t copyRExC_state;
4372#endif
2a5d9b1d 4373 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4374
4375 PERL_ARGS_ASSERT_RE_COMPILE;
4376
6d5c990f 4377 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4378
b9ad30b4 4379 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4380
02daf0ab 4381
bbd61b5f
KW
4382 /* Longjmp back to here if have to switch in midstream to utf8 */
4383 if (! RExC_orig_utf8) {
4384 JMPENV_PUSH(jump_ret);
4385 }
4386
5d51ce98
KW
4387 if (jump_ret == 0) { /* First time through */
4388 exp = SvPV(pattern, plen);
4389 xend = exp + plen;
4390
4391 DEBUG_COMPILE_r({
4392 SV *dsv= sv_newmortal();
4393 RE_PV_QUOTED_DECL(s, RExC_utf8,
4394 dsv, exp, plen, 60);
4395 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4396 PL_colors[4],PL_colors[5],s);
4397 });
4398 }
4399 else { /* longjumped back */
bbd61b5f
KW
4400 STRLEN len = plen;
4401
5d51ce98
KW
4402 /* If the cause for the longjmp was other than changing to utf8, pop
4403 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4404 if (jump_ret != UTF8_LONGJMP) {
4405 JMPENV_POP;
4406 JMPENV_JUMP(jump_ret);
4407 }
4408
595598ee
KW
4409 GET_RE_DEBUG_FLAGS;
4410
bbd61b5f
KW
4411 /* It's possible to write a regexp in ascii that represents Unicode
4412 codepoints outside of the byte range, such as via \x{100}. If we
4413 detect such a sequence we have to convert the entire pattern to utf8
4414 and then recompile, as our sizing calculation will have been based
4415 on 1 byte == 1 character, but we will need to use utf8 to encode
4416 at least some part of the pattern, and therefore must convert the whole
4417 thing.
4418 -- dmq */
4419 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4420 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4421 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4422 xend = exp + len;
4423 RExC_orig_utf8 = RExC_utf8 = 1;
4424 SAVEFREEPV(exp);
4425 }
4426
5d51ce98
KW
4427#ifdef TRIE_STUDY_OPT
4428 restudied = 0;
4429#endif
4430
02daf0ab 4431 RExC_precomp = exp;
c737faaf 4432 RExC_flags = pm_flags;
830247a4 4433 RExC_sawback = 0;
bbce6d69 4434
830247a4
IZ
4435 RExC_seen = 0;
4436 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4437 RExC_seen_evals = 0;
4438 RExC_extralen = 0;
c277df42 4439
bbce6d69 4440 /* First pass: determine size, legality. */
830247a4 4441 RExC_parse = exp;
fac92740 4442 RExC_start = exp;
830247a4
IZ
4443 RExC_end = xend;
4444 RExC_naughty = 0;
4445 RExC_npar = 1;
e2e6a0f1 4446 RExC_nestroot = 0;
830247a4
IZ
4447 RExC_size = 0L;
4448 RExC_emit = &PL_regdummy;
4449 RExC_whilem_seen = 0;
40d049e4
YO
4450 RExC_open_parens = NULL;
4451 RExC_close_parens = NULL;
4452 RExC_opend = NULL;
81714fb9 4453 RExC_paren_names = NULL;
1f1031fe
YO
4454#ifdef DEBUGGING
4455 RExC_paren_name_list = NULL;
4456#endif
40d049e4
YO
4457 RExC_recurse = NULL;
4458 RExC_recurse_count = 0;
81714fb9 4459
85ddcde9
JH
4460#if 0 /* REGC() is (currently) a NOP at the first pass.
4461 * Clever compilers notice this and complain. --jhi */
830247a4 4462 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4463#endif
3dab1dad
YO
4464 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4465 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4466 RExC_precomp = NULL;
a0d0e21e
LW
4467 return(NULL);
4468 }
bbd61b5f
KW
4469
4470 /* Here, finished first pass. Get rid of our setjmp, which we added for
4471 * efficiency only if the passed-in string wasn't in utf8, as shown by
4472 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4473 * 1 here even though the original string wasn't utf8, but in this case
4474 * there will have been a long jump */
4475 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4476 JMPENV_POP;
02daf0ab 4477 }
07be1b83 4478 DEBUG_PARSE_r({
81714fb9
YO
4479 PerlIO_printf(Perl_debug_log,
4480 "Required size %"IVdf" nodes\n"
4481 "Starting second pass (creation)\n",
4482 (IV)RExC_size);
07be1b83
YO
4483 RExC_lastnum=0;
4484 RExC_lastparse=NULL;
4485 });
c277df42
IZ
4486 /* Small enough for pointer-storage convention?
4487 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4488 if (RExC_size >= 0x10000L && RExC_extralen)
4489 RExC_size += RExC_extralen;
c277df42 4490 else
830247a4
IZ
4491 RExC_extralen = 0;
4492 if (RExC_whilem_seen > 15)
4493 RExC_whilem_seen = 15;
a0d0e21e 4494
f9f4320a
YO
4495 /* Allocate space and zero-initialize. Note, the two step process
4496 of zeroing when in debug mode, thus anything assigned has to
4497 happen after that */
d2f13c59 4498 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4499 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4500 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4501 char, regexp_internal);
4502 if ( r == NULL || ri == NULL )
b45f050a 4503 FAIL("Regexp out of space");
0f79a09d
GS
4504#ifdef DEBUGGING
4505 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4506 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4507#else
f8fc2ecf
YO
4508 /* bulk initialize base fields with 0. */
4509 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4510#endif
58e23c8d
YO
4511
4512 /* non-zero initialization begins here */
f8fc2ecf 4513 RXi_SET( r, ri );
f9f4320a 4514 r->engine= RE_ENGINE_PTR;
c737faaf 4515 r->extflags = pm_flags;
bcdf7404 4516 {
f7819f85 4517 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4518 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4519
4520 /* The caret is output if there are any defaults: if not all the STD
4521 * flags are set, or if no character set specifier is needed */
4522 bool has_default =
4523 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4524 || ! has_charset);
bcdf7404 4525 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4526 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4527 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4528 const char *fptr = STD_PAT_MODS; /*"msix"*/
4529 char *p;
fb85c044 4530 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4531 * on. If more precision is desired, we could do a population count of
4532 * the flags set. This could be done with a small lookup table, or by
4533 * shifting, masking and adding, or even, when available, assembly
4534 * language for a machine-language population count.
4535 * We never output a minus, as all those are defaults, so are
4536 * covered by the caret */
fb85c044 4537 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4538 + has_default /* If needs a caret */
4539 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4540 + (sizeof(STD_PAT_MODS) - 1)
4541 + (sizeof("(?:)") - 1);
4542
c5ea2ffa 4543 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4544 SvPOK_on(rx);
8f6ae13c 4545 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4546 *p++='('; *p++='?';
9de15fec
KW
4547
4548 /* If a default, cover it using the caret */
c5ea2ffa 4549 if (has_default) {
85508812 4550 *p++= DEFAULT_PAT_MOD;
fb85c044 4551 }
c5ea2ffa
KW
4552 if (has_charset) {
4553 if (r->extflags & RXf_PMf_LOCALE) {
4554 *p++ = LOCALE_PAT_MOD;
4555 } else {
4556 *p++ = UNICODE_PAT_MOD;
4557 }
9de15fec 4558 }
f7819f85
A
4559 if (has_p)
4560 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4561 {
bcdf7404 4562 char ch;
bcdf7404
YO
4563 while((ch = *fptr++)) {
4564 if(reganch & 1)
4565 *p++ = ch;
bcdf7404
YO
4566 reganch >>= 1;
4567 }
bcdf7404
YO
4568 }
4569
28d8d7f4 4570 *p++ = ':';
bb661a58 4571 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4572 assert ((RX_WRAPPED(rx) - p) < 16);
4573 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4574 p += plen;
bcdf7404 4575 if (has_runon)
28d8d7f4
YO
4576 *p++ = '\n';
4577 *p++ = ')';
4578 *p = 0;
fb85c044 4579 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4580 }
4581
bbe252da 4582 r->intflags = 0;
830247a4 4583 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4584
6bda09f9 4585 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4586 Newxz(RExC_open_parens, RExC_npar,regnode *);
4587 SAVEFREEPV(RExC_open_parens);
4588 Newxz(RExC_close_parens,RExC_npar,regnode *);
4589 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4590 }
4591
4592 /* Useful during FAIL. */
7122b237
YO
4593#ifdef RE_TRACK_PATTERN_OFFSETS
4594 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4595 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4596 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4597 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4598 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4599#endif
4600 SetProgLen(ri,RExC_size);
288b8c02 4601 RExC_rx_sv = rx;
830247a4 4602 RExC_rx = r;
f8fc2ecf 4603 RExC_rxi = ri;
bbce6d69 4604
4605 /* Second pass: emit code. */
c737faaf 4606 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4607 RExC_parse = exp;
4608 RExC_end = xend;
4609 RExC_naughty = 0;
4610 RExC_npar = 1;
f8fc2ecf
YO
4611 RExC_emit_start = ri->program;
4612 RExC_emit = ri->program;
3b57cd43
YO
4613 RExC_emit_bound = ri->program + RExC_size + 1;
4614
2cd61cdb 4615 /* Store the count of eval-groups for security checks: */
f8149455 4616 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4617 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4618 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4619 ReREFCNT_dec(rx);
a0d0e21e 4620 return(NULL);
80757612 4621 }
07be1b83
YO
4622 /* XXXX To minimize changes to RE engine we always allocate
4623 3-units-long substrs field. */
4624 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4625 if (RExC_recurse_count) {
4626 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4627 SAVEFREEPV(RExC_recurse);
4628 }
a0d0e21e 4629
07be1b83 4630reStudy:
e7f38d0f 4631 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4632 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4633
07be1b83 4634#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4635 if (!restudied) {
4636 StructCopy(&zero_scan_data, &data, scan_data_t);
4637 copyRExC_state = RExC_state;
4638 } else {
5d458dd8 4639 U32 seen=RExC_seen;
07be1b83 4640 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4641
4642 RExC_state = copyRExC_state;
4643 if (seen & REG_TOP_LEVEL_BRANCHES)
4644 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4645 else
4646 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4647 if (data.last_found) {
07be1b83 4648 SvREFCNT_dec(data.longest_fixed);
07be1b83 4649 SvREFCNT_dec(data.longest_float);
07be1b83 4650 SvREFCNT_dec(data.last_found);
1de06328 4651 }
40d049e4 4652 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4653 }
40d049e4
YO
4654#else
4655 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4656#endif
fc8cd66c 4657
a0d0e21e 4658 /* Dig out information for optimizations. */
f7819f85 4659 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4660 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4661
a0ed51b3 4662 if (UTF)
8f6ae13c 4663 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4664 ri->regstclass = NULL;
830247a4 4665 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4666 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4667 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4668
1de06328
YO
4669 /* testing for BRANCH here tells us whether there is "must appear"
4670 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4671 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4672 I32 fake;
c5254dd6 4673 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4674 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4675 int stclass_flag;
07be1b83 4676 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4677 regnode *first= scan;
4678 regnode *first_next= regnext(first);
639081d6
YO
4679 /*
4680 * Skip introductions and multiplicators >= 1
4681 * so that we can extract the 'meat' of the pattern that must
4682 * match in the large if() sequence following.
4683 * NOTE that EXACT is NOT covered here, as it is normally
4684 * picked up by the optimiser separately.
4685 *
4686 * This is unfortunate as the optimiser isnt handling lookahead
4687 * properly currently.
4688 *
4689 */
a0d0e21e 4690 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4691 /* An OR of *one* alternative - should not happen now. */
5339e136 4692 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4693 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4694 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4695 (OP(first) == PLUS) ||
4696 (OP(first) == MINMOD) ||
653099ff 4697 /* An {n,m} with n>0 */
5339e136
YO
4698 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4699 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4700 {
639081d6
YO
4701 /*
4702 * the only op that could be a regnode is PLUS, all the rest
4703 * will be regnode_1 or regnode_2.
4704 *
4705 */
a0d0e21e
LW
4706 if (OP(first) == PLUS)
4707 sawplus = 1;
4708 else
3dab1dad 4709 first += regarglen[OP(first)];
639081d6
YO
4710
4711 first = NEXTOPER(first);
5339e136 4712 first_next= regnext(first);
a687059c
LW
4713 }
4714
a0d0e21e
LW
4715 /* Starting-point info. */
4716 again:
786e8c11 4717 DEBUG_PEEP("first:",first,0);
07be1b83 4718 /* Ignore EXACT as we deal with it later. */
3dab1dad 4719 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4720 if (OP(first) == EXACT)
6f207bd3 4721 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4722 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4723 ri->regstclass = first;
b3c9acc1 4724 }
07be1b83 4725#ifdef TRIE_STCLASS
786e8c11 4726 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4727 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4728 {
786e8c11 4729 regnode *trie_op;
07be1b83 4730 /* this can happen only on restudy */
786e8c11 4731 if ( OP(first) == TRIE ) {
c944940b 4732 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4733 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4734 StructCopy(first,trieop,struct regnode_1);
4735 trie_op=(regnode *)trieop;
4736 } else {
c944940b 4737 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4738 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4739 StructCopy(first,trieop,struct regnode_charclass);
4740 trie_op=(regnode *)trieop;
4741 }
1de06328 4742 OP(trie_op)+=2;
786e8c11 4743 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4744 ri->regstclass = trie_op;
07be1b83
YO
4745 }
4746#endif
e52fc539 4747 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4748 ri->regstclass = first;
3dab1dad
YO
4749 else if (PL_regkind[OP(first)] == BOUND ||
4750 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4751 ri->regstclass = first;
3dab1dad 4752 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4753 r->extflags |= (OP(first) == MBOL
4754 ? RXf_ANCH_MBOL
cad2e5aa 4755 : (OP(first) == SBOL
bbe252da
YO
4756 ? RXf_ANCH_SBOL
4757 : RXf_ANCH_BOL));
a0d0e21e 4758 first = NEXTOPER(first);
774d564b 4759 goto again;
4760 }
4761 else if (OP(first) == GPOS) {
bbe252da 4762 r->extflags |= RXf_ANCH_GPOS;
774d564b 4763 first = NEXTOPER(first);
4764 goto again;
a0d0e21e 4765 }
cf2a2b69
YO
4766 else if ((!sawopen || !RExC_sawback) &&
4767 (OP(first) == STAR &&
3dab1dad 4768 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4769 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4770 {
4771 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4772 const int type =
4773 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4774 ? RXf_ANCH_MBOL
4775 : RXf_ANCH_SBOL;
4776 r->extflags |= type;
4777 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4778 first = NEXTOPER(first);
774d564b 4779 goto again;
a0d0e21e 4780 }
e7f38d0f 4781 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4782 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4783 /* x+ must match at the 1st pos of run of x's */
bbe252da 4784 r->intflags |= PREGf_SKIP;
a0d0e21e 4785
c277df42 4786 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4787#ifdef TRIE_STUDY_OPT
81714fb9 4788 DEBUG_PARSE_r(
be8e71aa
YO
4789 if (!restudied)
4790 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4791 (IV)(first - scan + 1))
4792 );
4793#else
81714fb9 4794 DEBUG_PARSE_r(
be8e71aa
YO
4795 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4796 (IV)(first - scan + 1))
4797 );
4798#endif
4799
4800
a0d0e21e
LW
4801 /*
4802 * If there's something expensive in the r.e., find the
4803 * longest literal string that must appear and make it the
4804 * regmust. Resolve ties in favor of later strings, since
4805 * the regstart check works with the beginning of the r.e.
4806 * and avoiding duplication strengthens checking. Not a
4807 * strong reason, but sufficient in the absence of others.
4808 * [Now we resolve ties in favor of the earlier string if
c277df42 4809 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4810 * earlier string may buy us something the later one won't.]
4811 */
de8c5301 4812
396482e1
GA
4813 data.longest_fixed = newSVpvs("");
4814 data.longest_float = newSVpvs("");
4815 data.last_found = newSVpvs("");
c277df42
IZ
4816 data.longest = &(data.longest_fixed);
4817 first = scan;
f8fc2ecf 4818 if (!ri->regstclass) {
830247a4 4819 cl_init(pRExC_state, &ch_class);
653099ff
GS
4820 data.start_class = &ch_class;
4821 stclass_flag = SCF_DO_STCLASS_AND;
4822 } else /* XXXX Check for BOUND? */
4823 stclass_flag = 0;
cb434fcc 4824 data.last_closep = &last_close;
de8c5301 4825
1de06328 4826 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4827 &data, -1, NULL, NULL,
4828 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4829
07be1b83 4830
786e8c11
YO
4831 CHECK_RESTUDY_GOTO;
4832
4833
830247a4 4834 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4835 && data.last_start_min == 0 && data.last_end > 0
830247a4 4836 && !RExC_seen_zerolen
2bf803e2 4837 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4838 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4839 r->extflags |= RXf_CHECK_ALL;
304ee84b 4840 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4841 SvREFCNT_dec(data.last_found);
4842
1de06328
YO
4843 /* Note that code very similar to this but for anchored string
4844 follows immediately below, changes may need to be made to both.
4845 Be careful.
4846 */
a0ed51b3 4847 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4848 if (longest_float_length
c277df42
IZ
4849 || (data.flags & SF_FL_BEFORE_EOL
4850 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4851 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4852 {
1182767e 4853 I32 t,ml;
cf93c79d 4854
1de06328 4855 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4856 && data.offset_fixed == data.offset_float_min
4857 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4858 goto remove_float; /* As in (a)+. */
4859
1de06328
YO
4860 /* copy the information about the longest float from the reg_scan_data
4861 over to the program. */
33b8afdf
JH
4862 if (SvUTF8(data.longest_float)) {
4863 r->float_utf8 = data.longest_float;
c445ea15 4864 r->float_substr = NULL;
33b8afdf
JH
4865 } else {
4866 r->float_substr = data.longest_float;
c445ea15 4867 r->float_utf8 = NULL;
33b8afdf 4868 }
1de06328
YO
4869 /* float_end_shift is how many chars that must be matched that
4870 follow this item. We calculate it ahead of time as once the
4871 lookbehind offset is added in we lose the ability to correctly
4872 calculate it.*/
4873 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4874 : (I32)longest_float_length;
1de06328
YO
4875 r->float_end_shift = ml - data.offset_float_min
4876 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4877 + data.lookbehind_float;
4878 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4879 r->float_max_offset = data.offset_float_max;
1182767e 4880 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4881 r->float_max_offset -= data.lookbehind_float;
4882
cf93c79d
IZ
4883 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4884 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4885 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4886 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4887 }
4888 else {
aca2d497 4889 remove_float:
c445ea15 4890 r->float_substr = r->float_utf8 = NULL;
c277df42 4891 SvREFCNT_dec(data.longest_float);
c5254dd6 4892 longest_float_length = 0;
a0d0e21e 4893 }
c277df42 4894
1de06328
YO
4895 /* Note that code very similar to this but for floating string
4896 is immediately above, changes may need to be made to both.
4897 Be careful.
4898 */
a0ed51b3 4899 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4900 if (longest_fixed_length
c277df42
IZ
4901 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4902 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4903 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4904 {
1182767e 4905 I32 t,ml;
cf93c79d 4906
1de06328
YO
4907 /* copy the information about the longest fixed
4908 from the reg_scan_data over to the program. */
33b8afdf
JH
4909 if (SvUTF8(data.longest_fixed)) {
4910 r->anchored_utf8 = data.longest_fixed;
c445ea15 4911 r->anchored_substr = NULL;
33b8afdf
JH
4912 } else {
4913 r->anchored_substr = data.longest_fixed;
c445ea15 4914 r->anchored_utf8 = NULL;
33b8afdf 4915 }
1de06328
YO
4916 /* fixed_end_shift is how many chars that must be matched that
4917 follow this item. We calculate it ahead of time as once the
4918 lookbehind offset is added in we lose the ability to correctly
4919 calculate it.*/
4920 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4921 : (I32)longest_fixed_length;
1de06328
YO
4922 r->anchored_end_shift = ml - data.offset_fixed
4923 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4924 + data.lookbehind_fixed;
4925 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4926
cf93c79d
IZ
4927 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4928 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4929 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4930 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4931 }
4932 else {
c445ea15 4933 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4934 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4935 longest_fixed_length = 0;
a0d0e21e 4936 }
f8fc2ecf
YO
4937 if (ri->regstclass
4938 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4939 ri->regstclass = NULL;
33b8afdf
JH
4940 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4941 && stclass_flag
653099ff 4942 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4943 && !cl_is_anything(data.start_class))
4944 {
2eccd3b2 4945 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4946
f8fc2ecf 4947 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4948 struct regnode_charclass_class);
4949 StructCopy(data.start_class,
f8fc2ecf 4950 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4951 struct regnode_charclass_class);
f8fc2ecf 4952 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4953 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4954 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4955 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4956 PerlIO_printf(Perl_debug_log,
a0288114 4957 "synthetic stclass \"%s\".\n",
3f7c398e 4958 SvPVX_const(sv));});
653099ff 4959 }
c277df42
IZ
4960
4961 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4962 if (longest_fixed_length > longest_float_length) {
1de06328 4963 r->check_end_shift = r->anchored_end_shift;
c277df42 4964 r->check_substr = r->anchored_substr;
33b8afdf 4965 r->check_utf8 = r->anchored_utf8;
c277df42 4966 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4967 if (r->extflags & RXf_ANCH_SINGLE)
4968 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4969 }
4970 else {
1de06328 4971 r->check_end_shift = r->float_end_shift;
c277df42 4972 r->check_substr = r->float_substr;
33b8afdf 4973 r->check_utf8 = r->float_utf8;
1de06328
YO
4974 r->check_offset_min = r->float_min_offset;
4975 r->check_offset_max = r->float_max_offset;
a0d0e21e 4976 }
30382c73
IZ
4977 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4978 This should be changed ASAP! */
bbe252da
YO
4979 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4980 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4981 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4982 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4983 }
1de06328
YO
4984 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4985 if ( (STRLEN)minlen < longest_float_length )
4986 minlen= longest_float_length;
4987 if ( (STRLEN)minlen < longest_fixed_length )
4988 minlen= longest_fixed_length;
4989 */
a0ed51b3
LW
4990 }
4991 else {
c277df42
IZ
4992 /* Several toplevels. Best we can is to set minlen. */
4993 I32 fake;
653099ff 4994 struct regnode_charclass_class ch_class;
cb434fcc 4995 I32 last_close = 0;
c277df42 4996
5d458dd8 4997 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4998
f8fc2ecf 4999 scan = ri->program + 1;
830247a4 5000 cl_init(pRExC_state, &ch_class);
653099ff 5001 data.start_class = &ch_class;
cb434fcc 5002 data.last_closep = &last_close;
07be1b83 5003
de8c5301 5004
1de06328 5005 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5006 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5007
786e8c11 5008 CHECK_RESTUDY_GOTO;
07be1b83 5009
33b8afdf 5010 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5011 = r->float_substr = r->float_utf8 = NULL;
653099ff 5012 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5013 && !cl_is_anything(data.start_class))
5014 {
2eccd3b2 5015 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5016
f8fc2ecf 5017 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5018 struct regnode_charclass_class);
5019 StructCopy(data.start_class,
f8fc2ecf 5020 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5021 struct regnode_charclass_class);
f8fc2ecf 5022 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5023 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5024 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5025 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5026 PerlIO_printf(Perl_debug_log,
a0288114 5027 "synthetic stclass \"%s\".\n",
3f7c398e 5028 SvPVX_const(sv));});
653099ff 5029 }
a0d0e21e
LW
5030 }
5031
1de06328
YO
5032 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5033 the "real" pattern. */
cf9788e3
RGS
5034 DEBUG_OPTIMISE_r({
5035 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5036 (IV)minlen, (IV)r->minlen);
cf9788e3 5037 });
de8c5301 5038 r->minlenret = minlen;
1de06328
YO
5039 if (r->minlen < minlen)
5040 r->minlen = minlen;
5041
b81d288d 5042 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5043 r->extflags |= RXf_GPOS_SEEN;
830247a4 5044 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5045 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5046 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5047 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5048 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5049 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5050 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5051 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5052 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5053 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5054 if (RExC_paren_names)
85fbaab2 5055 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5056 else
5daac39c 5057 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5058
7bd1e614 5059#ifdef STUPID_PATTERN_CHECKS
5509d87a 5060 if (RX_PRELEN(rx) == 0)
640f820d 5061 r->extflags |= RXf_NULL;
5509d87a 5062 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5063 /* XXX: this should happen BEFORE we compile */
5064 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5065 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5066 r->extflags |= RXf_WHITE;
5509d87a 5067 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5068 r->extflags |= RXf_START_ONLY;
f1b875a0 5069#else
5509d87a 5070 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5071 /* XXX: this should happen BEFORE we compile */
5072 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5073 else {
5074 regnode *first = ri->program + 1;
39aa8307
JH
5075 U8 fop = OP(first);
5076 U8 nop = OP(NEXTOPER(first));
7bd1e614 5077
640f820d
AB
5078 if (PL_regkind[fop] == NOTHING && nop == END)
5079 r->extflags |= RXf_NULL;
5080 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5081 r->extflags |= RXf_START_ONLY;
5082 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5083 r->extflags |= RXf_WHITE;
5084 }
f1b875a0 5085#endif
1f1031fe
YO
5086#ifdef DEBUGGING
5087 if (RExC_paren_names) {
af534a04 5088 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5089 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5090 } else
1f1031fe 5091#endif
cde0cee5 5092 ri->name_list_idx = 0;
1f1031fe 5093
40d049e4
YO
5094 if (RExC_recurse_count) {
5095 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5096 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5097 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5098 }
5099 }
f0ab9afb 5100 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5101 /* assume we don't need to swap parens around before we match */
5102
be8e71aa
YO
5103 DEBUG_DUMP_r({
5104 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5105 regdump(r);
5106 });
7122b237
YO
5107#ifdef RE_TRACK_PATTERN_OFFSETS
5108 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5109 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5110 U32 i;
5111 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5112 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5113 for (i = 1; i <= len; i++) {
7122b237 5114 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5115 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5116 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5117 }
5118 PerlIO_printf(Perl_debug_log, "\n");
5119 });
7122b237 5120#endif
288b8c02 5121 return rx;
a687059c
LW
5122}
5123
f9f4320a 5124#undef RE_ENGINE_PTR
3dab1dad 5125
93b32b6d 5126
81714fb9 5127SV*
192b9cd1
AB
5128Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5129 const U32 flags)
5130{
7918f24d
NC
5131 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5132
192b9cd1
AB
5133 PERL_UNUSED_ARG(value);
5134
f1b875a0 5135 if (flags & RXapif_FETCH) {
192b9cd1 5136 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5137 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5138 Perl_croak_no_modify(aTHX);
192b9cd1 5139 return NULL;
f1b875a0 5140 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5141 return reg_named_buff_exists(rx, key, flags)
5142 ? &PL_sv_yes
5143 : &PL_sv_no;
f1b875a0 5144 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5145 return reg_named_buff_all(rx, flags);
f1b875a0 5146 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5147 return reg_named_buff_scalar(rx, flags);
5148 } else {
5149 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5150 return NULL;
5151 }
5152}
5153
5154SV*
5155Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5156 const U32 flags)
5157{
7918f24d 5158 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5159 PERL_UNUSED_ARG(lastkey);
5160
f1b875a0 5161 if (flags & RXapif_FIRSTKEY)
192b9cd1 5162 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5163 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5164 return reg_named_buff_nextkey(rx, flags);
5165 else {
5166 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5167 return NULL;
5168 }
5169}
5170
5171SV*
288b8c02
NC
5172Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5173 const U32 flags)
81714fb9 5174{
44a2ac75
YO
5175 AV *retarray = NULL;
5176 SV *ret;
288b8c02 5177 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5178
5179 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5180
f1b875a0 5181 if (flags & RXapif_ALL)
44a2ac75 5182 retarray=newAV();
93b32b6d 5183
5daac39c
NC
5184 if (rx && RXp_PAREN_NAMES(rx)) {
5185 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5186 if (he_str) {
5187 IV i;
5188 SV* sv_dat=HeVAL(he_str);
5189 I32 *nums=(I32*)SvPVX(sv_dat);
5190 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5191 if ((I32)(rx->nparens) >= nums[i]
5192 && rx->offs[nums[i]].start != -1
5193 && rx->offs[nums[i]].end != -1)
93b32b6d 5194 {
49d7dfbc 5195 ret = newSVpvs("");
288b8c02 5196 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5197 if (!retarray)
5198 return ret;
5199 } else {
5200 ret = newSVsv(&PL_sv_undef);
5201 }
ec83ea38 5202 if (retarray)
93b32b6d 5203 av_push(retarray, ret);
81714fb9 5204 }
93b32b6d 5205 if (retarray)
ad64d0ec 5206 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5207 }
5208 }
5209 return NULL;
5210}
5211
5212bool
288b8c02 5213Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5214 const U32 flags)
5215{
288b8c02 5216 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5217
5218 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5219
5daac39c 5220 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5221 if (flags & RXapif_ALL) {
5daac39c 5222 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5223 } else {
288b8c02 5224 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5225 if (sv) {
5226 SvREFCNT_dec(sv);
192b9cd1
AB
5227 return TRUE;
5228 } else {
5229 return FALSE;
5230 }
5231 }
5232 } else {
5233 return FALSE;
5234 }
5235}
5236
5237SV*
288b8c02 5238Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5239{
288b8c02 5240 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5241
5242 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5243
5daac39c
NC
5244 if ( rx && RXp_PAREN_NAMES(rx) ) {
5245 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5246
288b8c02 5247 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5248 } else {
5249 return FALSE;
5250 }
192b9cd1
AB
5251}
5252
5253SV*
288b8c02 5254Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5255{
288b8c02 5256 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5257 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5258
5259 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5260
5daac39c
NC
5261 if (rx && RXp_PAREN_NAMES(rx)) {
5262 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5263 HE *temphe;
5264 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5265 IV i;
5266 IV parno = 0;
5267 SV* sv_dat = HeVAL(temphe);
5268 I32 *nums = (I32*)SvPVX(sv_dat);
5269 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5270 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5271 rx->offs[nums[i]].start != -1 &&
5272 rx->offs[nums[i]].end != -1)
5273 {
5274 parno = nums[i];
5275 break;
5276 }
5277 }
f1b875a0 5278 if (parno || flags & RXapif_ALL) {
a663657d 5279 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5280 }
81714fb9
YO
5281 }
5282 }
44a2ac75
YO
5283 return NULL;
5284}
5285
192b9cd1 5286SV*
288b8c02 5287Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5288{
5289 SV *ret;
5290 AV *av;
5291 I32 length;
288b8c02 5292 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5293
7918f24d
NC
5294 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5295
5daac39c 5296 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5297 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5298 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5299 } else if (flags & RXapif_ONE) {
288b8c02 5300 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5301 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5302 length = av_len(av);
ec83ea38 5303 SvREFCNT_dec(ret);
192b9cd1
AB
5304 return newSViv(length + 1);
5305 } else {
5306 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5307 return NULL;
5308 }
5309 }
5310 return &PL_sv_undef;
5311}
5312
5313SV*
288b8c02 5314Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5315{
288b8c02 5316 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5317 AV *av = newAV();
5318
7918f24d
NC
5319 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5320
5daac39c
NC
5321 if (rx && RXp_PAREN_NAMES(rx)) {
5322 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5323 HE *temphe;
5324 (void)hv_iterinit(hv);
5325 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5326 IV i;
5327 IV parno = 0;
5328 SV* sv_dat = HeVAL(temphe);
5329 I32 *nums = (I32*)SvPVX(sv_dat);
5330 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5331 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5332 rx->offs[nums[i]].start != -1 &&
5333 rx->offs[nums[i]].end != -1)
5334 {
5335 parno = nums[i];
5336 break;
5337 }
5338 }
f1b875a0 5339 if (parno || flags & RXapif_ALL) {
a663657d 5340 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5341 }
5342 }
5343 }
5344
ad64d0ec 5345 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5346}
5347
49d7dfbc 5348void
288b8c02
NC
5349Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5350 SV * const sv)
44a2ac75 5351{
288b8c02 5352 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5353 char *s = NULL;
a9d504c3 5354 I32 i = 0;
44a2ac75 5355 I32 s1, t1;
7918f24d
NC
5356
5357 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5358
cde0cee5
YO
5359 if (!rx->subbeg) {
5360 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5361 return;
cde0cee5
YO
5362 }
5363 else
f1b875a0 5364 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5365 /* $` */
f0ab9afb 5366 i = rx->offs[0].start;
cde0cee5 5367 s = rx->subbeg;
44a2ac75
YO
5368 }
5369 else
f1b875a0 5370 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5371 /* $' */
f0ab9afb
NC
5372 s = rx->subbeg + rx->offs[0].end;
5373 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5374 }
5375 else
5376 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5377 (s1 = rx->offs[paren].start) != -1 &&
5378 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5379 {
5380 /* $& $1 ... */
5381 i = t1 - s1;
5382 s = rx->subbeg + s1;
cde0cee5
YO
5383 } else {
5384 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5385 return;
cde0cee5
YO
5386 }
5387 assert(rx->sublen >= (s - rx->subbeg) + i );
5388 if (i >= 0) {
5389 const int oldtainted = PL_tainted;
5390 TAINT_NOT;
5391 sv_setpvn(sv, s, i);
5392 PL_tainted = oldtainted;
5393 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5394 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5395 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5396 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5397 {
5398 SvUTF8_on(sv);
5399 }
5400 else
5401 SvUTF8_off(sv);
5402 if (PL_tainting) {
07bc277f 5403 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5404 if (SvTYPE(sv) >= SVt_PVMG) {
5405 MAGIC* const mg = SvMAGIC(sv);
5406 MAGIC* mgt;
5407 PL_tainted = 1;
5408 SvMAGIC_set(sv, mg->mg_moremagic);
5409 SvTAINT(sv);
5410 if ((mgt = SvMAGIC(sv))) {
5411 mg->mg_moremagic = mgt;
5412 SvMAGIC_set(sv, mg);
44a2ac75 5413 }
cde0cee5
YO
5414 } else {
5415 PL_tainted = 1;
5416 SvTAINT(sv);
5417 }
5418 } else
5419 SvTAINTED_off(sv);
44a2ac75 5420 }
81714fb9 5421 } else {
44a2ac75 5422 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5423 return;
81714fb9
YO
5424 }
5425}
93b32b6d 5426
2fdbfb4d
AB
5427void
5428Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5429 SV const * const value)
5430{
7918f24d
NC
5431 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5432
2fdbfb4d
AB
5433 PERL_UNUSED_ARG(rx);
5434 PERL_UNUSED_ARG(paren);
5435 PERL_UNUSED_ARG(value);
5436
5437 if (!PL_localizing)
6ad8f254 5438 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5439}
5440
5441I32
288b8c02 5442Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5443 const I32 paren)
5444{
288b8c02 5445 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5446 I32 i;
5447 I32 s1, t1;
5448
7918f24d
NC
5449 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5450
2fdbfb4d
AB
5451 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5452 switch (paren) {
192b9cd1 5453 /* $` / ${^PREMATCH} */
f1b875a0 5454 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5455 if (rx->offs[0].start != -1) {
5456 i = rx->offs[0].start;
5457 if (i > 0) {
5458 s1 = 0;
5459 t1 = i;
5460 goto getlen;
5461 }
5462 }
5463 return 0;
192b9cd1 5464 /* $' / ${^POSTMATCH} */
f1b875a0 5465 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5466 if (rx->offs[0].end != -1) {
5467 i = rx->sublen - rx->offs[0].end;
5468 if (i > 0) {
5469 s1 = rx->offs[0].end;
5470 t1 = rx->sublen;
5471 goto getlen;
5472 }
5473 }
5474 return 0;
192b9cd1
AB
5475 /* $& / ${^MATCH}, $1, $2, ... */
5476 default:
2fdbfb4d
AB
5477 if (paren <= (I32)rx->nparens &&
5478 (s1 = rx->offs[paren].start) != -1 &&
5479 (t1 = rx->offs[paren].end) != -1)
5480 {
5481 i = t1 - s1;
5482 goto getlen;
5483 } else {
5484 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5485 report_uninit((const SV *)sv);
2fdbfb4d
AB
5486 return 0;
5487 }
5488 }
5489 getlen:
07bc277f 5490 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5491 const char * const s = rx->subbeg + s1;
5492 const U8 *ep;
5493 STRLEN el;
5494
5495 i = t1 - s1;
5496 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5497 i = el;
5498 }
5499 return i;
5500}
5501
fe578d7f 5502SV*
49d7dfbc 5503Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5504{
7918f24d 5505 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5506 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5507 if (0)
5508 return NULL;
5509 else
5510 return newSVpvs("Regexp");
fe578d7f 5511}
0a4db386 5512
894be9b7 5513/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5514 * If flags is REG_RSN_RETURN_NULL returns null.
5515 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5516 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5517 * to the parsed name as looked up in the RExC_paren_names hash.
5518 * If there is an error throws a vFAIL().. type exception.
894be9b7 5519 */
0a4db386
YO
5520
5521#define REG_RSN_RETURN_NULL 0
5522#define REG_RSN_RETURN_NAME 1
5523#define REG_RSN_RETURN_DATA 2
5524
894be9b7 5525STATIC SV*
7918f24d
NC
5526S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5527{
894be9b7 5528 char *name_start = RExC_parse;
1f1031fe 5529
7918f24d
NC
5530 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5531
1f1031fe
YO
5532 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5533 /* skip IDFIRST by using do...while */
5534 if (UTF)
5535 do {
5536 RExC_parse += UTF8SKIP(RExC_parse);
5537 } while (isALNUM_utf8((U8*)RExC_parse));
5538 else
5539 do {
5540 RExC_parse++;
5541 } while (isALNUM(*RExC_parse));
894be9b7 5542 }
1f1031fe 5543
0a4db386 5544 if ( flags ) {
59cd0e26
NC
5545 SV* sv_name
5546 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5547 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5548 if ( flags == REG_RSN_RETURN_NAME)
5549 return sv_name;
5550 else if (flags==REG_RSN_RETURN_DATA) {
5551 HE *he_str = NULL;
5552 SV *sv_dat = NULL;
5553 if ( ! sv_name ) /* should not happen*/
5554 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5555 if (RExC_paren_names)
5556 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5557 if ( he_str )
5558 sv_dat = HeVAL(he_str);
5559 if ( ! sv_dat )
5560 vFAIL("Reference to nonexistent named group");
5561 return sv_dat;
5562 }
5563 else {
5564 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5565 }
5566 /* NOT REACHED */
894be9b7 5567 }
0a4db386 5568 return NULL;
894be9b7
YO
5569}
5570
3dab1dad
YO
5571#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5572 int rem=(int)(RExC_end - RExC_parse); \
5573 int cut; \
5574 int num; \
5575 int iscut=0; \
5576 if (rem>10) { \
5577 rem=10; \
5578 iscut=1; \
5579 } \
5580 cut=10-rem; \
5581 if (RExC_lastparse!=RExC_parse) \
5582 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5583 rem, RExC_parse, \
5584 cut + 4, \
5585 iscut ? "..." : "<" \
5586 ); \
5587 else \
5588 PerlIO_printf(Perl_debug_log,"%16s",""); \
5589 \
5590 if (SIZE_ONLY) \
3b57cd43 5591 num = RExC_size + 1; \
3dab1dad
YO
5592 else \
5593 num=REG_NODE_NUM(RExC_emit); \
5594 if (RExC_lastnum!=num) \
0a4db386 5595 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5596 else \
0a4db386 5597 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5598 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5599 (int)((depth*2)), "", \
3dab1dad
YO
5600 (funcname) \
5601 ); \
5602 RExC_lastnum=num; \
5603 RExC_lastparse=RExC_parse; \
5604})
5605
07be1b83
YO
5606
5607
3dab1dad
YO
5608#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5609 DEBUG_PARSE_MSG((funcname)); \
5610 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5611})
6bda09f9
YO
5612#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5613 DEBUG_PARSE_MSG((funcname)); \
5614 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5615})
a687059c
LW
5616/*
5617 - reg - regular expression, i.e. main body or parenthesized thing
5618 *
5619 * Caller must absorb opening parenthesis.
5620 *
5621 * Combining parenthesis handling with the base level of regular expression
5622 * is a trifle forced, but the need to tie the tails of the branches to what
5623 * follows makes it hard to avoid.
5624 */
07be1b83
YO
5625#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5626#ifdef DEBUGGING
5627#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5628#else
5629#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5630#endif
3dab1dad 5631
76e3520e 5632STATIC regnode *
3dab1dad 5633S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5634 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5635{
27da23d5 5636 dVAR;
c277df42
IZ
5637 register regnode *ret; /* Will be the head of the group. */
5638 register regnode *br;
5639 register regnode *lastbr;
cbbf8932 5640 register regnode *ender = NULL;
a0d0e21e 5641 register I32 parno = 0;
cbbf8932 5642 I32 flags;
f7819f85 5643 U32 oregflags = RExC_flags;
6136c704
AL
5644 bool have_branch = 0;
5645 bool is_open = 0;
594d7033
YO
5646 I32 freeze_paren = 0;
5647 I32 after_freeze = 0;
9d1d55b5
JP
5648
5649 /* for (?g), (?gc), and (?o) warnings; warning
5650 about (?c) will warn about (?g) -- japhy */
5651
6136c704
AL
5652#define WASTED_O 0x01
5653#define WASTED_G 0x02
5654#define WASTED_C 0x04
5655#define WASTED_GC (0x02|0x04)
cbbf8932 5656 I32 wastedflags = 0x00;
9d1d55b5 5657
fac92740 5658 char * parse_start = RExC_parse; /* MJD */
a28509cc 5659 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5660
3dab1dad 5661 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5662
5663 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5664 DEBUG_PARSE("reg ");
5665
821b33a5 5666 *flagp = 0; /* Tentatively. */
a0d0e21e 5667
9d1d55b5 5668
a0d0e21e
LW
5669 /* Make an OPEN node, if parenthesized. */
5670 if (paren) {
e2e6a0f1
YO
5671 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5672 char *start_verb = RExC_parse;
5673 STRLEN verb_len = 0;
5674 char *start_arg = NULL;
5675 unsigned char op = 0;
5676 int argok = 1;
5677 int internal_argval = 0; /* internal_argval is only useful if !argok */
5678 while ( *RExC_parse && *RExC_parse != ')' ) {
5679 if ( *RExC_parse == ':' ) {
5680 start_arg = RExC_parse + 1;
5681 break;
5682 }
5683 RExC_parse++;
5684 }
5685 ++start_verb;
5686 verb_len = RExC_parse - start_verb;
5687 if ( start_arg ) {
5688 RExC_parse++;
5689 while ( *RExC_parse && *RExC_parse != ')' )
5690 RExC_parse++;
5691 if ( *RExC_parse != ')' )
5692 vFAIL("Unterminated verb pattern argument");
5693 if ( RExC_parse == start_arg )
5694 start_arg = NULL;
5695 } else {
5696 if ( *RExC_parse != ')' )
5697 vFAIL("Unterminated verb pattern");
5698 }
5d458dd8 5699
e2e6a0f1
YO
5700 switch ( *start_verb ) {
5701 case 'A': /* (*ACCEPT) */
568a785a 5702 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5703 op = ACCEPT;
5704 internal_argval = RExC_nestroot;
5705 }
5706 break;
5707 case 'C': /* (*COMMIT) */
568a785a 5708 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5709 op = COMMIT;
e2e6a0f1
YO
5710 break;
5711 case 'F': /* (*FAIL) */
568a785a 5712 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5713 op = OPFAIL;
5714 argok = 0;
5715 }
5716 break;
5d458dd8
YO
5717 case ':': /* (*:NAME) */
5718 case 'M': /* (*MARK:NAME) */
568a785a 5719 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5720 op = MARKPOINT;
5d458dd8
YO
5721 argok = -1;
5722 }
5723 break;
5724 case 'P': /* (*PRUNE) */
568a785a 5725 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5726 op = PRUNE;
e2e6a0f1 5727 break;
5d458dd8 5728 case 'S': /* (*SKIP) */
568a785a 5729 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5730 op = SKIP;
5731 break;
5732 case 'T': /* (*THEN) */
5733 /* [19:06] <TimToady> :: is then */
568a785a 5734 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5735 op = CUTGROUP;
5736 RExC_seen |= REG_SEEN_CUTGROUP;
5737 }
e2e6a0f1
YO
5738 break;
5739 }
5740 if ( ! op ) {
5741 RExC_parse++;
5742 vFAIL3("Unknown verb pattern '%.*s'",
5743 verb_len, start_verb);
5744 }
5745 if ( argok ) {
5746 if ( start_arg && internal_argval ) {
5747 vFAIL3("Verb pattern '%.*s' may not have an argument",
5748 verb_len, start_verb);
5749 } else if ( argok < 0 && !start_arg ) {
5750 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5751 verb_len, start_verb);
5752 } else {
5753 ret = reganode(pRExC_state, op, internal_argval);
5754 if ( ! internal_argval && ! SIZE_ONLY ) {
5755 if (start_arg) {
5756 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5757 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5758 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5759 ret->flags = 0;
5760 } else {
5761 ret->flags = 1;
5762 }
5763 }
5764 }
5765 if (!internal_argval)
5766 RExC_seen |= REG_SEEN_VERBARG;
5767 } else if ( start_arg ) {
5768 vFAIL3("Verb pattern '%.*s' may not have an argument",
5769 verb_len, start_verb);
5770 } else {
5771 ret = reg_node(pRExC_state, op);
5772 }
5773 nextchar(pRExC_state);
5774 return ret;
5775 } else
fac92740 5776 if (*RExC_parse == '?') { /* (?...) */
6136c704 5777 bool is_logical = 0;
a28509cc 5778 const char * const seqstart = RExC_parse;
fb85c044 5779 bool has_use_defaults = FALSE;
ca9dfc88 5780
830247a4
IZ
5781 RExC_parse++;
5782 paren = *RExC_parse++;
c277df42 5783 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5784 switch (paren) {
894be9b7 5785
1f1031fe
YO
5786 case 'P': /* (?P...) variants for those used to PCRE/Python */
5787 paren = *RExC_parse++;
5788 if ( paren == '<') /* (?P<...>) named capture */
5789 goto named_capture;
5790 else if (paren == '>') { /* (?P>name) named recursion */
5791 goto named_recursion;
5792 }
5793 else if (paren == '=') { /* (?P=...) named backref */
5794 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5795 you change this make sure you change that */
5796 char* name_start = RExC_parse;
5797 U32 num = 0;
5798 SV *sv_dat = reg_scan_name(pRExC_state,
5799 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5800 if (RExC_parse == name_start || *RExC_parse != ')')
5801 vFAIL2("Sequence %.3s... not terminated",parse_start);
5802
5803 if (!SIZE_ONLY) {
5804 num = add_data( pRExC_state, 1, "S" );
5805 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5806 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5807 }
5808 RExC_sawback = 1;
5809 ret = reganode(pRExC_state,
5810 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5811 num);
5812 *flagp |= HASWIDTH;
5813
5814 Set_Node_Offset(ret, parse_start+1);
5815 Set_Node_Cur_Length(ret); /* MJD */
5816
5817 nextchar(pRExC_state);
5818 return ret;
5819 }
57b84237
YO
5820 RExC_parse++;
5821 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5822 /*NOTREACHED*/
5823 case '<': /* (?<...) */
b81d288d 5824 if (*RExC_parse == '!')
c277df42 5825 paren = ',';
0a4db386 5826 else if (*RExC_parse != '=')
1f1031fe 5827 named_capture:
0a4db386 5828 { /* (?<...>) */
81714fb9 5829 char *name_start;
894be9b7 5830 SV *svname;
81714fb9
YO
5831 paren= '>';
5832 case '\'': /* (?'...') */
5833 name_start= RExC_parse;
0a4db386
YO
5834 svname = reg_scan_name(pRExC_state,
5835 SIZE_ONLY ? /* reverse test from the others */
5836 REG_RSN_RETURN_NAME :
5837 REG_RSN_RETURN_NULL);
57b84237
YO
5838 if (RExC_parse == name_start) {
5839 RExC_parse++;
5840 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5841 /*NOTREACHED*/
5842 }
81714fb9
YO
5843 if (*RExC_parse != paren)
5844 vFAIL2("Sequence (?%c... not terminated",
5845 paren=='>' ? '<' : paren);
5846 if (SIZE_ONLY) {
e62cc96a
YO
5847 HE *he_str;
5848 SV *sv_dat = NULL;
894be9b7
YO
5849 if (!svname) /* shouldnt happen */
5850 Perl_croak(aTHX_
5851 "panic: reg_scan_name returned NULL");
81714fb9
YO
5852 if (!RExC_paren_names) {
5853 RExC_paren_names= newHV();
ad64d0ec 5854 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5855#ifdef DEBUGGING
5856 RExC_paren_name_list= newAV();
ad64d0ec 5857 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5858#endif
81714fb9
YO
5859 }
5860 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5861 if ( he_str )
81714fb9 5862 sv_dat = HeVAL(he_str);
e62cc96a 5863 if ( ! sv_dat ) {
81714fb9 5864 /* croak baby croak */
e62cc96a
YO
5865 Perl_croak(aTHX_
5866 "panic: paren_name hash element allocation failed");
5867 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5868 /* (?|...) can mean we have dupes so scan to check
5869 its already been stored. Maybe a flag indicating
5870 we are inside such a construct would be useful,
5871 but the arrays are likely to be quite small, so
5872 for now we punt -- dmq */
5873 IV count = SvIV(sv_dat);
5874 I32 *pv = (I32*)SvPVX(sv_dat);
5875 IV i;
5876 for ( i = 0 ; i < count ; i++ ) {
5877 if ( pv[i] == RExC_npar ) {
5878 count = 0;
5879 break;
5880 }
5881 }
5882 if ( count ) {
5883 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5884 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5885 pv[count] = RExC_npar;
3a92e6ae 5886 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5887 }
81714fb9
YO
5888 } else {
5889 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5890 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5891 SvIOK_on(sv_dat);
3ec35e0f 5892 SvIV_set(sv_dat, 1);
e62cc96a 5893 }
1f1031fe
YO
5894#ifdef DEBUGGING
5895 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5896 SvREFCNT_dec(svname);
5897#endif
e62cc96a 5898
81714fb9
YO
5899 /*sv_dump(sv_dat);*/
5900 }
5901 nextchar(pRExC_state);
5902 paren = 1;
5903 goto capturing_parens;
5904 }
5905 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5906 RExC_parse++;
fac92740 5907 case '=': /* (?=...) */
89c6a13e 5908 RExC_seen_zerolen++;
5c3fa2e7 5909 break;
fac92740 5910 case '!': /* (?!...) */
830247a4 5911 RExC_seen_zerolen++;
e2e6a0f1
YO
5912 if (*RExC_parse == ')') {
5913 ret=reg_node(pRExC_state, OPFAIL);
5914 nextchar(pRExC_state);
5915 return ret;
5916 }
594d7033
YO
5917 break;
5918 case '|': /* (?|...) */
5919 /* branch reset, behave like a (?:...) except that
5920 buffers in alternations share the same numbers */
5921 paren = ':';
5922 after_freeze = freeze_paren = RExC_npar;
5923 break;
fac92740
MJD
5924 case ':': /* (?:...) */
5925 case '>': /* (?>...) */
a0d0e21e 5926 break;
fac92740
MJD
5927 case '$': /* (?$...) */
5928 case '@': /* (?@...) */
8615cb43 5929 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5930 break;
fac92740 5931 case '#': /* (?#...) */
830247a4
IZ
5932 while (*RExC_parse && *RExC_parse != ')')
5933 RExC_parse++;
5934 if (*RExC_parse != ')')
c277df42 5935 FAIL("Sequence (?#... not terminated");
830247a4 5936 nextchar(pRExC_state);
a0d0e21e
LW
5937 *flagp = TRYAGAIN;
5938 return NULL;
894be9b7
YO
5939 case '0' : /* (?0) */
5940 case 'R' : /* (?R) */
5941 if (*RExC_parse != ')')
6bda09f9 5942 FAIL("Sequence (?R) not terminated");
1a147d38 5943 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5944 *flagp |= POSTPONED;
7f69552c
YO
5945 nextchar(pRExC_state);
5946 return ret;
5947 /*notreached*/
894be9b7
YO
5948 { /* named and numeric backreferences */
5949 I32 num;
894be9b7
YO
5950 case '&': /* (?&NAME) */
5951 parse_start = RExC_parse - 1;
1f1031fe 5952 named_recursion:
894be9b7 5953 {
0a4db386
YO
5954 SV *sv_dat = reg_scan_name(pRExC_state,
5955 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5956 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5957 }
5958 goto gen_recurse_regop;
5959 /* NOT REACHED */
542fa716
YO
5960 case '+':
5961 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5962 RExC_parse++;
5963 vFAIL("Illegal pattern");
5964 }
5965 goto parse_recursion;
5966 /* NOT REACHED*/
5967 case '-': /* (?-1) */
5968 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5969 RExC_parse--; /* rewind to let it be handled later */
5970 goto parse_flags;
5971 }
5972 /*FALLTHROUGH */
6bda09f9
YO
5973 case '1': case '2': case '3': case '4': /* (?1) */
5974 case '5': case '6': case '7': case '8': case '9':
5975 RExC_parse--;
542fa716 5976 parse_recursion:
894be9b7
YO
5977 num = atoi(RExC_parse);
5978 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5979 if (*RExC_parse == '-')
5980 RExC_parse++;
6bda09f9
YO
5981 while (isDIGIT(*RExC_parse))
5982 RExC_parse++;
5983 if (*RExC_parse!=')')
5984 vFAIL("Expecting close bracket");
894be9b7
YO
5985
5986 gen_recurse_regop:
542fa716
YO
5987 if ( paren == '-' ) {
5988 /*
5989 Diagram of capture buffer numbering.
5990 Top line is the normal capture buffer numbers
3b753521 5991 Bottom line is the negative indexing as from
542fa716
YO
5992 the X (the (?-2))
5993
5994 + 1 2 3 4 5 X 6 7
5995 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5996 - 5 4 3 2 1 X x x
5997
5998 */
5999 num = RExC_npar + num;
6000 if (num < 1) {
6001 RExC_parse++;
6002 vFAIL("Reference to nonexistent group");
6003 }
6004 } else if ( paren == '+' ) {
6005 num = RExC_npar + num - 1;
6006 }
6007
1a147d38 6008 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6009 if (!SIZE_ONLY) {
6010 if (num > (I32)RExC_rx->nparens) {
6011 RExC_parse++;
6012 vFAIL("Reference to nonexistent group");
6013 }
40d049e4 6014 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6015 RExC_emit++;
226de585 6016 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6017 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6018 } else {
6bda09f9 6019 RExC_size++;
6bda09f9 6020 }
0a4db386 6021 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6022 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6023 Set_Node_Offset(ret, parse_start); /* MJD */
6024
a3b492c3 6025 *flagp |= POSTPONED;
6bda09f9
YO
6026 nextchar(pRExC_state);
6027 return ret;
894be9b7
YO
6028 } /* named and numeric backreferences */
6029 /* NOT REACHED */
6030
fac92740 6031 case '?': /* (??...) */
6136c704 6032 is_logical = 1;
57b84237
YO
6033 if (*RExC_parse != '{') {
6034 RExC_parse++;
6035 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6036 /*NOTREACHED*/
6037 }
a3b492c3 6038 *flagp |= POSTPONED;
830247a4 6039 paren = *RExC_parse++;
0f5d15d6 6040 /* FALL THROUGH */
fac92740 6041 case '{': /* (?{...}) */
c277df42 6042 {
2eccd3b2
NC
6043 I32 count = 1;
6044 U32 n = 0;
c277df42 6045 char c;
830247a4 6046 char *s = RExC_parse;
c277df42 6047
830247a4
IZ
6048 RExC_seen_zerolen++;
6049 RExC_seen |= REG_SEEN_EVAL;
6050 while (count && (c = *RExC_parse)) {
6136c704
AL
6051 if (c == '\\') {
6052 if (RExC_parse[1])
6053 RExC_parse++;
6054 }
b81d288d 6055 else if (c == '{')
c277df42 6056 count++;
b81d288d 6057 else if (c == '}')
c277df42 6058 count--;
830247a4 6059 RExC_parse++;
c277df42 6060 }
6136c704 6061 if (*RExC_parse != ')') {
b81d288d 6062 RExC_parse = s;
b45f050a
JF
6063 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6064 }
c277df42 6065 if (!SIZE_ONLY) {
f3548bdc 6066 PAD *pad;
6136c704
AL
6067 OP_4tree *sop, *rop;
6068 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6069
569233ed
SB
6070 ENTER;
6071 Perl_save_re_context(aTHX);
f3548bdc 6072 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
6073 sop->op_private |= OPpREFCOUNTED;
6074 /* re_dup will OpREFCNT_inc */
6075 OpREFCNT_set(sop, 1);
569233ed 6076 LEAVE;
c277df42 6077
830247a4 6078 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6079 RExC_rxi->data->data[n] = (void*)rop;
6080 RExC_rxi->data->data[n+1] = (void*)sop;
6081 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6082 SvREFCNT_dec(sv);
a0ed51b3 6083 }
e24b16f9 6084 else { /* First pass */
830247a4 6085 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6086 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6087 /* No compiled RE interpolated, has runtime
6088 components ===> unsafe. */
6089 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6090 if (PL_tainting && PL_tainted)
cc6b7395 6091 FAIL("Eval-group in insecure regular expression");
54df2634 6092#if PERL_VERSION > 8
923e4eb5 6093 if (IN_PERL_COMPILETIME)
b5c19bd7 6094 PL_cv_has_eval = 1;
54df2634 6095#endif
c277df42 6096 }
b5c19bd7 6097
830247a4 6098 nextchar(pRExC_state);
6136c704 6099 if (is_logical) {
830247a4 6100 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6101 if (!SIZE_ONLY)
6102 ret->flags = 2;
3dab1dad 6103 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6104 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6105 return ret;
6106 }
ccb2c380
MP
6107 ret = reganode(pRExC_state, EVAL, n);
6108 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6109 Set_Node_Offset(ret, parse_start);
6110 return ret;
c277df42 6111 }
fac92740 6112 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6113 {
0a4db386 6114 int is_define= 0;
fac92740 6115 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6116 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6117 || RExC_parse[1] == '<'
830247a4 6118 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6119 I32 flag;
6120
830247a4 6121 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6122 if (!SIZE_ONLY)
6123 ret->flags = 1;
3dab1dad 6124 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6125 goto insert_if;
b81d288d 6126 }
a0ed51b3 6127 }
0a4db386
YO
6128 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6129 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6130 {
6131 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6132 char *name_start= RExC_parse++;
2eccd3b2 6133 U32 num = 0;
0a4db386
YO
6134 SV *sv_dat=reg_scan_name(pRExC_state,
6135 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6136 if (RExC_parse == name_start || *RExC_parse != ch)
6137 vFAIL2("Sequence (?(%c... not terminated",
6138 (ch == '>' ? '<' : ch));
6139 RExC_parse++;
6140 if (!SIZE_ONLY) {
6141 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6142 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6143 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6144 }
6145 ret = reganode(pRExC_state,NGROUPP,num);
6146 goto insert_if_check_paren;
6147 }
6148 else if (RExC_parse[0] == 'D' &&
6149 RExC_parse[1] == 'E' &&
6150 RExC_parse[2] == 'F' &&
6151 RExC_parse[3] == 'I' &&
6152 RExC_parse[4] == 'N' &&
6153 RExC_parse[5] == 'E')
6154 {
6155 ret = reganode(pRExC_state,DEFINEP,0);
6156 RExC_parse +=6 ;
6157 is_define = 1;
6158 goto insert_if_check_paren;
6159 }
6160 else if (RExC_parse[0] == 'R') {
6161 RExC_parse++;
6162 parno = 0;
6163 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6164 parno = atoi(RExC_parse++);
6165 while (isDIGIT(*RExC_parse))
6166 RExC_parse++;
6167 } else if (RExC_parse[0] == '&') {
6168 SV *sv_dat;
6169 RExC_parse++;
6170 sv_dat = reg_scan_name(pRExC_state,
6171 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6172 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6173 }
1a147d38 6174 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6175 goto insert_if_check_paren;
6176 }
830247a4 6177 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6178 /* (?(1)...) */
6136c704 6179 char c;
830247a4 6180 parno = atoi(RExC_parse++);
c277df42 6181
830247a4
IZ
6182 while (isDIGIT(*RExC_parse))
6183 RExC_parse++;
fac92740 6184 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6185
0a4db386 6186 insert_if_check_paren:
830247a4 6187 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6188 vFAIL("Switch condition not recognized");
c277df42 6189 insert_if:
3dab1dad
YO
6190 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6191 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6192 if (br == NULL)
830247a4 6193 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6194 else
3dab1dad 6195 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6196 c = *nextchar(pRExC_state);
d1b80229
IZ
6197 if (flags&HASWIDTH)
6198 *flagp |= HASWIDTH;
c277df42 6199 if (c == '|') {
0a4db386
YO
6200 if (is_define)
6201 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6202 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6203 regbranch(pRExC_state, &flags, 1,depth+1);
6204 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6205 if (flags&HASWIDTH)
6206 *flagp |= HASWIDTH;
830247a4 6207 c = *nextchar(pRExC_state);
a0ed51b3
LW
6208 }
6209 else
c277df42
IZ
6210 lastbr = NULL;
6211 if (c != ')')
8615cb43 6212 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6213 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6214 REGTAIL(pRExC_state, br, ender);
c277df42 6215 if (lastbr) {
3dab1dad
YO
6216 REGTAIL(pRExC_state, lastbr, ender);
6217 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6218 }
6219 else
3dab1dad 6220 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6221 RExC_size++; /* XXX WHY do we need this?!!
6222 For large programs it seems to be required
6223 but I can't figure out why. -- dmq*/
c277df42 6224 return ret;
a0ed51b3
LW
6225 }
6226 else {
830247a4 6227 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6228 }
6229 }
1b1626e4 6230 case 0:
830247a4 6231 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6232 vFAIL("Sequence (? incomplete");
1b1626e4 6233 break;
85508812
KW
6234 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6235 that follow */
fb85c044
KW
6236 has_use_defaults = TRUE;
6237 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9de15fec 6238 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
fb85c044 6239 goto parse_flags;
a0d0e21e 6240 default:
cde0cee5
YO
6241 --RExC_parse;
6242 parse_flags: /* (?i) */
6243 {
6244 U32 posflags = 0, negflags = 0;
6245 U32 *flagsp = &posflags;
9de15fec 6246 bool has_charset_modifier = 0;
cde0cee5
YO
6247
6248 while (*RExC_parse) {
6249 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6250 /* (?g), (?gc) and (?o) are useless here
6251 and must be globally applied -- japhy */
cde0cee5
YO
6252 switch (*RExC_parse) {
6253 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6254 case LOCALE_PAT_MOD:
6255 if (has_charset_modifier || flagsp == &negflags) {
6256 goto fail_modifiers;
6257 }
8cc86590
KW
6258 posflags |= RXf_PMf_LOCALE;
6259 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6260 has_charset_modifier = 1;
6261 break;
6262 case UNICODE_PAT_MOD:
6263 if (has_charset_modifier || flagsp == &negflags) {
6264 goto fail_modifiers;
6265 }
8cc86590
KW
6266 posflags |= RXf_PMf_UNICODE;
6267 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6268 has_charset_modifier = 1;
6269 break;
6270 case DUAL_PAT_MOD:
6271 if (has_use_defaults
6272 || has_charset_modifier
6273 || flagsp == &negflags)
6274 {
6275 goto fail_modifiers;
6276 }
8cc86590 6277 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
9de15fec
KW
6278 has_charset_modifier = 1;
6279 break;
f7819f85
A
6280 case ONCE_PAT_MOD: /* 'o' */
6281 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6282 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6283 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6284 if (! (wastedflags & wflagbit) ) {
6285 wastedflags |= wflagbit;
6286 vWARN5(
6287 RExC_parse + 1,
6288 "Useless (%s%c) - %suse /%c modifier",
6289 flagsp == &negflags ? "?-" : "?",
6290 *RExC_parse,
6291 flagsp == &negflags ? "don't " : "",
6292 *RExC_parse
6293 );
6294 }
6295 }
cde0cee5
YO
6296 break;
6297
f7819f85 6298 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6299 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6300 if (! (wastedflags & WASTED_C) ) {
6301 wastedflags |= WASTED_GC;
9d1d55b5
JP
6302 vWARN3(
6303 RExC_parse + 1,
6304 "Useless (%sc) - %suse /gc modifier",
6305 flagsp == &negflags ? "?-" : "?",
6306 flagsp == &negflags ? "don't " : ""
6307 );
6308 }
6309 }
cde0cee5 6310 break;
f7819f85 6311 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6312 if (flagsp == &negflags) {
668c081a
NC
6313 if (SIZE_ONLY)
6314 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6315 } else {
6316 *flagsp |= RXf_PMf_KEEPCOPY;
6317 }
6318 break;
6319 case '-':
3b753521 6320 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6321 * if there is a minus, it means will be trying to
6322 * re-specify a default which is an error */
6323 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6324 fail_modifiers:
57b84237
YO
6325 RExC_parse++;
6326 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6327 /*NOTREACHED*/
6328 }
cde0cee5
YO
6329 flagsp = &negflags;
6330 wastedflags = 0; /* reset so (?g-c) warns twice */
6331 break;
6332 case ':':
6333 paren = ':';
6334 /*FALLTHROUGH*/
6335 case ')':
6336 RExC_flags |= posflags;
6337 RExC_flags &= ~negflags;
f7819f85
A
6338 if (paren != ':') {
6339 oregflags |= posflags;
6340 oregflags &= ~negflags;
6341 }
cde0cee5
YO
6342 nextchar(pRExC_state);
6343 if (paren != ':') {
6344 *flagp = TRYAGAIN;
6345 return NULL;
6346 } else {
6347 ret = NULL;
6348 goto parse_rest;
6349 }
6350 /*NOTREACHED*/
6351 default:
cde0cee5
YO
6352 RExC_parse++;
6353 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6354 /*NOTREACHED*/
6355 }
830247a4 6356 ++RExC_parse;
48c036b1 6357 }
cde0cee5 6358 }} /* one for the default block, one for the switch */
a0d0e21e 6359 }
fac92740 6360 else { /* (...) */
81714fb9 6361 capturing_parens:
830247a4
IZ
6362 parno = RExC_npar;
6363 RExC_npar++;
e2e6a0f1 6364
830247a4 6365 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6366 if (!SIZE_ONLY ){
6367 if (!RExC_nestroot)
6368 RExC_nestroot = parno;
c009da3d
YO
6369 if (RExC_seen & REG_SEEN_RECURSE
6370 && !RExC_open_parens[parno-1])
6371 {
e2e6a0f1 6372 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6373 "Setting open paren #%"IVdf" to %d\n",
6374 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6375 RExC_open_parens[parno-1]= ret;
6376 }
6bda09f9 6377 }
fac92740
MJD
6378 Set_Node_Length(ret, 1); /* MJD */
6379 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6380 is_open = 1;
a0d0e21e 6381 }
a0ed51b3 6382 }
fac92740 6383 else /* ! paren */
a0d0e21e 6384 ret = NULL;
cde0cee5
YO
6385
6386 parse_rest:
a0d0e21e 6387 /* Pick up the branches, linking them together. */
fac92740 6388 parse_start = RExC_parse; /* MJD */
3dab1dad 6389 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6390
6391 if (freeze_paren) {
6392 if (RExC_npar > after_freeze)
6393 after_freeze = RExC_npar;
6394 RExC_npar = freeze_paren;
6395 }
6396
fac92740 6397 /* branch_len = (paren != 0); */
2af232bd 6398
a0d0e21e
LW
6399 if (br == NULL)
6400 return(NULL);
830247a4
IZ
6401 if (*RExC_parse == '|') {
6402 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6403 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6404 }
fac92740 6405 else { /* MJD */
6bda09f9 6406 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6407 Set_Node_Length(br, paren != 0);
6408 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6409 }
c277df42
IZ
6410 have_branch = 1;
6411 if (SIZE_ONLY)
830247a4 6412 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6413 }
6414 else if (paren == ':') {
c277df42
IZ
6415 *flagp |= flags&SIMPLE;
6416 }
6136c704 6417 if (is_open) { /* Starts with OPEN. */
3dab1dad 6418 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6419 }
6420 else if (paren != '?') /* Not Conditional */
a0d0e21e 6421 ret = br;
8ae10a67 6422 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6423 lastbr = br;
830247a4
IZ
6424 while (*RExC_parse == '|') {
6425 if (!SIZE_ONLY && RExC_extralen) {
6426 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6427 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6428 }
6429 if (SIZE_ONLY)
830247a4
IZ
6430 RExC_extralen += 2; /* Account for LONGJMP. */
6431 nextchar(pRExC_state);
594d7033
YO
6432 if (freeze_paren) {
6433 if (RExC_npar > after_freeze)
6434 after_freeze = RExC_npar;
6435 RExC_npar = freeze_paren;
6436 }
3dab1dad 6437 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6438
a687059c 6439 if (br == NULL)
a0d0e21e 6440 return(NULL);
3dab1dad 6441 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6442 lastbr = br;
8ae10a67 6443 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6444 }
6445
c277df42
IZ
6446 if (have_branch || paren != ':') {
6447 /* Make a closing node, and hook it on the end. */
6448 switch (paren) {
6449 case ':':
830247a4 6450 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6451 break;
6452 case 1:
830247a4 6453 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6454 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6455 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6456 "Setting close paren #%"IVdf" to %d\n",
6457 (IV)parno, REG_NODE_NUM(ender)));
6458 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6459 if (RExC_nestroot == parno)
6460 RExC_nestroot = 0;
40d049e4 6461 }
fac92740
MJD
6462 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6463 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6464 break;
6465 case '<':
c277df42
IZ
6466 case ',':
6467 case '=':
6468 case '!':
c277df42 6469 *flagp &= ~HASWIDTH;
821b33a5
IZ
6470 /* FALL THROUGH */
6471 case '>':
830247a4 6472 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6473 break;
6474 case 0:
830247a4 6475 ender = reg_node(pRExC_state, END);
40d049e4
YO
6476 if (!SIZE_ONLY) {
6477 assert(!RExC_opend); /* there can only be one! */
6478 RExC_opend = ender;
6479 }
c277df42
IZ
6480 break;
6481 }
eaf3ca90 6482 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6483
9674d46a 6484 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6485 if (depth==1)
6486 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6487
c277df42 6488 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6489 for (br = ret; br; br = regnext(br)) {
6490 const U8 op = PL_regkind[OP(br)];
6491 if (op == BRANCH) {
07be1b83 6492 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6493 }
6494 else if (op == BRANCHJ) {
07be1b83 6495 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6496 }
c277df42
IZ
6497 }
6498 }
a0d0e21e 6499 }
c277df42
IZ
6500
6501 {
e1ec3a88
AL
6502 const char *p;
6503 static const char parens[] = "=!<,>";
c277df42
IZ
6504
6505 if (paren && (p = strchr(parens, paren))) {
eb160463 6506 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6507 int flag = (p - parens) > 1;
6508
6509 if (paren == '>')
6510 node = SUSPEND, flag = 0;
6bda09f9 6511 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6512 Set_Node_Cur_Length(ret);
6513 Set_Node_Offset(ret, parse_start + 1);
c277df42 6514 ret->flags = flag;
07be1b83 6515 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6516 }
a0d0e21e
LW
6517 }
6518
6519 /* Check for proper termination. */
ce3e6498 6520 if (paren) {
e2509266 6521 RExC_flags = oregflags;
830247a4
IZ
6522 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6523 RExC_parse = oregcomp_parse;
380a0633 6524 vFAIL("Unmatched (");
ce3e6498 6525 }
a0ed51b3 6526 }
830247a4
IZ
6527 else if (!paren && RExC_parse < RExC_end) {
6528 if (*RExC_parse == ')') {
6529 RExC_parse++;
380a0633 6530 vFAIL("Unmatched )");
a0ed51b3
LW
6531 }
6532 else
b45f050a 6533 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6534 /* NOTREACHED */
6535 }
594d7033
YO
6536 if (after_freeze)
6537 RExC_npar = after_freeze;
a0d0e21e 6538 return(ret);
a687059c
LW
6539}
6540
6541/*
6542 - regbranch - one alternative of an | operator
6543 *
6544 * Implements the concatenation operator.
6545 */
76e3520e 6546STATIC regnode *
3dab1dad 6547S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6548{
97aff369 6549 dVAR;
c277df42
IZ
6550 register regnode *ret;
6551 register regnode *chain = NULL;
6552 register regnode *latest;
6553 I32 flags = 0, c = 0;
3dab1dad 6554 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6555
6556 PERL_ARGS_ASSERT_REGBRANCH;
6557
3dab1dad 6558 DEBUG_PARSE("brnc");
02daf0ab 6559
b81d288d 6560 if (first)
c277df42
IZ
6561 ret = NULL;
6562 else {
b81d288d 6563 if (!SIZE_ONLY && RExC_extralen)
830247a4 6564 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6565 else {
830247a4 6566 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6567 Set_Node_Length(ret, 1);
6568 }
c277df42
IZ
6569 }
6570
b81d288d 6571 if (!first && SIZE_ONLY)
830247a4 6572 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6573
c277df42 6574 *flagp = WORST; /* Tentatively. */
a0d0e21e 6575
830247a4
IZ
6576 RExC_parse--;
6577 nextchar(pRExC_state);
6578 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6579 flags &= ~TRYAGAIN;
3dab1dad 6580 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6581 if (latest == NULL) {
6582 if (flags & TRYAGAIN)
6583 continue;
6584 return(NULL);
a0ed51b3
LW
6585 }
6586 else if (ret == NULL)
c277df42 6587 ret = latest;
8ae10a67 6588 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6589 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6590 *flagp |= flags&SPSTART;
6591 else {
830247a4 6592 RExC_naughty++;
3dab1dad 6593 REGTAIL(pRExC_state, chain, latest);
a687059c 6594 }
a0d0e21e 6595 chain = latest;
c277df42
IZ
6596 c++;
6597 }
6598 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6599 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6600 if (ret == NULL)
6601 ret = chain;
6602 }
6603 if (c == 1) {
6604 *flagp |= flags&SIMPLE;
a0d0e21e 6605 }
a687059c 6606
d4c19fe8 6607 return ret;
a687059c
LW
6608}
6609
6610/*
6611 - regpiece - something followed by possible [*+?]
6612 *
6613 * Note that the branching code sequences used for ? and the general cases
6614 * of * and + are somewhat optimized: they use the same NOTHING node as
6615 * both the endmarker for their branch list and the body of the last branch.
6616 * It might seem that this node could be dispensed with entirely, but the
6617 * endmarker role is not redundant.
6618 */
76e3520e 6619STATIC regnode *
3dab1dad 6620S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6621{
97aff369 6622 dVAR;
c277df42 6623 register regnode *ret;
a0d0e21e
LW
6624 register char op;
6625 register char *next;
6626 I32 flags;
1df70142 6627 const char * const origparse = RExC_parse;
a0d0e21e 6628 I32 min;
c277df42 6629 I32 max = REG_INFTY;
fac92740 6630 char *parse_start;
10edeb5d 6631 const char *maxpos = NULL;
3dab1dad 6632 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6633
6634 PERL_ARGS_ASSERT_REGPIECE;
6635
3dab1dad 6636 DEBUG_PARSE("piec");
a0d0e21e 6637
3dab1dad 6638 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6639 if (ret == NULL) {
6640 if (flags & TRYAGAIN)
6641 *flagp |= TRYAGAIN;
6642 return(NULL);
6643 }
6644
830247a4 6645 op = *RExC_parse;
a0d0e21e 6646
830247a4 6647 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6648 maxpos = NULL;
fac92740 6649 parse_start = RExC_parse; /* MJD */
830247a4 6650 next = RExC_parse + 1;
a0d0e21e
LW
6651 while (isDIGIT(*next) || *next == ',') {
6652 if (*next == ',') {
6653 if (maxpos)
6654 break;
6655 else
6656 maxpos = next;
a687059c 6657 }
a0d0e21e
LW
6658 next++;
6659 }
6660 if (*next == '}') { /* got one */
6661 if (!maxpos)
6662 maxpos = next;
830247a4
IZ
6663 RExC_parse++;
6664 min = atoi(RExC_parse);
a0d0e21e
LW
6665 if (*maxpos == ',')
6666 maxpos++;
6667 else
830247a4 6668 maxpos = RExC_parse;
a0d0e21e
LW
6669 max = atoi(maxpos);
6670 if (!max && *maxpos != '0')
c277df42
IZ
6671 max = REG_INFTY; /* meaning "infinity" */
6672 else if (max >= REG_INFTY)
8615cb43 6673 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6674 RExC_parse = next;
6675 nextchar(pRExC_state);
a0d0e21e
LW
6676
6677 do_curly:
6678 if ((flags&SIMPLE)) {
830247a4 6679 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6680 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6681 Set_Node_Offset(ret, parse_start+1); /* MJD */
6682 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6683 }
6684 else {
3dab1dad 6685 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6686
6687 w->flags = 0;
3dab1dad 6688 REGTAIL(pRExC_state, ret, w);
830247a4 6689 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6690 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6691 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6692 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6693 }
6bda09f9 6694 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6695 /* MJD hk */
6696 Set_Node_Offset(ret, parse_start+1);
2af232bd 6697 Set_Node_Length(ret,
fac92740 6698 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6699
830247a4 6700 if (!SIZE_ONLY && RExC_extralen)
c277df42 6701 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6702 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6703 if (SIZE_ONLY)
830247a4
IZ
6704 RExC_whilem_seen++, RExC_extralen += 3;
6705 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6706 }
c277df42 6707 ret->flags = 0;
a0d0e21e
LW
6708
6709 if (min > 0)
821b33a5
IZ
6710 *flagp = WORST;
6711 if (max > 0)
6712 *flagp |= HASWIDTH;
8fa23287 6713 if (max < min)
8615cb43 6714 vFAIL("Can't do {n,m} with n > m");
c277df42 6715 if (!SIZE_ONLY) {
eb160463
GS
6716 ARG1_SET(ret, (U16)min);
6717 ARG2_SET(ret, (U16)max);
a687059c 6718 }
a687059c 6719
a0d0e21e 6720 goto nest_check;
a687059c 6721 }
a0d0e21e 6722 }
a687059c 6723
a0d0e21e
LW
6724 if (!ISMULT1(op)) {
6725 *flagp = flags;
a687059c 6726 return(ret);
a0d0e21e 6727 }
bb20fd44 6728
c277df42 6729#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6730
6731 /* if this is reinstated, don't forget to put this back into perldiag:
6732
6733 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6734
6735 (F) The part of the regexp subject to either the * or + quantifier
6736 could match an empty string. The {#} shows in the regular
6737 expression about where the problem was discovered.
6738
6739 */
6740
bb20fd44 6741 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6742 vFAIL("Regexp *+ operand could be empty");
b81d288d 6743#endif
bb20fd44 6744
fac92740 6745 parse_start = RExC_parse;
830247a4 6746 nextchar(pRExC_state);
a0d0e21e 6747
821b33a5 6748 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6749
6750 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6751 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6752 ret->flags = 0;
830247a4 6753 RExC_naughty += 4;
a0d0e21e
LW
6754 }
6755 else if (op == '*') {
6756 min = 0;
6757 goto do_curly;
a0ed51b3
LW
6758 }
6759 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6760 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6761 ret->flags = 0;
830247a4 6762 RExC_naughty += 3;
a0d0e21e
LW
6763 }
6764 else if (op == '+') {
6765 min = 1;
6766 goto do_curly;
a0ed51b3
LW
6767 }
6768 else if (op == '?') {
a0d0e21e
LW
6769 min = 0; max = 1;
6770 goto do_curly;
6771 }
6772 nest_check:
668c081a
NC
6773 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6774 ckWARN3reg(RExC_parse,
6775 "%.*s matches null string many times",
6776 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6777 origparse);
a0d0e21e
LW
6778 }
6779
b9b4dddf 6780 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6781 nextchar(pRExC_state);
6bda09f9 6782 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6783 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6784 }
b9b4dddf
YO
6785#ifndef REG_ALLOW_MINMOD_SUSPEND
6786 else
6787#endif
6788 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6789 regnode *ender;
6790 nextchar(pRExC_state);
6791 ender = reg_node(pRExC_state, SUCCEED);
6792 REGTAIL(pRExC_state, ret, ender);
6793 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6794 ret->flags = 0;
6795 ender = reg_node(pRExC_state, TAIL);
6796 REGTAIL(pRExC_state, ret, ender);
6797 /*ret= ender;*/
6798 }
6799
6800 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6801 RExC_parse++;
b45f050a
JF
6802 vFAIL("Nested quantifiers");
6803 }
a0d0e21e
LW
6804
6805 return(ret);
a687059c
LW
6806}
6807
fc8cd66c
YO
6808
6809/* reg_namedseq(pRExC_state,UVp)
6810
6811 This is expected to be called by a parser routine that has
afefe6bf 6812 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6813 expected to point at the first char following the N at the time
6814 of the call.
ff3f963a
KW
6815
6816 The \N may be inside (indicated by valuep not being NULL) or outside a
6817 character class.
6818
6819 \N may begin either a named sequence, or if outside a character class, mean
6820 to match a non-newline. For non single-quoted regexes, the tokenizer has
6821 attempted to decide which, and in the case of a named sequence converted it
6822 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6823 where c1... are the characters in the sequence. For single-quoted regexes,
6824 the tokenizer passes the \N sequence through unchanged; this code will not
6825 attempt to determine this nor expand those. The net effect is that if the
6826 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6827 signals that this \N occurrence means to match a non-newline.
6828
6829 Only the \N{U+...} form should occur in a character class, for the same
6830 reason that '.' inside a character class means to just match a period: it
6831 just doesn't make sense.
fc8cd66c
YO
6832
6833 If valuep is non-null then it is assumed that we are parsing inside
6834 of a charclass definition and the first codepoint in the resolved
6835 string is returned via *valuep and the routine will return NULL.
6836 In this mode if a multichar string is returned from the charnames
ff3f963a 6837 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6838 sequence will be examined. If the string returned is zero length
6839 then the value of *valuep is undefined and NON-NULL will
6840 be returned to indicate failure. (This will NOT be a valid pointer
6841 to a regnode.)
6842
ff3f963a
KW
6843 If valuep is null then it is assumed that we are parsing normal text and a
6844 new EXACT node is inserted into the program containing the resolved string,
6845 and a pointer to the new node is returned. But if the string is zero length
6846 a NOTHING node is emitted instead.
afefe6bf 6847
fc8cd66c 6848 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6849 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6850 */
6851STATIC regnode *
afefe6bf 6852S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6853{
c3c41406 6854 char * endbrace; /* '}' following the name */
fc8cd66c 6855 regnode *ret = NULL;
ff3f963a
KW
6856#ifdef DEBUGGING
6857 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6858#endif
c3c41406 6859 char* p;
ff3f963a
KW
6860
6861 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6862
6863 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6864
6865 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6866
6867 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6868 * modifier. The other meaning does not */
6869 p = (RExC_flags & RXf_PMf_EXTENDED)
6870 ? regwhite( pRExC_state, RExC_parse )
6871 : RExC_parse;
7918f24d 6872
ff3f963a 6873 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6874 * [^\n]. The former is assumed when it can't be the latter. */
6875 if (*p != '{' || regcurly(p)) {
6876 RExC_parse = p;
ff3f963a 6877 if (valuep) {
afefe6bf 6878 /* no bare \N in a charclass */
ff3f963a
KW
6879 vFAIL("\\N in a character class must be a named character: \\N{...}");
6880 }
afefe6bf
RGS
6881 nextchar(pRExC_state);
6882 ret = reg_node(pRExC_state, REG_ANY);
6883 *flagp |= HASWIDTH|SIMPLE;
6884 RExC_naughty++;
6885 RExC_parse--;
6886 Set_Node_Length(ret, 1); /* MJD */
6887 return ret;
fc8cd66c 6888 }
a4893424 6889
c3c41406
KW
6890 /* Here, we have decided it should be a named sequence */
6891
6892 /* The test above made sure that the next real character is a '{', but
6893 * under the /x modifier, it could be separated by space (or a comment and
6894 * \n) and this is not allowed (for consistency with \x{...} and the
6895 * tokenizer handling of \N{NAME}). */
6896 if (*RExC_parse != '{') {
6897 vFAIL("Missing braces on \\N{}");
6898 }
6899
ff3f963a 6900 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6901
6902 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6903 || ! (endbrace == RExC_parse /* nothing between the {} */
6904 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6905 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6906 {
6907 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6908 vFAIL("\\N{NAME} must be resolved by the lexer");
6909 }
6910
ff3f963a
KW
6911 if (endbrace == RExC_parse) { /* empty: \N{} */
6912 if (! valuep) {
6913 RExC_parse = endbrace + 1;
6914 return reg_node(pRExC_state,NOTHING);
a4893424 6915 }
fc8cd66c 6916
ff3f963a
KW
6917 if (SIZE_ONLY) {
6918 ckWARNreg(RExC_parse,
6919 "Ignoring zero length \\N{} in character class"
6920 );
6921 RExC_parse = endbrace + 1;
6922 }
6923 *valuep = 0;
6924 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6925 }
ff3f963a 6926
62fed28b 6927 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6928 RExC_parse += 2; /* Skip past the 'U+' */
6929
6930 if (valuep) { /* In a bracketed char class */
6931 /* We only pay attention to the first char of
6932 multichar strings being returned. I kinda wonder
6933 if this makes sense as it does change the behaviour
6934 from earlier versions, OTOH that behaviour was broken
6935 as well. XXX Solution is to recharacterize as
6936 [rest-of-class]|multi1|multi2... */
6937
6938 STRLEN length_of_hex;
6939 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6940 | PERL_SCAN_DISALLOW_PREFIX
6941 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6942
37820adc
KW
6943 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6944 if (endchar < endbrace) {
ff3f963a
KW
6945 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6946 }
ff3f963a
KW
6947
6948 length_of_hex = (STRLEN)(endchar - RExC_parse);
6949 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6950
6951 /* The tokenizer should have guaranteed validity, but it's possible to
6952 * bypass it by using single quoting, so check */
c3c41406
KW
6953 if (length_of_hex == 0
6954 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6955 {
6956 RExC_parse += length_of_hex; /* Includes all the valid */
6957 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6958 ? UTF8SKIP(RExC_parse)
6959 : 1;
6960 /* Guard against malformed utf8 */
6961 if (RExC_parse >= endchar) RExC_parse = endchar;
6962 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6963 }
6964
6965 RExC_parse = endbrace + 1;
6966 if (endchar == endbrace) return NULL;
6967
6968 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6969 }
ff3f963a
KW
6970 else { /* Not a char class */
6971 char *s; /* String to put in generated EXACT node */
fda99bee 6972 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
6973 char *endchar; /* Points to '.' or '}' ending cur char in the input
6974 stream */
6975
6976 ret = reg_node(pRExC_state,
6977 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6978 s= STRING(ret);
6979
6980 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6981 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 6982 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
6983 * limit are dropped. The limit could be relaxed should it become
6984 * desirable by reparsing this as (?:\N{NAME}), so could generate
6985 * multiple EXACT nodes, as is done for just regular input. But this
6986 * is primarily a named character, and not intended to be a huge long
6987 * string, so 255 bytes should be good enough */
6988 while (1) {
c3c41406 6989 STRLEN length_of_hex;
ff3f963a
KW
6990 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6991 | PERL_SCAN_DISALLOW_PREFIX
6992 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6993 UV cp; /* Ord of current character */
6994
6995 /* Code points are separated by dots. If none, there is only one
6996 * code point, and is terminated by the brace */
37820adc 6997 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
6998
6999 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7000 length_of_hex = (STRLEN)(endchar - RExC_parse);
7001 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7002 if ( length_of_hex == 0
7003 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7004 {
c3c41406
KW
7005 RExC_parse += length_of_hex; /* Includes all the valid */
7006 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7007 ? UTF8SKIP(RExC_parse)
7008 : 1;
7009 /* Guard against malformed utf8 */
7010 if (RExC_parse >= endchar) RExC_parse = endchar;
7011 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7012 }
7013
7014 if (! FOLD) { /* Not folding, just append to the string */
7015 STRLEN unilen;
7016
7017 /* Quit before adding this character if would exceed limit */
7018 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7019
ff3f963a
KW
7020 unilen = reguni(pRExC_state, cp, s);
7021 if (unilen > 0) {
7022 s += unilen;
7023 len += unilen;
7024 }
7025 } else { /* Folding, output the folded equivalent */
7026 STRLEN foldlen,numlen;
7027 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7028 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7029
7030 /* Quit before exceeding size limit */
7031 if (len + foldlen > U8_MAX) break;
7032
7033 for (foldbuf = tmpbuf;
7034 foldlen;
7035 foldlen -= numlen)
7036 {
7037 cp = utf8_to_uvchr(foldbuf, &numlen);
7038 if (numlen > 0) {
7039 const STRLEN unilen = reguni(pRExC_state, cp, s);
7040 s += unilen;
7041 len += unilen;
7042 /* In EBCDIC the numlen and unilen can differ. */
7043 foldbuf += numlen;
7044 if (numlen >= foldlen)
7045 break;
7046 }
7047 else
7048 break; /* "Can't happen." */
7049 }
7050 }
7051
7052 /* Point to the beginning of the next character in the sequence. */
7053 RExC_parse = endchar + 1;
7054
7055 /* Quit if no more characters */
7056 if (RExC_parse >= endbrace) break;
7057 }
7058
7059
7060 if (SIZE_ONLY) {
7061 if (RExC_parse < endbrace) {
7062 ckWARNreg(RExC_parse - 1,
7063 "Using just the first characters returned by \\N{}");
7064 }
7065
7066 RExC_size += STR_SZ(len);
7067 } else {
7068 STR_LEN(ret) = len;
7069 RExC_emit += STR_SZ(len);
7070 }
7071
7072 RExC_parse = endbrace + 1;
7073
7074 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7075 with malformed in t/re/pat_advanced.t */
7076 RExC_parse --;
7077 Set_Node_Cur_Length(ret); /* MJD */
7078 nextchar(pRExC_state);
7079 }
7080
7081 return ret;
fc8cd66c
YO
7082}
7083
7084
9e08bc66
TS
7085/*
7086 * reg_recode
7087 *
7088 * It returns the code point in utf8 for the value in *encp.
7089 * value: a code value in the source encoding
7090 * encp: a pointer to an Encode object
7091 *
7092 * If the result from Encode is not a single character,
7093 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7094 */
7095STATIC UV
7096S_reg_recode(pTHX_ const char value, SV **encp)
7097{
7098 STRLEN numlen = 1;
59cd0e26 7099 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7100 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7101 const STRLEN newlen = SvCUR(sv);
7102 UV uv = UNICODE_REPLACEMENT;
7103
7918f24d
NC
7104 PERL_ARGS_ASSERT_REG_RECODE;
7105
9e08bc66
TS
7106 if (newlen)
7107 uv = SvUTF8(sv)
7108 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7109 : *(U8*)s;
7110
7111 if (!newlen || numlen != newlen) {
7112 uv = UNICODE_REPLACEMENT;
c86f7df5 7113 *encp = NULL;
9e08bc66
TS
7114 }
7115 return uv;
7116}
7117
fc8cd66c 7118
a687059c
LW
7119/*
7120 - regatom - the lowest level
ee9b8eae
YO
7121
7122 Try to identify anything special at the start of the pattern. If there
7123 is, then handle it as required. This may involve generating a single regop,
7124 such as for an assertion; or it may involve recursing, such as to
7125 handle a () structure.
7126
7127 If the string doesn't start with something special then we gobble up
7128 as much literal text as we can.
7129
7130 Once we have been able to handle whatever type of thing started the
7131 sequence, we return.
7132
7133 Note: we have to be careful with escapes, as they can be both literal
7134 and special, and in the case of \10 and friends can either, depending
7135 on context. Specifically there are two seperate switches for handling
7136 escape sequences, with the one for handling literal escapes requiring
7137 a dummy entry for all of the special escapes that are actually handled
7138 by the other.
7139*/
7140
76e3520e 7141STATIC regnode *
3dab1dad 7142S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7143{
97aff369 7144 dVAR;
cbbf8932 7145 register regnode *ret = NULL;
a0d0e21e 7146 I32 flags;
45948336 7147 char *parse_start = RExC_parse;
3dab1dad
YO
7148 GET_RE_DEBUG_FLAGS_DECL;
7149 DEBUG_PARSE("atom");
a0d0e21e
LW
7150 *flagp = WORST; /* Tentatively. */
7151
7918f24d 7152 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7153
a0d0e21e 7154tryagain:
f9a79580 7155 switch ((U8)*RExC_parse) {
a0d0e21e 7156 case '^':
830247a4
IZ
7157 RExC_seen_zerolen++;
7158 nextchar(pRExC_state);
bbe252da 7159 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7160 ret = reg_node(pRExC_state, MBOL);
bbe252da 7161 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7162 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7163 else
830247a4 7164 ret = reg_node(pRExC_state, BOL);
fac92740 7165 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7166 break;
7167 case '$':
830247a4 7168 nextchar(pRExC_state);
b81d288d 7169 if (*RExC_parse)
830247a4 7170 RExC_seen_zerolen++;
bbe252da 7171 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7172 ret = reg_node(pRExC_state, MEOL);
bbe252da 7173 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7174 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7175 else
830247a4 7176 ret = reg_node(pRExC_state, EOL);
fac92740 7177 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7178 break;
7179 case '.':
830247a4 7180 nextchar(pRExC_state);
bbe252da 7181 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7182 ret = reg_node(pRExC_state, SANY);
7183 else
7184 ret = reg_node(pRExC_state, REG_ANY);
7185 *flagp |= HASWIDTH|SIMPLE;
830247a4 7186 RExC_naughty++;
fac92740 7187 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7188 break;
7189 case '[':
b45f050a 7190 {
3dab1dad
YO
7191 char * const oregcomp_parse = ++RExC_parse;
7192 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7193 if (*RExC_parse != ']') {
7194 RExC_parse = oregcomp_parse;
b45f050a
JF
7195 vFAIL("Unmatched [");
7196 }
830247a4 7197 nextchar(pRExC_state);
a0d0e21e 7198 *flagp |= HASWIDTH|SIMPLE;
fac92740 7199 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7200 break;
b45f050a 7201 }
a0d0e21e 7202 case '(':
830247a4 7203 nextchar(pRExC_state);
3dab1dad 7204 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7205 if (ret == NULL) {
bf93d4cc 7206 if (flags & TRYAGAIN) {
830247a4 7207 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7208 /* Make parent create an empty node if needed. */
7209 *flagp |= TRYAGAIN;
7210 return(NULL);
7211 }
a0d0e21e 7212 goto tryagain;
bf93d4cc 7213 }
a0d0e21e
LW
7214 return(NULL);
7215 }
a3b492c3 7216 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7217 break;
7218 case '|':
7219 case ')':
7220 if (flags & TRYAGAIN) {
7221 *flagp |= TRYAGAIN;
7222 return NULL;
7223 }
b45f050a 7224 vFAIL("Internal urp");
a0d0e21e
LW
7225 /* Supposed to be caught earlier. */
7226 break;
85afd4ae 7227 case '{':
830247a4
IZ
7228 if (!regcurly(RExC_parse)) {
7229 RExC_parse++;
85afd4ae
CS
7230 goto defchar;
7231 }
7232 /* FALL THROUGH */
a0d0e21e
LW
7233 case '?':
7234 case '+':
7235 case '*':
830247a4 7236 RExC_parse++;
b45f050a 7237 vFAIL("Quantifier follows nothing");
a0d0e21e 7238 break;
f9a79580
RGS
7239 case 0xDF:
7240 case 0xC3:
7241 case 0xCE:
a0a388a1 7242 do_foldchar:
56d400ed 7243 if (!LOC && FOLD) {
e64b1bd1 7244 U32 len,cp;
7cf3a6a3 7245 len=0; /* silence a spurious compiler warning */
56d400ed 7246 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7247 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7248 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7249 ret = reganode(pRExC_state, FOLDCHAR, cp);
7250 Set_Node_Length(ret, 1); /* MJD */
7251 nextchar(pRExC_state); /* kill whitespace under /x */
7252 return ret;
7253 }
7254 }
7255 goto outer_default;
a0d0e21e 7256 case '\\':
ee9b8eae
YO
7257 /* Special Escapes
7258
7259 This switch handles escape sequences that resolve to some kind
7260 of special regop and not to literal text. Escape sequnces that
7261 resolve to literal text are handled below in the switch marked
7262 "Literal Escapes".
7263
7264 Every entry in this switch *must* have a corresponding entry
7265 in the literal escape switch. However, the opposite is not
7266 required, as the default for this switch is to jump to the
7267 literal text handling code.
7268 */
a0a388a1
YO
7269 switch ((U8)*++RExC_parse) {
7270 case 0xDF:
7271 case 0xC3:
7272 case 0xCE:
7273 goto do_foldchar;
ee9b8eae 7274 /* Special Escapes */
a0d0e21e 7275 case 'A':
830247a4
IZ
7276 RExC_seen_zerolen++;
7277 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7278 *flagp |= SIMPLE;
ee9b8eae 7279 goto finish_meta_pat;
a0d0e21e 7280 case 'G':
830247a4
IZ
7281 ret = reg_node(pRExC_state, GPOS);
7282 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7283 *flagp |= SIMPLE;
ee9b8eae
YO
7284 goto finish_meta_pat;
7285 case 'K':
7286 RExC_seen_zerolen++;
7287 ret = reg_node(pRExC_state, KEEPS);
7288 *flagp |= SIMPLE;
37923168
RGS
7289 /* XXX:dmq : disabling in-place substitution seems to
7290 * be necessary here to avoid cases of memory corruption, as
7291 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7292 */
7293 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7294 goto finish_meta_pat;
a0d0e21e 7295 case 'Z':
830247a4 7296 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7297 *flagp |= SIMPLE;
a1917ab9 7298 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7299 goto finish_meta_pat;
b85d18e9 7300 case 'z':
830247a4 7301 ret = reg_node(pRExC_state, EOS);
b85d18e9 7302 *flagp |= SIMPLE;
830247a4 7303 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7304 goto finish_meta_pat;
4a2d328f 7305 case 'C':
f33976b4
DB
7306 ret = reg_node(pRExC_state, CANY);
7307 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7308 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7309 goto finish_meta_pat;
a0ed51b3 7310 case 'X':
830247a4 7311 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7312 *flagp |= HASWIDTH;
ee9b8eae 7313 goto finish_meta_pat;
a0d0e21e 7314 case 'w':
a12cf05f
KW
7315 if (LOC) {
7316 ret = reg_node(pRExC_state, (U8)(ALNUML));
7317 } else {
7318 ret = reg_node(pRExC_state, (U8)(ALNUM));
7319 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7320 }
a0d0e21e 7321 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7322 goto finish_meta_pat;
a0d0e21e 7323 case 'W':
a12cf05f
KW
7324 if (LOC) {
7325 ret = reg_node(pRExC_state, (U8)(NALNUML));
7326 } else {
7327 ret = reg_node(pRExC_state, (U8)(NALNUM));
7328 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7329 }
a0d0e21e 7330 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7331 goto finish_meta_pat;
a0d0e21e 7332 case 'b':
830247a4
IZ
7333 RExC_seen_zerolen++;
7334 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7335 if (LOC) {
7336 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7337 } else {
7338 ret = reg_node(pRExC_state, (U8)(BOUND));
7339 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7340 }
a0d0e21e 7341 *flagp |= SIMPLE;
ee9b8eae 7342 goto finish_meta_pat;
a0d0e21e 7343 case 'B':
830247a4
IZ
7344 RExC_seen_zerolen++;
7345 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7346 if (LOC) {
7347 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7348 } else {
7349 ret = reg_node(pRExC_state, (U8)(NBOUND));
7350 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7351 }
a0d0e21e 7352 *flagp |= SIMPLE;
ee9b8eae 7353 goto finish_meta_pat;
a0d0e21e 7354 case 's':
a12cf05f
KW
7355 if (LOC) {
7356 ret = reg_node(pRExC_state, (U8)(SPACEL));
7357 } else {
7358 ret = reg_node(pRExC_state, (U8)(SPACE));
7359 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7360 }
a0d0e21e 7361 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7362 goto finish_meta_pat;
a0d0e21e 7363 case 'S':
a12cf05f
KW
7364 if (LOC) {
7365 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7366 } else {
7367 ret = reg_node(pRExC_state, (U8)(NSPACE));
7368 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7369 }
a0d0e21e 7370 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7371 goto finish_meta_pat;
a0d0e21e 7372 case 'd':
ffc61ed2 7373 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7374 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7375 goto finish_meta_pat;
a0d0e21e 7376 case 'D':
ffc61ed2 7377 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7378 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7379 goto finish_meta_pat;
e1d1eefb
YO
7380 case 'R':
7381 ret = reg_node(pRExC_state, LNBREAK);
7382 *flagp |= HASWIDTH|SIMPLE;
7383 goto finish_meta_pat;
7384 case 'h':
7385 ret = reg_node(pRExC_state, HORIZWS);
7386 *flagp |= HASWIDTH|SIMPLE;
7387 goto finish_meta_pat;
7388 case 'H':
7389 ret = reg_node(pRExC_state, NHORIZWS);
7390 *flagp |= HASWIDTH|SIMPLE;
7391 goto finish_meta_pat;
ee9b8eae 7392 case 'v':
e1d1eefb
YO
7393 ret = reg_node(pRExC_state, VERTWS);
7394 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7395 goto finish_meta_pat;
7396 case 'V':
e1d1eefb
YO
7397 ret = reg_node(pRExC_state, NVERTWS);
7398 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7399 finish_meta_pat:
830247a4 7400 nextchar(pRExC_state);
fac92740 7401 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7402 break;
a14b48bc
LW
7403 case 'p':
7404 case 'P':
3568d838 7405 {
3dab1dad 7406 char* const oldregxend = RExC_end;
d008bc60 7407#ifdef DEBUGGING
ccb2c380 7408 char* parse_start = RExC_parse - 2;
d008bc60 7409#endif
a14b48bc 7410
830247a4 7411 if (RExC_parse[1] == '{') {
3568d838 7412 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7413 RExC_end = strchr(RExC_parse, '}');
7414 if (!RExC_end) {
3dab1dad 7415 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7416 RExC_parse += 2;
7417 RExC_end = oldregxend;
0da60cf5 7418 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7419 }
830247a4 7420 RExC_end++;
a14b48bc 7421 }
af6f566e 7422 else {
830247a4 7423 RExC_end = RExC_parse + 2;
af6f566e
HS
7424 if (RExC_end > oldregxend)
7425 RExC_end = oldregxend;
7426 }
830247a4 7427 RExC_parse--;
a14b48bc 7428
3dab1dad 7429 ret = regclass(pRExC_state,depth+1);
a14b48bc 7430
830247a4
IZ
7431 RExC_end = oldregxend;
7432 RExC_parse--;
ccb2c380
MP
7433
7434 Set_Node_Offset(ret, parse_start + 2);
7435 Set_Node_Cur_Length(ret);
830247a4 7436 nextchar(pRExC_state);
a14b48bc
LW
7437 *flagp |= HASWIDTH|SIMPLE;
7438 }
7439 break;
fc8cd66c 7440 case 'N':
afefe6bf 7441 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7442 multicharacter. join_exact() will join them up later on.
7443 Also this makes sure that things like /\N{BLAH}+/ and
7444 \N{BLAH} being multi char Just Happen. dmq*/
7445 ++RExC_parse;
afefe6bf 7446 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7447 break;
0a4db386 7448 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7449 parse_named_seq:
81714fb9
YO
7450 {
7451 char ch= RExC_parse[1];
1f1031fe
YO
7452 if (ch != '<' && ch != '\'' && ch != '{') {
7453 RExC_parse++;
7454 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7455 } else {
1f1031fe
YO
7456 /* this pretty much dupes the code for (?P=...) in reg(), if
7457 you change this make sure you change that */
81714fb9 7458 char* name_start = (RExC_parse += 2);
2eccd3b2 7459 U32 num = 0;
0a4db386
YO
7460 SV *sv_dat = reg_scan_name(pRExC_state,
7461 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7462 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7463 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7464 vFAIL2("Sequence %.3s... not terminated",parse_start);
7465
7466 if (!SIZE_ONLY) {
7467 num = add_data( pRExC_state, 1, "S" );
7468 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7469 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7470 }
7471
81714fb9
YO
7472 RExC_sawback = 1;
7473 ret = reganode(pRExC_state,
7474 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7475 num);
7476 *flagp |= HASWIDTH;
1f1031fe 7477
81714fb9
YO
7478 /* override incorrect value set in reganode MJD */
7479 Set_Node_Offset(ret, parse_start+1);
7480 Set_Node_Cur_Length(ret); /* MJD */
7481 nextchar(pRExC_state);
1f1031fe 7482
81714fb9
YO
7483 }
7484 break;
1f1031fe 7485 }
2bf803e2 7486 case 'g':
a0d0e21e
LW
7487 case '1': case '2': case '3': case '4':
7488 case '5': case '6': case '7': case '8': case '9':
7489 {
c74340f9 7490 I32 num;
2bf803e2
YO
7491 bool isg = *RExC_parse == 'g';
7492 bool isrel = 0;
7493 bool hasbrace = 0;
7494 if (isg) {
c74340f9 7495 RExC_parse++;
2bf803e2
YO
7496 if (*RExC_parse == '{') {
7497 RExC_parse++;
7498 hasbrace = 1;
7499 }
7500 if (*RExC_parse == '-') {
7501 RExC_parse++;
7502 isrel = 1;
7503 }
1f1031fe
YO
7504 if (hasbrace && !isDIGIT(*RExC_parse)) {
7505 if (isrel) RExC_parse--;
7506 RExC_parse -= 2;
7507 goto parse_named_seq;
7508 } }
c74340f9 7509 num = atoi(RExC_parse);
b72d83b2
RGS
7510 if (isg && num == 0)
7511 vFAIL("Reference to invalid group 0");
c74340f9 7512 if (isrel) {
5624f11d 7513 num = RExC_npar - num;
c74340f9
YO
7514 if (num < 1)
7515 vFAIL("Reference to nonexistent or unclosed group");
7516 }
2bf803e2 7517 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7518 goto defchar;
7519 else {
3dab1dad 7520 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7521 while (isDIGIT(*RExC_parse))
7522 RExC_parse++;
1f1031fe
YO
7523 if (parse_start == RExC_parse - 1)
7524 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7525 if (hasbrace) {
7526 if (*RExC_parse != '}')
7527 vFAIL("Unterminated \\g{...} pattern");
7528 RExC_parse++;
7529 }
c74340f9
YO
7530 if (!SIZE_ONLY) {
7531 if (num > (I32)RExC_rx->nparens)
7532 vFAIL("Reference to nonexistent group");
c74340f9 7533 }
830247a4 7534 RExC_sawback = 1;
eb160463
GS
7535 ret = reganode(pRExC_state,
7536 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7537 num);
a0d0e21e 7538 *flagp |= HASWIDTH;
2af232bd 7539
fac92740 7540 /* override incorrect value set in reganode MJD */
2af232bd 7541 Set_Node_Offset(ret, parse_start+1);
fac92740 7542 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7543 RExC_parse--;
7544 nextchar(pRExC_state);
a0d0e21e
LW
7545 }
7546 }
7547 break;
7548 case '\0':
830247a4 7549 if (RExC_parse >= RExC_end)
b45f050a 7550 FAIL("Trailing \\");
a0d0e21e
LW
7551 /* FALL THROUGH */
7552 default:
a0288114 7553 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7554 back into the quick-grab loop below */
45948336 7555 parse_start--;
a0d0e21e
LW
7556 goto defchar;
7557 }
7558 break;
4633a7c4
LW
7559
7560 case '#':
bbe252da 7561 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7562 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7563 goto tryagain;
7564 }
7565 /* FALL THROUGH */
7566
f9a79580
RGS
7567 default:
7568 outer_default:{
ba210ebe 7569 register STRLEN len;
58ae7d3f 7570 register UV ender;
a0d0e21e 7571 register char *p;
3dab1dad 7572 char *s;
80aecb99 7573 STRLEN foldlen;
89ebb4a3 7574 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7575
7576 parse_start = RExC_parse - 1;
a0d0e21e 7577
830247a4 7578 RExC_parse++;
a0d0e21e
LW
7579
7580 defchar:
58ae7d3f 7581 ender = 0;
eb160463
GS
7582 ret = reg_node(pRExC_state,
7583 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7584 s = STRING(ret);
830247a4
IZ
7585 for (len = 0, p = RExC_parse - 1;
7586 len < 127 && p < RExC_end;
a0d0e21e
LW
7587 len++)
7588 {
3dab1dad 7589 char * const oldp = p;
5b5a24f7 7590
bbe252da 7591 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7592 p = regwhite( pRExC_state, p );
f9a79580
RGS
7593 switch ((U8)*p) {
7594 case 0xDF:
7595 case 0xC3:
7596 case 0xCE:
56d400ed 7597 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7598 goto normal_default;
a0d0e21e
LW
7599 case '^':
7600 case '$':
7601 case '.':
7602 case '[':
7603 case '(':
7604 case ')':
7605 case '|':
7606 goto loopdone;
7607 case '\\':
ee9b8eae
YO
7608 /* Literal Escapes Switch
7609
7610 This switch is meant to handle escape sequences that
7611 resolve to a literal character.
7612
7613 Every escape sequence that represents something
7614 else, like an assertion or a char class, is handled
7615 in the switch marked 'Special Escapes' above in this
7616 routine, but also has an entry here as anything that
7617 isn't explicitly mentioned here will be treated as
7618 an unescaped equivalent literal.
7619 */
7620
a0a388a1 7621 switch ((U8)*++p) {
ee9b8eae 7622 /* These are all the special escapes. */
a0a388a1
YO
7623 case 0xDF:
7624 case 0xC3:
7625 case 0xCE:
7626 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7627 goto normal_default;
ee9b8eae
YO
7628 case 'A': /* Start assertion */
7629 case 'b': case 'B': /* Word-boundary assertion*/
7630 case 'C': /* Single char !DANGEROUS! */
7631 case 'd': case 'D': /* digit class */
7632 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7633 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7634 case 'k': case 'K': /* named backref, keep marker */
7635 case 'N': /* named char sequence */
38a44b82 7636 case 'p': case 'P': /* Unicode property */
e1d1eefb 7637 case 'R': /* LNBREAK */
ee9b8eae 7638 case 's': case 'S': /* space class */
e1d1eefb 7639 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7640 case 'w': case 'W': /* word class */
7641 case 'X': /* eXtended Unicode "combining character sequence" */
7642 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7643 --p;
7644 goto loopdone;
ee9b8eae
YO
7645
7646 /* Anything after here is an escape that resolves to a
7647 literal. (Except digits, which may or may not)
7648 */
a0d0e21e
LW
7649 case 'n':
7650 ender = '\n';
7651 p++;
a687059c 7652 break;
a0d0e21e
LW
7653 case 'r':
7654 ender = '\r';
7655 p++;
a687059c 7656 break;
a0d0e21e
LW
7657 case 't':
7658 ender = '\t';
7659 p++;
a687059c 7660 break;
a0d0e21e
LW
7661 case 'f':
7662 ender = '\f';
7663 p++;
a687059c 7664 break;
a0d0e21e 7665 case 'e':
c7f1f016 7666 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7667 p++;
a687059c 7668 break;
a0d0e21e 7669 case 'a':
c7f1f016 7670 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7671 p++;
a687059c 7672 break;
f0a2b745
KW
7673 case 'o':
7674 {
7675 STRLEN brace_len = len;
00c0cb6d 7676 UV result;
454155d9
KW
7677 const char* error_msg;
7678
7679 bool valid = grok_bslash_o(p,
7680 &result,
7681 &brace_len,
7682 &error_msg,
7683 1);
7684 p += brace_len;
7685 if (! valid) {
7686 RExC_parse = p; /* going to die anyway; point
7687 to exact spot of failure */
f0a2b745
KW
7688 vFAIL(error_msg);
7689 }
00c0cb6d
DG
7690 else
7691 {
7692 ender = result;
7693 }
f0a2b745
KW
7694 if (PL_encoding && ender < 0x100) {
7695 goto recode_encoding;
7696 }
7697 if (ender > 0xff) {
62fed28b 7698 REQUIRE_UTF8;
f0a2b745
KW
7699 }
7700 break;
7701 }
a0d0e21e 7702 case 'x':
a0ed51b3 7703 if (*++p == '{') {
1df70142 7704 char* const e = strchr(p, '}');
b81d288d 7705
b45f050a 7706 if (!e) {
830247a4 7707 RExC_parse = p + 1;
b45f050a
JF
7708 vFAIL("Missing right brace on \\x{}");
7709 }
de5f0749 7710 else {
a4c04bdc
NC
7711 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7712 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7713 STRLEN numlen = e - p - 1;
53305cf1 7714 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7715 if (ender > 0xff)
62fed28b 7716 REQUIRE_UTF8;
a0ed51b3
LW
7717 p = e + 1;
7718 }
a0ed51b3
LW
7719 }
7720 else {
a4c04bdc 7721 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7722 STRLEN numlen = 2;
53305cf1 7723 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7724 p += numlen;
7725 }
9e08bc66
TS
7726 if (PL_encoding && ender < 0x100)
7727 goto recode_encoding;
a687059c 7728 break;
a0d0e21e
LW
7729 case 'c':
7730 p++;
f9d13529 7731 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7732 break;
a0d0e21e
LW
7733 case '0': case '1': case '2': case '3':case '4':
7734 case '5': case '6': case '7': case '8':case '9':
7735 if (*p == '0' ||
ca67da41 7736 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7737 {
7738 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7739 STRLEN numlen = 3;
53305cf1 7740 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7741 if (ender > 0xff) {
62fed28b 7742 REQUIRE_UTF8;
609122bd 7743 }
a0d0e21e
LW
7744 p += numlen;
7745 }
7746 else {
7747 --p;
7748 goto loopdone;
a687059c 7749 }
9e08bc66
TS
7750 if (PL_encoding && ender < 0x100)
7751 goto recode_encoding;
7752 break;
7753 recode_encoding:
7754 {
7755 SV* enc = PL_encoding;
7756 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7757 if (!enc && SIZE_ONLY)
7758 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7759 REQUIRE_UTF8;
9e08bc66 7760 }
a687059c 7761 break;
a0d0e21e 7762 case '\0':
830247a4 7763 if (p >= RExC_end)
b45f050a 7764 FAIL("Trailing \\");
a687059c 7765 /* FALL THROUGH */
a0d0e21e 7766 default:
668c081a
NC
7767 if (!SIZE_ONLY&& isALPHA(*p))
7768 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7769 goto normal_default;
a0d0e21e
LW
7770 }
7771 break;
a687059c 7772 default:
a0ed51b3 7773 normal_default:
fd400ab9 7774 if (UTF8_IS_START(*p) && UTF) {
1df70142 7775 STRLEN numlen;
5e12f4fb 7776 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7777 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7778 p += numlen;
7779 }
7780 else
7781 ender = *p++;
a0d0e21e 7782 break;
a687059c 7783 }
bcdf7404
YO
7784 if ( RExC_flags & RXf_PMf_EXTENDED)
7785 p = regwhite( pRExC_state, p );
60a8b682
JH
7786 if (UTF && FOLD) {
7787 /* Prime the casefolded buffer. */
ac7e0132 7788 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7789 }
bcdf7404 7790 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7791 if (len)
7792 p = oldp;
16ea2a2e 7793 else if (UTF) {
80aecb99 7794 if (FOLD) {
60a8b682 7795 /* Emit all the Unicode characters. */
1df70142 7796 STRLEN numlen;
80aecb99
JH
7797 for (foldbuf = tmpbuf;
7798 foldlen;
7799 foldlen -= numlen) {
7800 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7801 if (numlen > 0) {
71207a34 7802 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7803 s += unilen;
7804 len += unilen;
7805 /* In EBCDIC the numlen
7806 * and unilen can differ. */
9dc45d57 7807 foldbuf += numlen;
47654450
JH
7808 if (numlen >= foldlen)
7809 break;
9dc45d57
JH
7810 }
7811 else
7812 break; /* "Can't happen." */
80aecb99
JH
7813 }
7814 }
7815 else {
71207a34 7816 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7817 if (unilen > 0) {
0ebc6274
JH
7818 s += unilen;
7819 len += unilen;
9dc45d57 7820 }
80aecb99 7821 }
a0ed51b3 7822 }
a0d0e21e
LW
7823 else {
7824 len++;
eb160463 7825 REGC((char)ender, s++);
a0d0e21e
LW
7826 }
7827 break;
a687059c 7828 }
16ea2a2e 7829 if (UTF) {
80aecb99 7830 if (FOLD) {
60a8b682 7831 /* Emit all the Unicode characters. */
1df70142 7832 STRLEN numlen;
80aecb99
JH
7833 for (foldbuf = tmpbuf;
7834 foldlen;
7835 foldlen -= numlen) {
7836 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7837 if (numlen > 0) {
71207a34 7838 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7839 len += unilen;
7840 s += unilen;
7841 /* In EBCDIC the numlen
7842 * and unilen can differ. */
9dc45d57 7843 foldbuf += numlen;
47654450
JH
7844 if (numlen >= foldlen)
7845 break;
9dc45d57
JH
7846 }
7847 else
7848 break;
80aecb99
JH
7849 }
7850 }
7851 else {
71207a34 7852 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7853 if (unilen > 0) {
0ebc6274
JH
7854 s += unilen;
7855 len += unilen;
9dc45d57 7856 }
80aecb99
JH
7857 }
7858 len--;
a0ed51b3
LW
7859 }
7860 else
eb160463 7861 REGC((char)ender, s++);
a0d0e21e
LW
7862 }
7863 loopdone:
830247a4 7864 RExC_parse = p - 1;
fac92740 7865 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7866 nextchar(pRExC_state);
793db0cb
JH
7867 {
7868 /* len is STRLEN which is unsigned, need to copy to signed */
7869 IV iv = len;
7870 if (iv < 0)
7871 vFAIL("Internal disaster");
7872 }
a0d0e21e
LW
7873 if (len > 0)
7874 *flagp |= HASWIDTH;
090f7165 7875 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7876 *flagp |= SIMPLE;
3dab1dad 7877
cd439c50 7878 if (SIZE_ONLY)
830247a4 7879 RExC_size += STR_SZ(len);
3dab1dad
YO
7880 else {
7881 STR_LEN(ret) = len;
830247a4 7882 RExC_emit += STR_SZ(len);
07be1b83 7883 }
3dab1dad 7884 }
a0d0e21e
LW
7885 break;
7886 }
a687059c 7887
a0d0e21e 7888 return(ret);
a687059c
LW
7889}
7890
873ef191 7891STATIC char *
bcdf7404 7892S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7893{
bcdf7404 7894 const char *e = RExC_end;
7918f24d
NC
7895
7896 PERL_ARGS_ASSERT_REGWHITE;
7897
5b5a24f7
CS
7898 while (p < e) {
7899 if (isSPACE(*p))
7900 ++p;
7901 else if (*p == '#') {
bcdf7404 7902 bool ended = 0;
5b5a24f7 7903 do {
bcdf7404
YO
7904 if (*p++ == '\n') {
7905 ended = 1;
7906 break;
7907 }
7908 } while (p < e);
7909 if (!ended)
7910 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7911 }
7912 else
7913 break;
7914 }
7915 return p;
7916}
7917
b8c5462f
JH
7918/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7919 Character classes ([:foo:]) can also be negated ([:^foo:]).
7920 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7921 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7922 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7923
7924#define POSIXCC_DONE(c) ((c) == ':')
7925#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7926#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7927
b8c5462f 7928STATIC I32
830247a4 7929S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7930{
97aff369 7931 dVAR;
936ed897 7932 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7933
7918f24d
NC
7934 PERL_ARGS_ASSERT_REGPPOSIXCC;
7935
830247a4 7936 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7937 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7938 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7939 const char c = UCHARAT(RExC_parse);
097eb12c 7940 char* const s = RExC_parse++;
b81d288d 7941
9a86a77b 7942 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7943 RExC_parse++;
7944 if (RExC_parse == RExC_end)
620e46c5 7945 /* Grandfather lone [:, [=, [. */
830247a4 7946 RExC_parse = s;
620e46c5 7947 else {
3dab1dad 7948 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7949 assert(*t == c);
7950
9a86a77b 7951 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7952 const char *posixcc = s + 1;
830247a4 7953 RExC_parse++; /* skip over the ending ] */
3dab1dad 7954
b8c5462f 7955 if (*s == ':') {
1df70142
AL
7956 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7957 const I32 skip = t - posixcc;
80916619
NC
7958
7959 /* Initially switch on the length of the name. */
7960 switch (skip) {
7961 case 4:
3dab1dad
YO
7962 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7963 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7964 break;
80916619
NC
7965 case 5:
7966 /* Names all of length 5. */
7967 /* alnum alpha ascii blank cntrl digit graph lower
7968 print punct space upper */
7969 /* Offset 4 gives the best switch position. */
7970 switch (posixcc[4]) {
7971 case 'a':
3dab1dad
YO
7972 if (memEQ(posixcc, "alph", 4)) /* alpha */
7973 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7974 break;
7975 case 'e':
3dab1dad
YO
7976 if (memEQ(posixcc, "spac", 4)) /* space */
7977 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7978 break;
7979 case 'h':
3dab1dad
YO
7980 if (memEQ(posixcc, "grap", 4)) /* graph */
7981 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7982 break;
7983 case 'i':
3dab1dad
YO
7984 if (memEQ(posixcc, "asci", 4)) /* ascii */
7985 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7986 break;
7987 case 'k':
3dab1dad
YO
7988 if (memEQ(posixcc, "blan", 4)) /* blank */
7989 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7990 break;
7991 case 'l':
3dab1dad
YO
7992 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7993 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7994 break;
7995 case 'm':
3dab1dad
YO
7996 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7997 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7998 break;
7999 case 'r':
3dab1dad
YO
8000 if (memEQ(posixcc, "lowe", 4)) /* lower */
8001 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8002 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8003 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8004 break;
8005 case 't':
3dab1dad
YO
8006 if (memEQ(posixcc, "digi", 4)) /* digit */
8007 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8008 else if (memEQ(posixcc, "prin", 4)) /* print */
8009 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8010 else if (memEQ(posixcc, "punc", 4)) /* punct */
8011 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8012 break;
b8c5462f
JH
8013 }
8014 break;
80916619 8015 case 6:
3dab1dad
YO
8016 if (memEQ(posixcc, "xdigit", 6))
8017 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8018 break;
8019 }
80916619
NC
8020
8021 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8022 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8023 t - s - 1, s + 1);
80916619
NC
8024 assert (posixcc[skip] == ':');
8025 assert (posixcc[skip+1] == ']');
b45f050a 8026 } else if (!SIZE_ONLY) {
b8c5462f 8027 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8028
830247a4 8029 /* adjust RExC_parse so the warning shows after
b45f050a 8030 the class closes */
9a86a77b 8031 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8032 RExC_parse++;
b45f050a
JF
8033 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8034 }
b8c5462f
JH
8035 } else {
8036 /* Maternal grandfather:
8037 * "[:" ending in ":" but not in ":]" */
830247a4 8038 RExC_parse = s;
767d463e 8039 }
620e46c5
JH
8040 }
8041 }
8042
b8c5462f
JH
8043 return namedclass;
8044}
8045
8046STATIC void
830247a4 8047S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8048{
97aff369 8049 dVAR;
7918f24d
NC
8050
8051 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8052
3dab1dad 8053 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8054 const char *s = RExC_parse;
8055 const char c = *s++;
b8c5462f 8056
3dab1dad 8057 while (isALNUM(*s))
b8c5462f
JH
8058 s++;
8059 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8060 ckWARN3reg(s+2,
8061 "POSIX syntax [%c %c] belongs inside character classes",
8062 c, c);
b45f050a
JF
8063
8064 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8065 if (POSIXCC_NOTYET(c)) {
830247a4 8066 /* adjust RExC_parse so the error shows after
b45f050a 8067 the class closes */
9a86a77b 8068 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8069 NOOP;
b45f050a
JF
8070 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8071 }
b8c5462f
JH
8072 }
8073 }
620e46c5
JH
8074}
8075
0399b215 8076/* No locale test */
e1d1eefb
YO
8077#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8078ANYOF_##NAME: \
8079 for (value = 0; value < 256; value++) \
8080 if (TEST) \
8081 ANYOF_BITMAP_SET(ret, value); \
8082 yesno = '+'; \
8083 what = WORD; \
8084 break; \
8085case ANYOF_N##NAME: \
8086 for (value = 0; value < 256; value++) \
8087 if (!TEST) \
8088 ANYOF_BITMAP_SET(ret, value); \
8089 yesno = '!'; \
8090 what = WORD; \
8091 break
89836f1f 8092
a12cf05f
KW
8093/* Like the above, but there are differences if we are in uni-8-bit or not, so
8094 * there are two tests passed in, to use depending on that. There aren't any
8095 * cases where the label is different from the name, so no need for that
8096 * parameter */
0399b215 8097#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
a12cf05f
KW
8098ANYOF_##NAME: \
8099 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8100 else if (UNI_SEMANTICS) { \
8101 for (value = 0; value < 256; value++) { \
8102 if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
8103 } \
8104 } \
8105 else { \
8106 for (value = 0; value < 256; value++) { \
8107 if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
8108 } \
8109 } \
8110 yesno = '+'; \
8111 what = WORD; \
8112 break; \
8113case ANYOF_N##NAME: \
8114 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8115 else if (UNI_SEMANTICS) { \
8116 for (value = 0; value < 256; value++) { \
8117 if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8118 } \
8119 } \
8120 else { \
8121 for (value = 0; value < 256; value++) { \
8122 if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8123 } \
8124 } \
8125 yesno = '!'; \
8126 what = WORD; \
8127 break
8128
da7fcca4
YO
8129/*
8130 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8131 so that it is possible to override the option here without having to
8132 rebuild the entire core. as we are required to do if we change regcomp.h
8133 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8134*/
8135#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8136#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8137#endif
8138
8139#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8140#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8141#else
8142#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8143#endif
8144
7f6f358c
YO
8145/*
8146 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8147 matches the pattern or if the pattern matches a single char only and
8148 that char is < 256 and we are case insensitive then we produce an
8149 EXACT node instead.
7f6f358c 8150*/
89836f1f 8151
76e3520e 8152STATIC regnode *
3dab1dad 8153S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8154{
97aff369 8155 dVAR;
9a86a77b 8156 register UV nextvalue;
3568d838 8157 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8158 register IV range = 0;
e1d1eefb 8159 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8160 register regnode *ret;
ba210ebe 8161 STRLEN numlen;
ffc61ed2 8162 IV namedclass;
cbbf8932 8163 char *rangebegin = NULL;
936ed897 8164 bool need_class = 0;
c445ea15 8165 SV *listsv = NULL;
ffc61ed2 8166 UV n;
9e55ce06 8167 bool optimize_invert = TRUE;
cbbf8932 8168 AV* unicode_alternate = NULL;
1b2d223b
JH
8169#ifdef EBCDIC
8170 UV literal_endpoint = 0;
8171#endif
2c63ecad 8172 UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
ffc61ed2 8173
3dab1dad 8174 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8175 case we need to change the emitted regop to an EXACT. */
07be1b83 8176 const char * orig_parse = RExC_parse;
72f13be8 8177 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8178
8179 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8180#ifndef DEBUGGING
8181 PERL_UNUSED_ARG(depth);
8182#endif
72f13be8 8183
3dab1dad 8184 DEBUG_PARSE("clas");
7f6f358c
YO
8185
8186 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8187 ret = reganode(pRExC_state, ANYOF, 0);
8188
8189 if (!SIZE_ONLY)
8190 ANYOF_FLAGS(ret) = 0;
8191
9a86a77b 8192 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8193 RExC_naughty++;
8194 RExC_parse++;
8195 if (!SIZE_ONLY)
8196 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8197 }
a0d0e21e 8198
73060fc4 8199 if (SIZE_ONLY) {
830247a4 8200 RExC_size += ANYOF_SKIP;
73060fc4
JH
8201 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8202 }
936ed897 8203 else {
830247a4 8204 RExC_emit += ANYOF_SKIP;
936ed897
IZ
8205 if (FOLD)
8206 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8207 if (LOC)
8208 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 8209 ANYOF_BITMAP_ZERO(ret);
396482e1 8210 listsv = newSVpvs("# comment\n");
a0d0e21e 8211 }
b8c5462f 8212
9a86a77b
JH
8213 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8214
b938889d 8215 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8216 checkposixcc(pRExC_state);
b8c5462f 8217
f064b6ad
HS
8218 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8219 if (UCHARAT(RExC_parse) == ']')
8220 goto charclassloop;
ffc61ed2 8221
fc8cd66c 8222parseit:
9a86a77b 8223 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8224
8225 charclassloop:
8226
8227 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8228
73b437c8 8229 if (!range)
830247a4 8230 rangebegin = RExC_parse;
ffc61ed2 8231 if (UTF) {
5e12f4fb 8232 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8233 RExC_end - RExC_parse,
9f7f3913 8234 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8235 RExC_parse += numlen;
8236 }
8237 else
8238 value = UCHARAT(RExC_parse++);
7f6f358c 8239
9a86a77b
JH
8240 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8241 if (value == '[' && POSIXCC(nextvalue))
830247a4 8242 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8243 else if (value == '\\') {
ffc61ed2 8244 if (UTF) {
5e12f4fb 8245 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8246 RExC_end - RExC_parse,
9f7f3913 8247 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8248 RExC_parse += numlen;
8249 }
8250 else
8251 value = UCHARAT(RExC_parse++);
470c3474 8252 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8253 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8254 * be a problem later if we want switch on Unicode.
8255 * A similar issue a little bit later when switching on
8256 * namedclass. --jhi */
ffc61ed2 8257 switch ((I32)value) {
b8c5462f
JH
8258 case 'w': namedclass = ANYOF_ALNUM; break;
8259 case 'W': namedclass = ANYOF_NALNUM; break;
8260 case 's': namedclass = ANYOF_SPACE; break;
8261 case 'S': namedclass = ANYOF_NSPACE; break;
8262 case 'd': namedclass = ANYOF_DIGIT; break;
8263 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8264 case 'v': namedclass = ANYOF_VERTWS; break;
8265 case 'V': namedclass = ANYOF_NVERTWS; break;
8266 case 'h': namedclass = ANYOF_HORIZWS; break;
8267 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8268 case 'N': /* Handle \N{NAME} in class */
8269 {
8270 /* We only pay attention to the first char of
8271 multichar strings being returned. I kinda wonder
8272 if this makes sense as it does change the behaviour
8273 from earlier versions, OTOH that behaviour was broken
8274 as well. */
8275 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8276 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8277 goto parseit;
8278 }
8279 value= v;
8280 }
8281 break;
ffc61ed2
JH
8282 case 'p':
8283 case 'P':
3dab1dad
YO
8284 {
8285 char *e;
af6f566e 8286 if (RExC_parse >= RExC_end)
2a4859cd 8287 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8288 if (*RExC_parse == '{') {
1df70142 8289 const U8 c = (U8)value;
ffc61ed2
JH
8290 e = strchr(RExC_parse++, '}');
8291 if (!e)
0da60cf5 8292 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8293 while (isSPACE(UCHARAT(RExC_parse)))
8294 RExC_parse++;
8295 if (e == RExC_parse)
0da60cf5 8296 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8297 n = e - RExC_parse;
ab13f0c7
JH
8298 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8299 n--;
ffc61ed2
JH
8300 }
8301 else {
8302 e = RExC_parse;
8303 n = 1;
8304 }
8305 if (!SIZE_ONLY) {
ab13f0c7
JH
8306 if (UCHARAT(RExC_parse) == '^') {
8307 RExC_parse++;
8308 n--;
8309 value = value == 'p' ? 'P' : 'p'; /* toggle */
8310 while (isSPACE(UCHARAT(RExC_parse))) {
8311 RExC_parse++;
8312 n--;
8313 }
8314 }
097eb12c
AL
8315 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8316 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8317 }
8318 RExC_parse = e + 1;
8319 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8320 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8321 }
f81125e2 8322 break;
b8c5462f
JH
8323 case 'n': value = '\n'; break;
8324 case 'r': value = '\r'; break;
8325 case 't': value = '\t'; break;
8326 case 'f': value = '\f'; break;
8327 case 'b': value = '\b'; break;
c7f1f016
NIS
8328 case 'e': value = ASCII_TO_NATIVE('\033');break;
8329 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8330 case 'o':
8331 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8332 {
8333 const char* error_msg;
8334 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8335 &value,
8336 &numlen,
454155d9
KW
8337 &error_msg,
8338 SIZE_ONLY);
8339 RExC_parse += numlen;
8340 if (! valid) {
8341 vFAIL(error_msg);
8342 }
f0a2b745 8343 }
f0a2b745
KW
8344 if (PL_encoding && value < 0x100) {
8345 goto recode_encoding;
8346 }
8347 break;
b8c5462f 8348 case 'x':
ffc61ed2 8349 if (*RExC_parse == '{') {
a4c04bdc
NC
8350 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8351 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8352 char * const e = strchr(RExC_parse++, '}');
b81d288d 8353 if (!e)
ffc61ed2 8354 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8355
8356 numlen = e - RExC_parse;
8357 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8358 RExC_parse = e + 1;
8359 }
8360 else {
a4c04bdc 8361 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8362 numlen = 2;
8363 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8364 RExC_parse += numlen;
8365 }
9e08bc66
TS
8366 if (PL_encoding && value < 0x100)
8367 goto recode_encoding;
b8c5462f
JH
8368 break;
8369 case 'c':
f9d13529 8370 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8371 break;
8372 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8373 case '5': case '6': case '7':
9e08bc66 8374 {
c99e91e9
KW
8375 /* Take 1-3 octal digits */
8376 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8377 numlen = 3;
8378 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8379 RExC_parse += numlen;
8380 if (PL_encoding && value < 0x100)
8381 goto recode_encoding;
8382 break;
8383 }
8384 recode_encoding:
8385 {
8386 SV* enc = PL_encoding;
8387 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8388 if (!enc && SIZE_ONLY)
8389 ckWARNreg(RExC_parse,
8390 "Invalid escape in the specified encoding");
9e08bc66
TS
8391 break;
8392 }
1028017a 8393 default:
c99e91e9
KW
8394 /* Allow \_ to not give an error */
8395 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8396 ckWARN2reg(RExC_parse,
8397 "Unrecognized escape \\%c in character class passed through",
8398 (int)value);
c99e91e9 8399 }
1028017a 8400 break;
b8c5462f 8401 }
ffc61ed2 8402 } /* end of \blah */
1b2d223b
JH
8403#ifdef EBCDIC
8404 else
8405 literal_endpoint++;
8406#endif
ffc61ed2
JH
8407
8408 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8409
2c63ecad
KW
8410 /* What matches in a locale is not known until runtime, so need to
8411 * (one time per class) allocate extra space to pass to regexec.
8412 * The space will contain a bit for each named class that is to be
8413 * matched against. This isn't needed for \p{} and pseudo-classes,
8414 * as they are not affected by locale, and hence are dealt with
8415 * separately */
8416 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8417 need_class = 1;
8418 if (SIZE_ONLY) {
8419 RExC_size += ANYOF_CLASS_ADD_SKIP;
8420 }
8421 else {
8422 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8423 ANYOF_CLASS_ZERO(ret);
8424 }
8425 ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
8426 }
ffc61ed2
JH
8427
8428 /* a bad range like a-\d, a-[:digit:] ? */
8429 if (range) {
73b437c8 8430 if (!SIZE_ONLY) {
668c081a
NC
8431 const int w =
8432 RExC_parse >= rangebegin ?
8433 RExC_parse - rangebegin : 0;
8434 ckWARN4reg(RExC_parse,
b45f050a 8435 "False [] range \"%*.*s\"",
097eb12c 8436 w, w, rangebegin);
668c081a 8437
3568d838
JH
8438 if (prevvalue < 256) {
8439 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8440 ANYOF_BITMAP_SET(ret, '-');
8441 }
8442 else {
8443 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8444 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8445 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8446 }
b8c5462f 8447 }
ffc61ed2
JH
8448
8449 range = 0; /* this was not a true range */
73b437c8 8450 }
ffc61ed2 8451
89836f1f
YO
8452
8453
73b437c8 8454 if (!SIZE_ONLY) {
c49a72a9
NC
8455 const char *what = NULL;
8456 char yesno = 0;
8457
3568d838
JH
8458 if (namedclass > OOB_NAMEDCLASS)
8459 optimize_invert = FALSE;
e2962f66
JH
8460 /* Possible truncation here but in some 64-bit environments
8461 * the compiler gets heartburn about switch on 64-bit values.
8462 * A similar issue a little earlier when switching on value.
98f323fa 8463 * --jhi */
e2962f66 8464 switch ((I32)namedclass) {
da7fcca4 8465
0399b215
KW
8466 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8467 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8468 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8469 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8470 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8471 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8472 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8473 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8474 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8475 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8476#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8477 /* \s, \w match all unicode if utf8. */
0399b215
KW
8478 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8479 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8480#else
a12cf05f 8481 /* \s, \w match ascii and locale only */
0399b215
KW
8482 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8483 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8484#endif
0399b215 8485 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8486 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8487 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8488 case ANYOF_ASCII:
8489 if (LOC)
936ed897 8490 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8491 else {
c7f1f016 8492#ifndef EBCDIC
1ba5c669
JH
8493 for (value = 0; value < 128; value++)
8494 ANYOF_BITMAP_SET(ret, value);
8495#else /* EBCDIC */
ffbc6a93 8496 for (value = 0; value < 256; value++) {
3a3c4447
JH
8497 if (isASCII(value))
8498 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8499 }
1ba5c669 8500#endif /* EBCDIC */
73b437c8 8501 }
c49a72a9
NC
8502 yesno = '+';
8503 what = "ASCII";
73b437c8
JH
8504 break;
8505 case ANYOF_NASCII:
8506 if (LOC)
936ed897 8507 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8508 else {
c7f1f016 8509#ifndef EBCDIC
1ba5c669
JH
8510 for (value = 128; value < 256; value++)
8511 ANYOF_BITMAP_SET(ret, value);
8512#else /* EBCDIC */
ffbc6a93 8513 for (value = 0; value < 256; value++) {
3a3c4447
JH
8514 if (!isASCII(value))
8515 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8516 }
1ba5c669 8517#endif /* EBCDIC */
73b437c8 8518 }
c49a72a9
NC
8519 yesno = '!';
8520 what = "ASCII";
89836f1f 8521 break;
ffc61ed2
JH
8522 case ANYOF_DIGIT:
8523 if (LOC)
8524 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8525 else {
8526 /* consecutive digits assumed */
8527 for (value = '0'; value <= '9'; value++)
8528 ANYOF_BITMAP_SET(ret, value);
8529 }
c49a72a9 8530 yesno = '+';
da7fcca4 8531 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8532 break;
8533 case ANYOF_NDIGIT:
8534 if (LOC)
8535 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8536 else {
8537 /* consecutive digits assumed */
8538 for (value = 0; value < '0'; value++)
8539 ANYOF_BITMAP_SET(ret, value);
8540 for (value = '9' + 1; value < 256; value++)
8541 ANYOF_BITMAP_SET(ret, value);
8542 }
c49a72a9 8543 yesno = '!';
da7fcca4 8544 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8545 break;
f81125e2
JP
8546 case ANYOF_MAX:
8547 /* this is to handle \p and \P */
8548 break;
73b437c8 8549 default:
b45f050a 8550 vFAIL("Invalid [::] class");
73b437c8 8551 break;
b8c5462f 8552 }
c49a72a9
NC
8553 if (what) {
8554 /* Strings such as "+utf8::isWord\n" */
8555 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8556 }
2c63ecad 8557 stored+=2; /* can't optimize this class */
73b437c8 8558 continue;
a0d0e21e 8559 }
ffc61ed2
JH
8560 } /* end of namedclass \blah */
8561
a0d0e21e 8562 if (range) {
eb160463 8563 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8564 const int w = RExC_parse - rangebegin;
8565 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8566 range = 0; /* not a valid range */
73b437c8 8567 }
a0d0e21e
LW
8568 }
8569 else {
3568d838 8570 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8571 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8572 RExC_parse[1] != ']') {
8573 RExC_parse++;
ffc61ed2
JH
8574
8575 /* a bad range like \w-, [:word:]- ? */
8576 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8577 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8578 const int w =
afd78fd5
JH
8579 RExC_parse >= rangebegin ?
8580 RExC_parse - rangebegin : 0;
830247a4 8581 vWARN4(RExC_parse,
b45f050a 8582 "False [] range \"%*.*s\"",
097eb12c 8583 w, w, rangebegin);
afd78fd5 8584 }
73b437c8 8585 if (!SIZE_ONLY)
936ed897 8586 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8587 } else
ffc61ed2
JH
8588 range = 1; /* yeah, it's a range! */
8589 continue; /* but do it the next time */
a0d0e21e 8590 }
a687059c 8591 }
ffc61ed2 8592
93733859 8593 /* now is the next time */
07be1b83 8594 /*stored += (value - prevvalue + 1);*/
ae5c130c 8595 if (!SIZE_ONLY) {
3568d838 8596 if (prevvalue < 256) {
1df70142 8597 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8598 IV i;
3568d838 8599#ifdef EBCDIC
1b2d223b
JH
8600 /* In EBCDIC [\x89-\x91] should include
8601 * the \x8e but [i-j] should not. */
8602 if (literal_endpoint == 2 &&
8603 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8604 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8605 {
3568d838
JH
8606 if (isLOWER(prevvalue)) {
8607 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8608 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8609 stored++;
ffc61ed2 8610 ANYOF_BITMAP_SET(ret, i);
2670d666 8611 }
ffc61ed2 8612 } else {
3568d838 8613 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8614 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8615 stored++;
ffc61ed2 8616 ANYOF_BITMAP_SET(ret, i);
2670d666 8617 }
ffc61ed2 8618 }
8ada0baa 8619 }
ffc61ed2 8620 else
8ada0baa 8621#endif
07be1b83
YO
8622 for (i = prevvalue; i <= ceilvalue; i++) {
8623 if (!ANYOF_BITMAP_TEST(ret,i)) {
8624 stored++;
8625 ANYOF_BITMAP_SET(ret, i);
8626 }
8627 }
3568d838 8628 }
a5961de5 8629 if (value > 255 || UTF) {
1df70142
AL
8630 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8631 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8632 stored+=2; /* can't optimize this class */
ffc61ed2 8633 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8634 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8635 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8636 prevnatvalue, natvalue);
8637 }
8638 else if (prevnatvalue == natvalue) {
8639 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8640 if (FOLD) {
89ebb4a3 8641 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8642 STRLEN foldlen;
1df70142 8643 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8644
e294cc5d
JH
8645#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8646 if (RExC_precomp[0] == ':' &&
8647 RExC_precomp[1] == '[' &&
8648 (f == 0xDF || f == 0x92)) {
8649 f = NATIVE_TO_UNI(f);
8650 }
8651#endif
c840d2a2
JH
8652 /* If folding and foldable and a single
8653 * character, insert also the folded version
8654 * to the charclass. */
9e55ce06 8655 if (f != value) {
e294cc5d
JH
8656#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8657 if ((RExC_precomp[0] == ':' &&
8658 RExC_precomp[1] == '[' &&
8659 (f == 0xA2 &&
8660 (value == 0xFB05 || value == 0xFB06))) ?
8661 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8662 foldlen == (STRLEN)UNISKIP(f) )
8663#else
eb160463 8664 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8665#endif
9e55ce06
JH
8666 Perl_sv_catpvf(aTHX_ listsv,
8667 "%04"UVxf"\n", f);
8668 else {
8669 /* Any multicharacter foldings
8670 * require the following transform:
8671 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8672 * where E folds into "pq" and F folds
8673 * into "rst", all other characters
8674 * fold to single characters. We save
8675 * away these multicharacter foldings,
8676 * to be later saved as part of the
8677 * additional "s" data. */
8678 SV *sv;
8679
8680 if (!unicode_alternate)
8681 unicode_alternate = newAV();
740cce10
NC
8682 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8683 TRUE);
9e55ce06
JH
8684 av_push(unicode_alternate, sv);
8685 }
8686 }
254ba52a 8687
60a8b682
JH
8688 /* If folding and the value is one of the Greek
8689 * sigmas insert a few more sigmas to make the
8690 * folding rules of the sigmas to work right.
8691 * Note that not all the possible combinations
8692 * are handled here: some of them are handled
9e55ce06
JH
8693 * by the standard folding rules, and some of
8694 * them (literal or EXACTF cases) are handled
8695 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8696 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8697 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8698 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8699 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8700 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8701 }
8702 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8703 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8704 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8705 }
8706 }
ffc61ed2 8707 }
1b2d223b
JH
8708#ifdef EBCDIC
8709 literal_endpoint = 0;
8710#endif
8ada0baa 8711 }
ffc61ed2
JH
8712
8713 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8714 }
ffc61ed2 8715
ffc61ed2 8716
7f6f358c
YO
8717
8718 if (SIZE_ONLY)
8719 return ret;
8720 /****** !SIZE_ONLY AFTER HERE *********/
8721
3b57cd43 8722 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8723 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8724 ) {
8725 /* optimize single char class to an EXACT node
8726 but *only* when its not a UTF/high char */
07be1b83
YO
8727 const char * cur_parse= RExC_parse;
8728 RExC_emit = (regnode *)orig_emit;
8729 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8730 ret = reg_node(pRExC_state,
8731 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8732 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8733 *STRING(ret)= (char)value;
8734 STR_LEN(ret)= 1;
8735 RExC_emit += STR_SZ(1);
ef8d46e8 8736 SvREFCNT_dec(listsv);
7f6f358c
YO
8737 return ret;
8738 }
ae5c130c 8739 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8740 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8741 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8742 ) {
a0ed51b3 8743 for (value = 0; value < 256; ++value) {
936ed897 8744 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8745 UV fold = PL_fold[value];
ffc61ed2
JH
8746
8747 if (fold != value)
8748 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8749 }
8750 }
936ed897 8751 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8752 }
ffc61ed2 8753
ae5c130c 8754 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8755 if (optimize_invert &&
ffc61ed2
JH
8756 /* If the only flag is inversion. */
8757 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8758 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8759 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8760 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8761 }
7f6f358c 8762 {
097eb12c 8763 AV * const av = newAV();
ffc61ed2 8764 SV *rv;
9e55ce06 8765 /* The 0th element stores the character class description
6a0407ee 8766 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8767 * to initialize the appropriate swash (which gets stored in
8768 * the 1st element), and also useful for dumping the regnode.
8769 * The 2nd element stores the multicharacter foldings,
6a0407ee 8770 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8771 av_store(av, 0, listsv);
8772 av_store(av, 1, NULL);
ad64d0ec
NC
8773 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8774 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8775 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8776 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8777 ARG_SET(ret, n);
a0ed51b3 8778 }
a0ed51b3
LW
8779 return ret;
8780}
89836f1f
YO
8781#undef _C_C_T_
8782
a0ed51b3 8783
bcdf7404
YO
8784/* reg_skipcomment()
8785
8786 Absorbs an /x style # comments from the input stream.
8787 Returns true if there is more text remaining in the stream.
8788 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8789 terminates the pattern without including a newline.
8790
8791 Note its the callers responsibility to ensure that we are
8792 actually in /x mode
8793
8794*/
8795
8796STATIC bool
8797S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8798{
8799 bool ended = 0;
7918f24d
NC
8800
8801 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8802
bcdf7404
YO
8803 while (RExC_parse < RExC_end)
8804 if (*RExC_parse++ == '\n') {
8805 ended = 1;
8806 break;
8807 }
8808 if (!ended) {
8809 /* we ran off the end of the pattern without ending
8810 the comment, so we have to add an \n when wrapping */
8811 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8812 return 0;
8813 } else
8814 return 1;
8815}
8816
8817/* nextchar()
8818
3b753521 8819 Advances the parse position, and optionally absorbs
bcdf7404
YO
8820 "whitespace" from the inputstream.
8821
8822 Without /x "whitespace" means (?#...) style comments only,
8823 with /x this means (?#...) and # comments and whitespace proper.
8824
8825 Returns the RExC_parse point from BEFORE the scan occurs.
8826
8827 This is the /x friendly way of saying RExC_parse++.
8828*/
8829
76e3520e 8830STATIC char*
830247a4 8831S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8832{
097eb12c 8833 char* const retval = RExC_parse++;
a0d0e21e 8834
7918f24d
NC
8835 PERL_ARGS_ASSERT_NEXTCHAR;
8836
4633a7c4 8837 for (;;) {
830247a4
IZ
8838 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8839 RExC_parse[2] == '#') {
e994fd66
AE
8840 while (*RExC_parse != ')') {
8841 if (RExC_parse == RExC_end)
8842 FAIL("Sequence (?#... not terminated");
830247a4 8843 RExC_parse++;
e994fd66 8844 }
830247a4 8845 RExC_parse++;
4633a7c4
LW
8846 continue;
8847 }
bbe252da 8848 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8849 if (isSPACE(*RExC_parse)) {
8850 RExC_parse++;
748a9306
LW
8851 continue;
8852 }
830247a4 8853 else if (*RExC_parse == '#') {
bcdf7404
YO
8854 if ( reg_skipcomment( pRExC_state ) )
8855 continue;
748a9306 8856 }
748a9306 8857 }
4633a7c4 8858 return retval;
a0d0e21e 8859 }
a687059c
LW
8860}
8861
8862/*
c277df42 8863- reg_node - emit a node
a0d0e21e 8864*/
76e3520e 8865STATIC regnode * /* Location. */
830247a4 8866S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8867{
97aff369 8868 dVAR;
c277df42 8869 register regnode *ptr;
504618e9 8870 regnode * const ret = RExC_emit;
07be1b83 8871 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8872
7918f24d
NC
8873 PERL_ARGS_ASSERT_REG_NODE;
8874
c277df42 8875 if (SIZE_ONLY) {
830247a4
IZ
8876 SIZE_ALIGN(RExC_size);
8877 RExC_size += 1;
a0d0e21e
LW
8878 return(ret);
8879 }
3b57cd43
YO
8880 if (RExC_emit >= RExC_emit_bound)
8881 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8882
c277df42 8883 NODE_ALIGN_FILL(ret);
a0d0e21e 8884 ptr = ret;
c277df42 8885 FILL_ADVANCE_NODE(ptr, op);
7122b237 8886#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8887 if (RExC_offsets) { /* MJD */
07be1b83 8888 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8889 "reg_node", __LINE__,
13d6edb4 8890 PL_reg_name[op],
07be1b83
YO
8891 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8892 ? "Overwriting end of array!\n" : "OK",
8893 (UV)(RExC_emit - RExC_emit_start),
8894 (UV)(RExC_parse - RExC_start),
8895 (UV)RExC_offsets[0]));
ccb2c380 8896 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8897 }
7122b237 8898#endif
830247a4 8899 RExC_emit = ptr;
a0d0e21e 8900 return(ret);
a687059c
LW
8901}
8902
8903/*
a0d0e21e
LW
8904- reganode - emit a node with an argument
8905*/
76e3520e 8906STATIC regnode * /* Location. */
830247a4 8907S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8908{
97aff369 8909 dVAR;
c277df42 8910 register regnode *ptr;
504618e9 8911 regnode * const ret = RExC_emit;
07be1b83 8912 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8913
7918f24d
NC
8914 PERL_ARGS_ASSERT_REGANODE;
8915
c277df42 8916 if (SIZE_ONLY) {
830247a4
IZ
8917 SIZE_ALIGN(RExC_size);
8918 RExC_size += 2;
6bda09f9
YO
8919 /*
8920 We can't do this:
8921
8922 assert(2==regarglen[op]+1);
8923
8924 Anything larger than this has to allocate the extra amount.
8925 If we changed this to be:
8926
8927 RExC_size += (1 + regarglen[op]);
8928
8929 then it wouldn't matter. Its not clear what side effect
8930 might come from that so its not done so far.
8931 -- dmq
8932 */
a0d0e21e
LW
8933 return(ret);
8934 }
3b57cd43
YO
8935 if (RExC_emit >= RExC_emit_bound)
8936 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8937
c277df42 8938 NODE_ALIGN_FILL(ret);
a0d0e21e 8939 ptr = ret;
c277df42 8940 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8941#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8942 if (RExC_offsets) { /* MJD */
07be1b83 8943 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8944 "reganode",
ccb2c380 8945 __LINE__,
13d6edb4 8946 PL_reg_name[op],
07be1b83 8947 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8948 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8949 (UV)(RExC_emit - RExC_emit_start),
8950 (UV)(RExC_parse - RExC_start),
8951 (UV)RExC_offsets[0]));
ccb2c380 8952 Set_Cur_Node_Offset;
fac92740 8953 }
7122b237 8954#endif
830247a4 8955 RExC_emit = ptr;
a0d0e21e 8956 return(ret);
fe14fcc3
LW
8957}
8958
8959/*
cd439c50 8960- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8961*/
71207a34
AL
8962STATIC STRLEN
8963S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8964{
97aff369 8965 dVAR;
7918f24d
NC
8966
8967 PERL_ARGS_ASSERT_REGUNI;
8968
71207a34 8969 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8970}
8971
8972/*
a0d0e21e
LW
8973- reginsert - insert an operator in front of already-emitted operand
8974*
8975* Means relocating the operand.
8976*/
76e3520e 8977STATIC void
6bda09f9 8978S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8979{
97aff369 8980 dVAR;
c277df42
IZ
8981 register regnode *src;
8982 register regnode *dst;
8983 register regnode *place;
504618e9 8984 const int offset = regarglen[(U8)op];
6bda09f9 8985 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8986 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8987
8988 PERL_ARGS_ASSERT_REGINSERT;
def51078 8989 PERL_UNUSED_ARG(depth);
22c35a8c 8990/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8991 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8992 if (SIZE_ONLY) {
6bda09f9 8993 RExC_size += size;
a0d0e21e
LW
8994 return;
8995 }
a687059c 8996
830247a4 8997 src = RExC_emit;
6bda09f9 8998 RExC_emit += size;
830247a4 8999 dst = RExC_emit;
40d049e4 9000 if (RExC_open_parens) {
6bda09f9 9001 int paren;
3b57cd43 9002 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9003 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9004 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9005 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9006 RExC_open_parens[paren] += size;
9007 } else {
3b57cd43 9008 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9009 }
9010 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9011 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9012 RExC_close_parens[paren] += size;
9013 } else {
3b57cd43 9014 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9015 }
9016 }
6bda09f9 9017 }
40d049e4 9018
fac92740 9019 while (src > opnd) {
c277df42 9020 StructCopy(--src, --dst, regnode);
7122b237 9021#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9022 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9023 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9024 "reg_insert",
ccb2c380 9025 __LINE__,
13d6edb4 9026 PL_reg_name[op],
07be1b83
YO
9027 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9028 ? "Overwriting end of array!\n" : "OK",
9029 (UV)(src - RExC_emit_start),
9030 (UV)(dst - RExC_emit_start),
9031 (UV)RExC_offsets[0]));
ccb2c380
MP
9032 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9033 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9034 }
7122b237 9035#endif
fac92740
MJD
9036 }
9037
a0d0e21e
LW
9038
9039 place = opnd; /* Op node, where operand used to be. */
7122b237 9040#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9041 if (RExC_offsets) { /* MJD */
07be1b83 9042 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9043 "reginsert",
ccb2c380 9044 __LINE__,
13d6edb4 9045 PL_reg_name[op],
07be1b83 9046 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9047 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9048 (UV)(place - RExC_emit_start),
9049 (UV)(RExC_parse - RExC_start),
786e8c11 9050 (UV)RExC_offsets[0]));
ccb2c380 9051 Set_Node_Offset(place, RExC_parse);
45948336 9052 Set_Node_Length(place, 1);
fac92740 9053 }
7122b237 9054#endif
c277df42
IZ
9055 src = NEXTOPER(place);
9056 FILL_ADVANCE_NODE(place, op);
9057 Zero(src, offset, regnode);
a687059c
LW
9058}
9059
9060/*
c277df42 9061- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9062- SEE ALSO: regtail_study
a0d0e21e 9063*/
097eb12c 9064/* TODO: All three parms should be const */
76e3520e 9065STATIC void
3dab1dad 9066S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9067{
97aff369 9068 dVAR;
c277df42 9069 register regnode *scan;
72f13be8 9070 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9071
9072 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9073#ifndef DEBUGGING
9074 PERL_UNUSED_ARG(depth);
9075#endif
a0d0e21e 9076
c277df42 9077 if (SIZE_ONLY)
a0d0e21e
LW
9078 return;
9079
9080 /* Find last node. */
9081 scan = p;
9082 for (;;) {
504618e9 9083 regnode * const temp = regnext(scan);
3dab1dad
YO
9084 DEBUG_PARSE_r({
9085 SV * const mysv=sv_newmortal();
9086 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9087 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9088 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9089 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9090 (temp == NULL ? "->" : ""),
13d6edb4 9091 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9092 );
3dab1dad
YO
9093 });
9094 if (temp == NULL)
9095 break;
9096 scan = temp;
9097 }
9098
9099 if (reg_off_by_arg[OP(scan)]) {
9100 ARG_SET(scan, val - scan);
9101 }
9102 else {
9103 NEXT_OFF(scan) = val - scan;
9104 }
9105}
9106
07be1b83 9107#ifdef DEBUGGING
3dab1dad
YO
9108/*
9109- regtail_study - set the next-pointer at the end of a node chain of p to val.
9110- Look for optimizable sequences at the same time.
9111- currently only looks for EXACT chains.
07be1b83
YO
9112
9113This is expermental code. The idea is to use this routine to perform
9114in place optimizations on branches and groups as they are constructed,
9115with the long term intention of removing optimization from study_chunk so
9116that it is purely analytical.
9117
9118Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9119to control which is which.
9120
3dab1dad
YO
9121*/
9122/* TODO: All four parms should be const */
07be1b83 9123
3dab1dad
YO
9124STATIC U8
9125S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9126{
9127 dVAR;
9128 register regnode *scan;
07be1b83
YO
9129 U8 exact = PSEUDO;
9130#ifdef EXPERIMENTAL_INPLACESCAN
9131 I32 min = 0;
9132#endif
3dab1dad
YO
9133 GET_RE_DEBUG_FLAGS_DECL;
9134
7918f24d
NC
9135 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9136
07be1b83 9137
3dab1dad
YO
9138 if (SIZE_ONLY)
9139 return exact;
9140
9141 /* Find last node. */
9142
9143 scan = p;
9144 for (;;) {
9145 regnode * const temp = regnext(scan);
07be1b83
YO
9146#ifdef EXPERIMENTAL_INPLACESCAN
9147 if (PL_regkind[OP(scan)] == EXACT)
9148 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9149 return EXACT;
9150#endif
3dab1dad
YO
9151 if ( exact ) {
9152 switch (OP(scan)) {
9153 case EXACT:
9154 case EXACTF:
9155 case EXACTFL:
9156 if( exact == PSEUDO )
9157 exact= OP(scan);
07be1b83
YO
9158 else if ( exact != OP(scan) )
9159 exact= 0;
3dab1dad
YO
9160 case NOTHING:
9161 break;
9162 default:
9163 exact= 0;
9164 }
9165 }
9166 DEBUG_PARSE_r({
9167 SV * const mysv=sv_newmortal();
9168 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9169 regprop(RExC_rx, mysv, scan);
eaf3ca90 9170 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9171 SvPV_nolen_const(mysv),
eaf3ca90 9172 REG_NODE_NUM(scan),
13d6edb4 9173 PL_reg_name[exact]);
3dab1dad 9174 });
a0d0e21e
LW
9175 if (temp == NULL)
9176 break;
9177 scan = temp;
9178 }
07be1b83
YO
9179 DEBUG_PARSE_r({
9180 SV * const mysv_val=sv_newmortal();
9181 DEBUG_PARSE_MSG("");
9182 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9183 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9184 SvPV_nolen_const(mysv_val),
9185 (IV)REG_NODE_NUM(val),
9186 (IV)(val - scan)
07be1b83
YO
9187 );
9188 });
c277df42
IZ
9189 if (reg_off_by_arg[OP(scan)]) {
9190 ARG_SET(scan, val - scan);
a0ed51b3
LW
9191 }
9192 else {
c277df42
IZ
9193 NEXT_OFF(scan) = val - scan;
9194 }
3dab1dad
YO
9195
9196 return exact;
a687059c 9197}
07be1b83 9198#endif
a687059c
LW
9199
9200/*
fd181c75 9201 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9202 */
f7819f85 9203#ifdef DEBUGGING
c33269f7 9204static void
7918f24d
NC
9205S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9206{
f7819f85
A
9207 int bit;
9208 int set=0;
7918f24d 9209
f7819f85
A
9210 for (bit=0; bit<32; bit++) {
9211 if (flags & (1<<bit)) {
9212 if (!set++ && lead)
9213 PerlIO_printf(Perl_debug_log, "%s",lead);
9214 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9215 }
9216 }
9217 if (lead) {
9218 if (set)
9219 PerlIO_printf(Perl_debug_log, "\n");
9220 else
9221 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9222 }
9223}
9224#endif
9225
a687059c 9226void
097eb12c 9227Perl_regdump(pTHX_ const regexp *r)
a687059c 9228{
35ff7856 9229#ifdef DEBUGGING
97aff369 9230 dVAR;
c445ea15 9231 SV * const sv = sv_newmortal();
ab3bbdeb 9232 SV *dsv= sv_newmortal();
f8fc2ecf 9233 RXi_GET_DECL(r,ri);
f7819f85 9234 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9235
7918f24d
NC
9236 PERL_ARGS_ASSERT_REGDUMP;
9237
f8fc2ecf 9238 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9239
9240 /* Header fields of interest. */
ab3bbdeb
YO
9241 if (r->anchored_substr) {
9242 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9243 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9244 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9245 "anchored %s%s at %"IVdf" ",
9246 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9247 (IV)r->anchored_offset);
ab3bbdeb
YO
9248 } else if (r->anchored_utf8) {
9249 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9250 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9251 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9252 "anchored utf8 %s%s at %"IVdf" ",
9253 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9254 (IV)r->anchored_offset);
ab3bbdeb
YO
9255 }
9256 if (r->float_substr) {
9257 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9258 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9259 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9260 "floating %s%s at %"IVdf"..%"UVuf" ",
9261 s, RE_SV_TAIL(r->float_substr),
7b0972df 9262 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9263 } else if (r->float_utf8) {
9264 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9265 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9266 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9267 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9268 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9269 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9270 }
33b8afdf 9271 if (r->check_substr || r->check_utf8)
b81d288d 9272 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9273 (const char *)
9274 (r->check_substr == r->float_substr
9275 && r->check_utf8 == r->float_utf8
9276 ? "(checking floating" : "(checking anchored"));
bbe252da 9277 if (r->extflags & RXf_NOSCAN)
c277df42 9278 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9279 if (r->extflags & RXf_CHECK_ALL)
c277df42 9280 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9281 if (r->check_substr || r->check_utf8)
c277df42
IZ
9282 PerlIO_printf(Perl_debug_log, ") ");
9283
f8fc2ecf
YO
9284 if (ri->regstclass) {
9285 regprop(r, sv, ri->regstclass);
1de06328 9286 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9287 }
bbe252da 9288 if (r->extflags & RXf_ANCH) {
774d564b 9289 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9290 if (r->extflags & RXf_ANCH_BOL)
774d564b 9291 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9292 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9293 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9294 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9295 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9296 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9297 PerlIO_printf(Perl_debug_log, "(GPOS)");
9298 PerlIO_putc(Perl_debug_log, ' ');
9299 }
bbe252da 9300 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9301 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9302 if (r->intflags & PREGf_SKIP)
760ac839 9303 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9304 if (r->intflags & PREGf_IMPLICIT)
760ac839 9305 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9306 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9307 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9308 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9309 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9310 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9311#else
7918f24d 9312 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9313 PERL_UNUSED_CONTEXT;
65e66c80 9314 PERL_UNUSED_ARG(r);
17c3b450 9315#endif /* DEBUGGING */
a687059c
LW
9316}
9317
9318/*
a0d0e21e
LW
9319- regprop - printable representation of opcode
9320*/
3339dfd8
YO
9321#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9322STMT_START { \
9323 if (do_sep) { \
9324 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9325 if (flags & ANYOF_INVERT) \
9326 /*make sure the invert info is in each */ \
9327 sv_catpvs(sv, "^"); \
9328 do_sep = 0; \
9329 } \
9330} STMT_END
9331
46fc3d4c 9332void
32fc9b6a 9333Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9334{
35ff7856 9335#ifdef DEBUGGING
97aff369 9336 dVAR;
9b155405 9337 register int k;
f8fc2ecf 9338 RXi_GET_DECL(prog,progi);
1de06328 9339 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9340
7918f24d 9341 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9342
76f68e9b 9343 sv_setpvs(sv, "");
8aa23a47 9344
03363afd 9345 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9346 /* It would be nice to FAIL() here, but this may be called from
9347 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9348 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9349 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9350
3dab1dad 9351 k = PL_regkind[OP(o)];
9b155405 9352
2a782b5b 9353 if (k == EXACT) {
f92a2122 9354 sv_catpvs(sv, " ");
ab3bbdeb
YO
9355 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9356 * is a crude hack but it may be the best for now since
9357 * we have no flag "this EXACTish node was UTF-8"
9358 * --jhi */
f92a2122
NC
9359 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9360 PERL_PV_ESCAPE_UNI_DETECT |
9361 PERL_PV_PRETTY_ELLIPSES |
9362 PERL_PV_PRETTY_LTGT |
9363 PERL_PV_PRETTY_NOCLEAR
9364 );
bb263b4e 9365 } else if (k == TRIE) {
3dab1dad 9366 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9367 * progi->data isn't available here */
1de06328 9368 const char op = OP(o);
647f639f 9369 const U32 n = ARG(o);
1de06328 9370 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9371 (reg_ac_data *)progi->data->data[n] :
1de06328 9372 NULL;
3251b653
NC
9373 const reg_trie_data * const trie
9374 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9375
13d6edb4 9376 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9377 DEBUG_TRIE_COMPILE_r(
9378 Perl_sv_catpvf(aTHX_ sv,
9379 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9380 (UV)trie->startstate,
1e2e3d02 9381 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9382 (UV)trie->wordcount,
9383 (UV)trie->minlen,
9384 (UV)trie->maxlen,
9385 (UV)TRIE_CHARCOUNT(trie),
9386 (UV)trie->uniquecharcount
9387 )
9388 );
9389 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9390 int i;
9391 int rangestart = -1;
f46cb337 9392 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9393 sv_catpvs(sv, "[");
1de06328
YO
9394 for (i = 0; i <= 256; i++) {
9395 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9396 if (rangestart == -1)
9397 rangestart = i;
9398 } else if (rangestart != -1) {
9399 if (i <= rangestart + 3)
9400 for (; rangestart < i; rangestart++)
9401 put_byte(sv, rangestart);
9402 else {
9403 put_byte(sv, rangestart);
9404 sv_catpvs(sv, "-");
9405 put_byte(sv, i - 1);
9406 }
9407 rangestart = -1;
9408 }
9409 }
f3a2811a 9410 sv_catpvs(sv, "]");
1de06328
YO
9411 }
9412
a3621e74 9413 } else if (k == CURLY) {
cb434fcc 9414 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9415 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9416 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9417 }
2c2d71f5
JH
9418 else if (k == WHILEM && o->flags) /* Ordinal/of */
9419 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9420 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9421 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9422 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9423 if ( k != REF || OP(o) < NREF) {
502c6561 9424 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9425 SV **name= av_fetch(list, ARG(o), 0 );
9426 if (name)
9427 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9428 }
9429 else {
502c6561 9430 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9431 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9432 I32 *nums=(I32*)SvPVX(sv_dat);
9433 SV **name= av_fetch(list, nums[0], 0 );
9434 I32 n;
9435 if (name) {
9436 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9437 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9438 (n ? "," : ""), (IV)nums[n]);
9439 }
9440 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9441 }
1f1031fe 9442 }
ee9b8eae 9443 }
1f1031fe 9444 } else if (k == GOSUB)
6bda09f9 9445 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9446 else if (k == VERB) {
9447 if (!o->flags)
9448 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9449 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9450 } else if (k == LOGICAL)
04ebc1ab 9451 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9452 else if (k == FOLDCHAR)
df44d732 9453 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9454 else if (k == ANYOF) {
9455 int i, rangestart = -1;
2d03de9c 9456 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9457 int do_sep = 0;
0bd48802
AL
9458
9459 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9460 static const char * const anyofs[] = {
653099ff
GS
9461 "\\w",
9462 "\\W",
9463 "\\s",
9464 "\\S",
9465 "\\d",
9466 "\\D",
9467 "[:alnum:]",
9468 "[:^alnum:]",
9469 "[:alpha:]",
9470 "[:^alpha:]",
9471 "[:ascii:]",
9472 "[:^ascii:]",
24d786f4
YO
9473 "[:cntrl:]",
9474 "[:^cntrl:]",
653099ff
GS
9475 "[:graph:]",
9476 "[:^graph:]",
9477 "[:lower:]",
9478 "[:^lower:]",
9479 "[:print:]",
9480 "[:^print:]",
9481 "[:punct:]",
9482 "[:^punct:]",
9483 "[:upper:]",
aaa51d5e 9484 "[:^upper:]",
653099ff 9485 "[:xdigit:]",
aaa51d5e
JF
9486 "[:^xdigit:]",
9487 "[:space:]",
9488 "[:^space:]",
9489 "[:blank:]",
9490 "[:^blank:]"
653099ff
GS
9491 };
9492
19860706 9493 if (flags & ANYOF_LOCALE)
396482e1 9494 sv_catpvs(sv, "{loc}");
19860706 9495 if (flags & ANYOF_FOLD)
396482e1 9496 sv_catpvs(sv, "{i}");
653099ff 9497 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9498 if (flags & ANYOF_INVERT)
396482e1 9499 sv_catpvs(sv, "^");
3339dfd8
YO
9500
9501 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9502 for (i = 0; i <= 256; i++) {
9503 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9504 if (rangestart == -1)
9505 rangestart = i;
9506 } else if (rangestart != -1) {
9507 if (i <= rangestart + 3)
9508 for (; rangestart < i; rangestart++)
653099ff 9509 put_byte(sv, rangestart);
ffc61ed2
JH
9510 else {
9511 put_byte(sv, rangestart);
396482e1 9512 sv_catpvs(sv, "-");
ffc61ed2 9513 put_byte(sv, i - 1);
653099ff 9514 }
24d786f4 9515 do_sep = 1;
ffc61ed2 9516 rangestart = -1;
653099ff 9517 }
847a199f 9518 }
3339dfd8
YO
9519
9520 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9521 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9522 if (o->flags & ANYOF_CLASS)
bb7a0f54 9523 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9524 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9525 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9526 do_sep = 1;
9527 }
9528
3339dfd8
YO
9529 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9530
9531 /* output information about the unicode matching */
ffc61ed2 9532 if (flags & ANYOF_UNICODE)
396482e1 9533 sv_catpvs(sv, "{unicode}");
1aa99e6b 9534 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9535 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9536
9537 {
9538 SV *lv;
32fc9b6a 9539 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9540
ffc61ed2
JH
9541 if (lv) {
9542 if (sw) {
89ebb4a3 9543 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9544
ffc61ed2 9545 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9546 uvchr_to_utf8(s, i);
ffc61ed2 9547
3568d838 9548 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9549 if (rangestart == -1)
9550 rangestart = i;
9551 } else if (rangestart != -1) {
ffc61ed2
JH
9552 if (i <= rangestart + 3)
9553 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9554 const U8 * const e = uvchr_to_utf8(s,rangestart);
9555 U8 *p;
9556 for(p = s; p < e; p++)
ffc61ed2
JH
9557 put_byte(sv, *p);
9558 }
9559 else {
2d03de9c
AL
9560 const U8 *e = uvchr_to_utf8(s,rangestart);
9561 U8 *p;
9562 for (p = s; p < e; p++)
ffc61ed2 9563 put_byte(sv, *p);
396482e1 9564 sv_catpvs(sv, "-");
2d03de9c
AL
9565 e = uvchr_to_utf8(s, i-1);
9566 for (p = s; p < e; p++)
1df70142 9567 put_byte(sv, *p);
ffc61ed2
JH
9568 }
9569 rangestart = -1;
9570 }
19860706 9571 }
ffc61ed2 9572
396482e1 9573 sv_catpvs(sv, "..."); /* et cetera */
19860706 9574 }
fde631ed 9575
ffc61ed2 9576 {
2e0de35c 9577 char *s = savesvpv(lv);
c445ea15 9578 char * const origs = s;
b81d288d 9579
3dab1dad
YO
9580 while (*s && *s != '\n')
9581 s++;
b81d288d 9582
ffc61ed2 9583 if (*s == '\n') {
2d03de9c 9584 const char * const t = ++s;
ffc61ed2
JH
9585
9586 while (*s) {
9587 if (*s == '\n')
9588 *s = ' ';
9589 s++;
9590 }
9591 if (s[-1] == ' ')
9592 s[-1] = 0;
9593
9594 sv_catpv(sv, t);
fde631ed 9595 }
b81d288d 9596
ffc61ed2 9597 Safefree(origs);
fde631ed
JH
9598 }
9599 }
653099ff 9600 }
ffc61ed2 9601
653099ff
GS
9602 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9603 }
9b155405 9604 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9605 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9606#else
96a5add6 9607 PERL_UNUSED_CONTEXT;
65e66c80
SP
9608 PERL_UNUSED_ARG(sv);
9609 PERL_UNUSED_ARG(o);
f9049ba1 9610 PERL_UNUSED_ARG(prog);
17c3b450 9611#endif /* DEBUGGING */
35ff7856 9612}
a687059c 9613
cad2e5aa 9614SV *
288b8c02 9615Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9616{ /* Assume that RE_INTUIT is set */
97aff369 9617 dVAR;
288b8c02 9618 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9619 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9620
9621 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9622 PERL_UNUSED_CONTEXT;
9623
a3621e74 9624 DEBUG_COMPILE_r(
cfd0369c 9625 {
2d03de9c 9626 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9627 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9628
9629 if (!PL_colorset) reginitcolors();
9630 PerlIO_printf(Perl_debug_log,
a0288114 9631 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9632 PL_colors[4],
9633 prog->check_substr ? "" : "utf8 ",
9634 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9635 s,
9636 PL_colors[1],
9637 (strlen(s) > 60 ? "..." : ""));
9638 } );
9639
33b8afdf 9640 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9641}
9642
84da74a7 9643/*
f8149455 9644 pregfree()
84da74a7 9645
f8149455
YO
9646 handles refcounting and freeing the perl core regexp structure. When
9647 it is necessary to actually free the structure the first thing it
3b753521 9648 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
9649 the regexp, allowing the handling of the void *pprivate; member
9650 first. (This routine is not overridable by extensions, which is why
9651 the extensions free is called first.)
9652
9653 See regdupe and regdupe_internal if you change anything here.
84da74a7 9654*/
f8149455 9655#ifndef PERL_IN_XSUB_RE
2b69d0c2 9656void
84679df5 9657Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9658{
288b8c02
NC
9659 SvREFCNT_dec(r);
9660}
9661
9662void
9663Perl_pregfree2(pTHX_ REGEXP *rx)
9664{
27da23d5 9665 dVAR;
288b8c02 9666 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9667 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9668
7918f24d
NC
9669 PERL_ARGS_ASSERT_PREGFREE2;
9670
28d8d7f4
YO
9671 if (r->mother_re) {
9672 ReREFCNT_dec(r->mother_re);
9673 } else {
288b8c02 9674 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9675 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9676 }
9677 if (r->substrs) {
ef8d46e8
VP
9678 SvREFCNT_dec(r->anchored_substr);
9679 SvREFCNT_dec(r->anchored_utf8);
9680 SvREFCNT_dec(r->float_substr);
9681 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9682 Safefree(r->substrs);
9683 }
288b8c02 9684 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9685#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9686 SvREFCNT_dec(r->saved_copy);
ed252734 9687#endif
f0ab9afb 9688 Safefree(r->offs);
f8149455 9689}
28d8d7f4
YO
9690
9691/* reg_temp_copy()
9692
9693 This is a hacky workaround to the structural issue of match results
9694 being stored in the regexp structure which is in turn stored in
9695 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9696 could be PL_curpm in multiple contexts, and could require multiple
9697 result sets being associated with the pattern simultaneously, such
9698 as when doing a recursive match with (??{$qr})
9699
9700 The solution is to make a lightweight copy of the regexp structure
9701 when a qr// is returned from the code executed by (??{$qr}) this
9702 lightweight copy doesnt actually own any of its data except for
9703 the starp/end and the actual regexp structure itself.
9704
9705*/
9706
9707
84679df5 9708REGEXP *
f0826785 9709Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9710{
f0826785 9711 struct regexp *ret;
288b8c02 9712 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9713 register const I32 npar = r->nparens+1;
7918f24d
NC
9714
9715 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9716
f0826785
BM
9717 if (!ret_x)
9718 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9719 ret = (struct regexp *)SvANY(ret_x);
9720
288b8c02 9721 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9722 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9723 by pointing directly at the buffer, but flagging that the allocated
9724 space in the copy is zero. As we've just done a struct copy, it's now
9725 a case of zero-ing that, rather than copying the current length. */
9726 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9727 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9728 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9729 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9730 SvLEN_set(ret_x, 0);
b9ad13ac 9731 SvSTASH_set(ret_x, NULL);
703c388d 9732 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9733 Newx(ret->offs, npar, regexp_paren_pair);
9734 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9735 if (r->substrs) {
28d8d7f4 9736 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9737 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9738
9739 SvREFCNT_inc_void(ret->anchored_substr);
9740 SvREFCNT_inc_void(ret->anchored_utf8);
9741 SvREFCNT_inc_void(ret->float_substr);
9742 SvREFCNT_inc_void(ret->float_utf8);
9743
9744 /* check_substr and check_utf8, if non-NULL, point to either their
9745 anchored or float namesakes, and don't hold a second reference. */
486913e4 9746 }
288b8c02 9747 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9748#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9749 ret->saved_copy = NULL;
28d8d7f4 9750#endif
288b8c02 9751 ret->mother_re = rx;
28d8d7f4 9752
288b8c02 9753 return ret_x;
28d8d7f4 9754}
f8149455
YO
9755#endif
9756
9757/* regfree_internal()
9758
9759 Free the private data in a regexp. This is overloadable by
9760 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 9761 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
9762 know about, however of course we have to handle the
9763 regexp_internal structure when no extension is in use.
9764
9765 Note this is called before freeing anything in the regexp
9766 structure.
9767 */
9768
9769void
288b8c02 9770Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9771{
9772 dVAR;
288b8c02 9773 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9774 RXi_GET_DECL(r,ri);
9775 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9776
9777 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9778
f8149455
YO
9779 DEBUG_COMPILE_r({
9780 if (!PL_colorset)
9781 reginitcolors();
9782 {
9783 SV *dsv= sv_newmortal();
3c8556c3 9784 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9785 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9786 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9787 PL_colors[4],PL_colors[5],s);
9788 }
9789 });
7122b237
YO
9790#ifdef RE_TRACK_PATTERN_OFFSETS
9791 if (ri->u.offsets)
9792 Safefree(ri->u.offsets); /* 20010421 MJD */
9793#endif
f8fc2ecf
YO
9794 if (ri->data) {
9795 int n = ri->data->count;
f3548bdc
DM
9796 PAD* new_comppad = NULL;
9797 PAD* old_comppad;
4026c95a 9798 PADOFFSET refcnt;
dfad63ad 9799
c277df42 9800 while (--n >= 0) {
261faec3 9801 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9802 switch (ri->data->what[n]) {
af534a04 9803 case 'a':
c277df42 9804 case 's':
81714fb9 9805 case 'S':
55eed653 9806 case 'u':
ad64d0ec 9807 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9808 break;
653099ff 9809 case 'f':
f8fc2ecf 9810 Safefree(ri->data->data[n]);
653099ff 9811 break;
dfad63ad 9812 case 'p':
502c6561 9813 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9814 break;
c277df42 9815 case 'o':
dfad63ad 9816 if (new_comppad == NULL)
cea2e8a9 9817 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9818 PAD_SAVE_LOCAL(old_comppad,
9819 /* Watch out for global destruction's random ordering. */
c445ea15 9820 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9821 );
b34c0dd4 9822 OP_REFCNT_LOCK;
f8fc2ecf 9823 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9824 OP_REFCNT_UNLOCK;
9825 if (!refcnt)
f8fc2ecf 9826 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9827
f3548bdc 9828 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9829 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9830 new_comppad = NULL;
c277df42
IZ
9831 break;
9832 case 'n':
9e55ce06 9833 break;
07be1b83 9834 case 'T':
be8e71aa
YO
9835 { /* Aho Corasick add-on structure for a trie node.
9836 Used in stclass optimization only */
07be1b83 9837 U32 refcount;
f8fc2ecf 9838 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9839 OP_REFCNT_LOCK;
9840 refcount = --aho->refcount;
9841 OP_REFCNT_UNLOCK;
9842 if ( !refcount ) {
446bd890
NC
9843 PerlMemShared_free(aho->states);
9844 PerlMemShared_free(aho->fail);
446bd890
NC
9845 /* do this last!!!! */
9846 PerlMemShared_free(ri->data->data[n]);
9847 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9848 }
9849 }
9850 break;
a3621e74 9851 case 't':
07be1b83 9852 {
be8e71aa 9853 /* trie structure. */
07be1b83 9854 U32 refcount;
f8fc2ecf 9855 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9856 OP_REFCNT_LOCK;
9857 refcount = --trie->refcount;
9858 OP_REFCNT_UNLOCK;
9859 if ( !refcount ) {
446bd890 9860 PerlMemShared_free(trie->charmap);
446bd890
NC
9861 PerlMemShared_free(trie->states);
9862 PerlMemShared_free(trie->trans);
07be1b83 9863 if (trie->bitmap)
446bd890 9864 PerlMemShared_free(trie->bitmap);
786e8c11 9865 if (trie->jump)
446bd890 9866 PerlMemShared_free(trie->jump);
2e64971a 9867 PerlMemShared_free(trie->wordinfo);
446bd890
NC
9868 /* do this last!!!! */
9869 PerlMemShared_free(ri->data->data[n]);
a3621e74 9870 }
07be1b83
YO
9871 }
9872 break;
c277df42 9873 default:
f8fc2ecf 9874 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9875 }
9876 }
f8fc2ecf
YO
9877 Safefree(ri->data->what);
9878 Safefree(ri->data);
a0d0e21e 9879 }
28d8d7f4 9880
f8fc2ecf 9881 Safefree(ri);
a687059c 9882}
c277df42 9883
a09252eb
NC
9884#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9885#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
9886#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9887
9888/*
32cd70f6 9889 re_dup - duplicate a regexp.
84da74a7 9890
8233f606
DM
9891 This routine is expected to clone a given regexp structure. It is only
9892 compiled under USE_ITHREADS.
32cd70f6 9893
f8149455
YO
9894 After all of the core data stored in struct regexp is duplicated
9895 the regexp_engine.dupe method is used to copy any private data
9896 stored in the *pprivate pointer. This allows extensions to handle
9897 any duplication it needs to do.
9898
9899 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9900*/
a3c0e9ca 9901#if defined(USE_ITHREADS)
f8149455 9902#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9903void
9904Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9905{
84da74a7 9906 dVAR;
a86a1ca7 9907 I32 npar;
288b8c02
NC
9908 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9909 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9910
7918f24d
NC
9911 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9912
84da74a7 9913 npar = r->nparens+1;
f0ab9afb
NC
9914 Newx(ret->offs, npar, regexp_paren_pair);
9915 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9916 if(ret->swap) {
28d8d7f4 9917 /* no need to copy these */
f0ab9afb 9918 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9919 }
84da74a7 9920
6057429f 9921 if (ret->substrs) {
32cd70f6
NC
9922 /* Do it this way to avoid reading from *r after the StructCopy().
9923 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9924 cache, it doesn't matter. */
66b1de87
NC
9925 const bool anchored = r->check_substr
9926 ? r->check_substr == r->anchored_substr
9927 : r->check_utf8 == r->anchored_utf8;
785a26d5 9928 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9929 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9930
32cd70f6
NC
9931 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9932 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9933 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9934 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9935
32cd70f6
NC
9936 /* check_substr and check_utf8, if non-NULL, point to either their
9937 anchored or float namesakes, and don't hold a second reference. */
9938
9939 if (ret->check_substr) {
9940 if (anchored) {
9941 assert(r->check_utf8 == r->anchored_utf8);
9942 ret->check_substr = ret->anchored_substr;
9943 ret->check_utf8 = ret->anchored_utf8;
9944 } else {
9945 assert(r->check_substr == r->float_substr);
9946 assert(r->check_utf8 == r->float_utf8);
9947 ret->check_substr = ret->float_substr;
9948 ret->check_utf8 = ret->float_utf8;
9949 }
66b1de87
NC
9950 } else if (ret->check_utf8) {
9951 if (anchored) {
9952 ret->check_utf8 = ret->anchored_utf8;
9953 } else {
9954 ret->check_utf8 = ret->float_utf8;
9955 }
32cd70f6 9956 }
6057429f 9957 }
f8149455 9958
5daac39c 9959 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9960
6057429f 9961 if (ret->pprivate)
288b8c02 9962 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9963
288b8c02 9964 if (RX_MATCH_COPIED(dstr))
6057429f 9965 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9966 else
9967 ret->subbeg = NULL;
9968#ifdef PERL_OLD_COPY_ON_WRITE
9969 ret->saved_copy = NULL;
9970#endif
6057429f 9971
c2123ae3
NC
9972 if (ret->mother_re) {
9973 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9974 /* Our storage points directly to our mother regexp, but that's
9975 1: a buffer in a different thread
9976 2: something we no longer hold a reference on
9977 so we need to copy it locally. */
9978 /* Note we need to sue SvCUR() on our mother_re, because it, in
9979 turn, may well be pointing to its own mother_re. */
9980 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9981 SvCUR(ret->mother_re)+1));
9982 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9983 }
9984 ret->mother_re = NULL;
9985 }
6057429f 9986 ret->gofs = 0;
f8149455
YO
9987}
9988#endif /* PERL_IN_XSUB_RE */
9989
9990/*
9991 regdupe_internal()
9992
9993 This is the internal complement to regdupe() which is used to copy
9994 the structure pointed to by the *pprivate pointer in the regexp.
9995 This is the core version of the extension overridable cloning hook.
9996 The regexp structure being duplicated will be copied by perl prior
9997 to this and will be provided as the regexp *r argument, however
9998 with the /old/ structures pprivate pointer value. Thus this routine
9999 may override any copying normally done by perl.
10000
10001 It returns a pointer to the new regexp_internal structure.
10002*/
10003
10004void *
288b8c02 10005Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10006{
10007 dVAR;
288b8c02 10008 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10009 regexp_internal *reti;
10010 int len, npar;
10011 RXi_GET_DECL(r,ri);
7918f24d
NC
10012
10013 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10014
10015 npar = r->nparens+1;
7122b237 10016 len = ProgLen(ri);
f8149455 10017
45cf4570 10018 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10019 Copy(ri->program, reti->program, len+1, regnode);
10020
f8149455 10021
f8fc2ecf 10022 reti->regstclass = NULL;
bcdf7404 10023
f8fc2ecf 10024 if (ri->data) {
84da74a7 10025 struct reg_data *d;
f8fc2ecf 10026 const int count = ri->data->count;
84da74a7
YO
10027 int i;
10028
10029 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10030 char, struct reg_data);
10031 Newx(d->what, count, U8);
10032
10033 d->count = count;
10034 for (i = 0; i < count; i++) {
f8fc2ecf 10035 d->what[i] = ri->data->what[i];
84da74a7 10036 switch (d->what[i]) {
af534a04 10037 /* legal options are one of: sSfpontTua
84da74a7 10038 see also regcomp.h and pregfree() */
af534a04 10039 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10040 case 's':
81714fb9 10041 case 'S':
0536c0a7 10042 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10043 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10044 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10045 break;
84da74a7
YO
10046 case 'f':
10047 /* This is cheating. */
10048 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10049 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10050 struct regnode_charclass_class);
f8fc2ecf 10051 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10052 break;
10053 case 'o':
bbe252da
YO
10054 /* Compiled op trees are readonly and in shared memory,
10055 and can thus be shared without duplication. */
84da74a7 10056 OP_REFCNT_LOCK;
f8fc2ecf 10057 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10058 OP_REFCNT_UNLOCK;
10059 break;
23eab42c
NC
10060 case 'T':
10061 /* Trie stclasses are readonly and can thus be shared
10062 * without duplication. We free the stclass in pregfree
10063 * when the corresponding reg_ac_data struct is freed.
10064 */
10065 reti->regstclass= ri->regstclass;
10066 /* Fall through */
84da74a7 10067 case 't':
84da74a7 10068 OP_REFCNT_LOCK;
0536c0a7 10069 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10070 OP_REFCNT_UNLOCK;
0536c0a7
NC
10071 /* Fall through */
10072 case 'n':
10073 d->data[i] = ri->data->data[i];
84da74a7 10074 break;
84da74a7 10075 default:
f8fc2ecf 10076 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10077 }
10078 }
10079
f8fc2ecf 10080 reti->data = d;
84da74a7
YO
10081 }
10082 else
f8fc2ecf 10083 reti->data = NULL;
84da74a7 10084
cde0cee5
YO
10085 reti->name_list_idx = ri->name_list_idx;
10086
7122b237
YO
10087#ifdef RE_TRACK_PATTERN_OFFSETS
10088 if (ri->u.offsets) {
10089 Newx(reti->u.offsets, 2*len+1, U32);
10090 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10091 }
10092#else
10093 SetProgLen(reti,len);
10094#endif
10095
f8149455 10096 return (void*)reti;
84da74a7 10097}
f8149455
YO
10098
10099#endif /* USE_ITHREADS */
84da74a7 10100
f8149455 10101#ifndef PERL_IN_XSUB_RE
bcdf7404 10102
c277df42
IZ
10103/*
10104 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10105 */
10106regnode *
864dbfa3 10107Perl_regnext(pTHX_ register regnode *p)
c277df42 10108{
97aff369 10109 dVAR;
c277df42
IZ
10110 register I32 offset;
10111
f8fc2ecf 10112 if (!p)
c277df42
IZ
10113 return(NULL);
10114
35db910f
KW
10115 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10116 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10117 }
10118
c277df42
IZ
10119 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10120 if (offset == 0)
10121 return(NULL);
10122
c277df42 10123 return(p+offset);
c277df42 10124}
76234dfb 10125#endif
c277df42 10126
01f988be 10127STATIC void
cea2e8a9 10128S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10129{
10130 va_list args;
10131 STRLEN l1 = strlen(pat1);
10132 STRLEN l2 = strlen(pat2);
10133 char buf[512];
06bf62c7 10134 SV *msv;
73d840c0 10135 const char *message;
c277df42 10136
7918f24d
NC
10137 PERL_ARGS_ASSERT_RE_CROAK2;
10138
c277df42
IZ
10139 if (l1 > 510)
10140 l1 = 510;
10141 if (l1 + l2 > 510)
10142 l2 = 510 - l1;
10143 Copy(pat1, buf, l1 , char);
10144 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10145 buf[l1 + l2] = '\n';
10146 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10147#ifdef I_STDARG
10148 /* ANSI variant takes additional second argument */
c277df42 10149 va_start(args, pat2);
8736538c
AS
10150#else
10151 va_start(args);
10152#endif
5a844595 10153 msv = vmess(buf, &args);
c277df42 10154 va_end(args);
cfd0369c 10155 message = SvPV_const(msv,l1);
c277df42
IZ
10156 if (l1 > 512)
10157 l1 = 512;
10158 Copy(message, buf, l1 , char);
197cf9b9 10159 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10160 Perl_croak(aTHX_ "%s", buf);
c277df42 10161}
a0ed51b3
LW
10162
10163/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10164
76234dfb 10165#ifndef PERL_IN_XSUB_RE
a0ed51b3 10166void
864dbfa3 10167Perl_save_re_context(pTHX)
b81d288d 10168{
97aff369 10169 dVAR;
1ade1aa1
NC
10170
10171 struct re_save_state *state;
10172
10173 SAVEVPTR(PL_curcop);
10174 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10175
10176 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10177 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10178 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10179
46ab3289 10180 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10181
a0ed51b3 10182 PL_reg_start_tmp = 0;
a0ed51b3 10183 PL_reg_start_tmpl = 0;
c445ea15 10184 PL_reg_oldsaved = NULL;
a5db57d6 10185 PL_reg_oldsavedlen = 0;
a5db57d6 10186 PL_reg_maxiter = 0;
a5db57d6 10187 PL_reg_leftiter = 0;
c445ea15 10188 PL_reg_poscache = NULL;
a5db57d6 10189 PL_reg_poscache_size = 0;
1ade1aa1
NC
10190#ifdef PERL_OLD_COPY_ON_WRITE
10191 PL_nrs = NULL;
10192#endif
ada6e8a9 10193
c445ea15
AL
10194 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10195 if (PL_curpm) {
10196 const REGEXP * const rx = PM_GETRE(PL_curpm);
10197 if (rx) {
1df70142 10198 U32 i;
07bc277f 10199 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10200 char digits[TYPE_CHARS(long)];
d9fad198 10201 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10202 GV *const *const gvp
10203 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10204
b37c2d43
AL
10205 if (gvp) {
10206 GV * const gv = *gvp;
10207 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10208 save_scalar(gv);
49f27e4b 10209 }
ada6e8a9
AMS
10210 }
10211 }
10212 }
a0ed51b3 10213}
76234dfb 10214#endif
51371543 10215
51371543 10216static void
acfe0abc 10217clear_re(pTHX_ void *r)
51371543 10218{
97aff369 10219 dVAR;
84679df5 10220 ReREFCNT_dec((REGEXP *)r);
51371543 10221}
ffbc6a93 10222
a28509cc
AL
10223#ifdef DEBUGGING
10224
10225STATIC void
10226S_put_byte(pTHX_ SV *sv, int c)
10227{
7918f24d
NC
10228 PERL_ARGS_ASSERT_PUT_BYTE;
10229
7fddd944
NC
10230 /* Our definition of isPRINT() ignores locales, so only bytes that are
10231 not part of UTF-8 are considered printable. I assume that the same
10232 holds for UTF-EBCDIC.
10233 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10234 which Wikipedia says:
10235
10236 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10237 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10238 identical, to the ASCII delete (DEL) or rubout control character.
10239 ) So the old condition can be simplified to !isPRINT(c) */
10240 if (!isPRINT(c))
a28509cc 10241 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 10242 else {
88c9ea1e 10243 const char string = c;
5e7aa789
NC
10244 if (c == '-' || c == ']' || c == '\\' || c == '^')
10245 sv_catpvs(sv, "\\");
10246 sv_catpvn(sv, &string, 1);
10247 }
a28509cc
AL
10248}
10249
786e8c11 10250
3dab1dad
YO
10251#define CLEAR_OPTSTART \
10252 if (optstart) STMT_START { \
70685ca0 10253 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10254 optstart=NULL; \
10255 } STMT_END
10256
786e8c11 10257#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10258
b5a2f8d8
NC
10259STATIC const regnode *
10260S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10261 const regnode *last, const regnode *plast,
10262 SV* sv, I32 indent, U32 depth)
a28509cc 10263{
97aff369 10264 dVAR;
786e8c11 10265 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10266 register const regnode *next;
3dab1dad 10267 const regnode *optstart= NULL;
1f1031fe 10268
f8fc2ecf 10269 RXi_GET_DECL(r,ri);
3dab1dad 10270 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10271
10272 PERL_ARGS_ASSERT_DUMPUNTIL;
10273
786e8c11
YO
10274#ifdef DEBUG_DUMPUNTIL
10275 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10276 last ? last-start : 0,plast ? plast-start : 0);
10277#endif
10278
10279 if (plast && plast < last)
10280 last= plast;
10281
10282 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10283 /* While that wasn't END last time... */
a28509cc
AL
10284 NODE_ALIGN(node);
10285 op = OP(node);
de734bd5 10286 if (op == CLOSE || op == WHILEM)
786e8c11 10287 indent--;
b5a2f8d8 10288 next = regnext((regnode *)node);
1f1031fe 10289
a28509cc 10290 /* Where, what. */
8e11feef 10291 if (OP(node) == OPTIMIZED) {
e68ec53f 10292 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10293 optstart = node;
3dab1dad 10294 else
8e11feef 10295 goto after_print;
3dab1dad
YO
10296 } else
10297 CLEAR_OPTSTART;
1f1031fe 10298
32fc9b6a 10299 regprop(r, sv, node);
a28509cc 10300 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10301 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10302
10303 if (OP(node) != OPTIMIZED) {
10304 if (next == NULL) /* Next ptr. */
10305 PerlIO_printf(Perl_debug_log, " (0)");
10306 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10307 PerlIO_printf(Perl_debug_log, " (FAIL)");
10308 else
10309 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10310 (void)PerlIO_putc(Perl_debug_log, '\n');
10311 }
10312
a28509cc
AL
10313 after_print:
10314 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10315 assert(next);
10316 {
10317 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10318 ? regnext((regnode *)next)
10319 : next);
be8e71aa
YO
10320 if (last && nnode > last)
10321 nnode = last;
786e8c11 10322 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10323 }
a28509cc
AL
10324 }
10325 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10326 assert(next);
786e8c11 10327 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10328 }
10329 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10330 const regnode *this_trie = node;
1de06328 10331 const char op = OP(node);
647f639f 10332 const U32 n = ARG(node);
1de06328 10333 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10334 (reg_ac_data *)ri->data->data[n] :
1de06328 10335 NULL;
3251b653
NC
10336 const reg_trie_data * const trie =
10337 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10338#ifdef DEBUGGING
502c6561 10339 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10340#endif
786e8c11 10341 const regnode *nextbranch= NULL;
a28509cc 10342 I32 word_idx;
76f68e9b 10343 sv_setpvs(sv, "");
786e8c11 10344 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10345 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10346
10347 PerlIO_printf(Perl_debug_log, "%*s%s ",
10348 (int)(2*(indent+3)), "",
10349 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10350 PL_colors[0], PL_colors[1],
10351 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10352 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10353 PERL_PV_PRETTY_LTGT
786e8c11
YO
10354 )
10355 : "???"
10356 );
10357 if (trie->jump) {
40d049e4 10358 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10359 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10360 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10361 if (dist) {
10362 if (!nextbranch)
24b23f37 10363 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10364 DUMPUNTIL(this_trie + dist, nextbranch);
10365 }
786e8c11
YO
10366 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10367 nextbranch= regnext((regnode *)nextbranch);
10368 } else {
10369 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10370 }
786e8c11
YO
10371 }
10372 if (last && next > last)
10373 node= last;
10374 else
10375 node= next;
a28509cc 10376 }
786e8c11
YO
10377 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10378 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10379 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10380 }
10381 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10382 assert(next);
786e8c11 10383 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10384 }
10385 else if ( op == PLUS || op == STAR) {
786e8c11 10386 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10387 }
10388 else if (op == ANYOF) {
10389 /* arglen 1 + class block */
10390 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10391 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10392 node = NEXTOPER(node);
10393 }
10394 else if (PL_regkind[(U8)op] == EXACT) {
10395 /* Literal string, where present. */
10396 node += NODE_SZ_STR(node) - 1;
10397 node = NEXTOPER(node);
10398 }
10399 else {
10400 node = NEXTOPER(node);
10401 node += regarglen[(U8)op];
10402 }
10403 if (op == CURLYX || op == OPEN)
786e8c11 10404 indent++;
a28509cc 10405 }
3dab1dad 10406 CLEAR_OPTSTART;
786e8c11 10407#ifdef DEBUG_DUMPUNTIL
70685ca0 10408 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10409#endif
1de06328 10410 return node;
a28509cc
AL
10411}
10412
10413#endif /* DEBUGGING */
10414
241d1a3b
NC
10415/*
10416 * Local variables:
10417 * c-indentation-style: bsd
10418 * c-basic-offset: 4
10419 * indent-tabs-mode: t
10420 * End:
10421 *
37442d52
RGS
10422 * ex: set ts=8 sts=4 sw=4 noet:
10423 */