This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Dual-life Env
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
8951c461 713 cl->flags |= ANYOF_FOLD;
653099ff
GS
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
653099ff
GS
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
771 int i;
772
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
776 else
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
1aa99e6b 782
8951c461
KW
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
785
3ff7ceb3 786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 787 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 788 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
789 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
790 only the one(s)
791 actually set */
1aa99e6b
IH
792 ARG_SET(cl, ARG(and_with));
793 }
14ebb1a2
JH
794 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
795 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 796 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 797 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 798 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 799 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
800}
801
802/* 'OR' a given class with another one. Can create false positives */
803/* We assume that cl is not inverted */
804STATIC void
097eb12c 805S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 806{
7918f24d
NC
807 PERL_ARGS_ASSERT_CL_OR;
808
653099ff
GS
809 if (or_with->flags & ANYOF_INVERT) {
810 /* We do not use
811 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
812 * <= (B1 | !B2) | (CL1 | !CL2)
813 * which is wasteful if CL2 is small, but we ignore CL2:
814 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
815 * XXXX Can we handle case-fold? Unclear:
816 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
817 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
818 */
819 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
820 && !(or_with->flags & ANYOF_FOLD)
821 && !(cl->flags & ANYOF_FOLD) ) {
822 int i;
823
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= ~or_with->bitmap[i];
826 } /* XXXX: logic is complicated otherwise */
827 else {
830247a4 828 cl_anything(pRExC_state, cl);
653099ff
GS
829 }
830 } else {
831 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
832 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 833 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
834 || (cl->flags & ANYOF_FOLD)) ) {
835 int i;
836
837 /* OR char bitmap and class bitmap separately */
838 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
839 cl->bitmap[i] |= or_with->bitmap[i];
840 if (or_with->flags & ANYOF_CLASS) {
841 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
842 cl->classflags[i] |= or_with->classflags[i];
843 cl->flags |= ANYOF_CLASS;
844 }
845 }
846 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 847 cl_anything(pRExC_state, cl);
653099ff
GS
848 }
849 }
850 if (or_with->flags & ANYOF_EOS)
851 cl->flags |= ANYOF_EOS;
1aa99e6b 852
8951c461
KW
853 if (or_with->flags & ANYOF_FOLD)
854 cl->flags |= ANYOF_FOLD;
855
9826f543
KW
856 /* If both nodes match something outside the bitmap, but what they match
857 * outside is not the same pointer, and hence not easily compared, give up
858 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 859 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
860 ARG(cl) != ARG(or_with)) {
861 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 862 }
9826f543 863
1aa99e6b
IH
864 if (or_with->flags & ANYOF_UNICODE_ALL) {
865 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 866 }
653099ff
GS
867}
868
a3621e74
YO
869#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
870#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
871#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
872#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
873
3dab1dad
YO
874
875#ifdef DEBUGGING
07be1b83 876/*
2b8b4781
NC
877 dump_trie(trie,widecharmap,revcharmap)
878 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
879 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
880
881 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
882 The _interim_ variants are used for debugging the interim
883 tables that are used to generate the final compressed
884 representation which is what dump_trie expects.
885
3dab1dad
YO
886 Part of the reason for their existance is to provide a form
887 of documentation as to how the different representations function.
07be1b83
YO
888
889*/
3dab1dad
YO
890
891/*
3dab1dad
YO
892 Dumps the final compressed table form of the trie to Perl_debug_log.
893 Used for debugging make_trie().
894*/
b9a59e08 895
3dab1dad 896STATIC void
2b8b4781
NC
897S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
898 AV *revcharmap, U32 depth)
3dab1dad
YO
899{
900 U32 state;
ab3bbdeb 901 SV *sv=sv_newmortal();
55eed653 902 int colwidth= widecharmap ? 6 : 4;
2e64971a 903 U16 word;
3dab1dad
YO
904 GET_RE_DEBUG_FLAGS_DECL;
905
7918f24d 906 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 907
3dab1dad
YO
908 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
909 (int)depth * 2 + 2,"",
910 "Match","Base","Ofs" );
911
912 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 913 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 914 if ( tmp ) {
ab3bbdeb
YO
915 PerlIO_printf( Perl_debug_log, "%*s",
916 colwidth,
ddc5bc0f 917 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
918 PL_colors[0], PL_colors[1],
919 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
920 PERL_PV_ESCAPE_FIRSTCHAR
921 )
922 );
3dab1dad
YO
923 }
924 }
925 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
926 (int)depth * 2 + 2,"");
927
928 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 929 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
930 PerlIO_printf( Perl_debug_log, "\n");
931
1e2e3d02 932 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 933 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
934
935 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
936
937 if ( trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
939 } else {
940 PerlIO_printf( Perl_debug_log, "%6s", "" );
941 }
942
943 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
944
945 if ( base ) {
946 U32 ofs = 0;
947
948 while( ( base + ofs < trie->uniquecharcount ) ||
949 ( base + ofs - trie->uniquecharcount < trie->lasttrans
950 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
951 ofs++;
952
953 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
954
955 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
956 if ( ( base + ofs >= trie->uniquecharcount ) &&
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
958 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
959 {
ab3bbdeb
YO
960 PerlIO_printf( Perl_debug_log, "%*"UVXf,
961 colwidth,
3dab1dad
YO
962 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
963 } else {
ab3bbdeb 964 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
965 }
966 }
967
968 PerlIO_printf( Perl_debug_log, "]");
969
970 }
971 PerlIO_printf( Perl_debug_log, "\n" );
972 }
2e64971a
DM
973 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
974 for (word=1; word <= trie->wordcount; word++) {
975 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
976 (int)word, (int)(trie->wordinfo[word].prev),
977 (int)(trie->wordinfo[word].len));
978 }
979 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
980}
981/*
3dab1dad
YO
982 Dumps a fully constructed but uncompressed trie in list form.
983 List tries normally only are used for construction when the number of
984 possible chars (trie->uniquecharcount) is very high.
985 Used for debugging make_trie().
986*/
987STATIC void
55eed653 988S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
989 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 U32 depth)
3dab1dad
YO
991{
992 U32 state;
ab3bbdeb 993 SV *sv=sv_newmortal();
55eed653 994 int colwidth= widecharmap ? 6 : 4;
3dab1dad 995 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
996
997 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
998
3dab1dad 999 /* print out the table precompression. */
ab3bbdeb
YO
1000 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002 "------:-----+-----------------\n" );
3dab1dad
YO
1003
1004 for( state=1 ; state < next_alloc ; state ++ ) {
1005 U16 charid;
1006
ab3bbdeb 1007 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1008 (int)depth * 2 + 2,"", (UV)state );
1009 if ( ! trie->states[ state ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, "%5s| ","");
1011 } else {
1012 PerlIO_printf( Perl_debug_log, "W%4x| ",
1013 trie->states[ state ].wordnum
1014 );
1015 }
1016 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1017 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1018 if ( tmp ) {
1019 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1020 colwidth,
ddc5bc0f 1021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1022 PL_colors[0], PL_colors[1],
1023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024 PERL_PV_ESCAPE_FIRSTCHAR
1025 ) ,
1e2e3d02
YO
1026 TRIE_LIST_ITEM(state,charid).forid,
1027 (UV)TRIE_LIST_ITEM(state,charid).newstate
1028 );
1029 if (!(charid % 10))
664e119d
RGS
1030 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031 (int)((depth * 2) + 14), "");
1e2e3d02 1032 }
ab3bbdeb
YO
1033 }
1034 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1035 }
1036}
1037
1038/*
3dab1dad
YO
1039 Dumps a fully constructed but uncompressed trie in table form.
1040 This is the normal DFA style state transition table, with a few
1041 twists to facilitate compression later.
1042 Used for debugging make_trie().
1043*/
1044STATIC void
55eed653 1045S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1046 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1047 U32 depth)
3dab1dad
YO
1048{
1049 U32 state;
1050 U16 charid;
ab3bbdeb 1051 SV *sv=sv_newmortal();
55eed653 1052 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1053 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1054
1055 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1056
1057 /*
1058 print out the table precompression so that we can do a visual check
1059 that they are identical.
1060 */
1061
1062 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1063
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1065 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1066 if ( tmp ) {
ab3bbdeb
YO
1067 PerlIO_printf( Perl_debug_log, "%*s",
1068 colwidth,
ddc5bc0f 1069 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1070 PL_colors[0], PL_colors[1],
1071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072 PERL_PV_ESCAPE_FIRSTCHAR
1073 )
1074 );
3dab1dad
YO
1075 }
1076 }
1077
1078 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1079
1080 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1081 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1082 }
1083
1084 PerlIO_printf( Perl_debug_log, "\n" );
1085
1086 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1087
1088 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089 (int)depth * 2 + 2,"",
1090 (UV)TRIE_NODENUM( state ) );
1091
1092 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1093 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1094 if (v)
1095 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1096 else
1097 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1098 }
1099 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1101 } else {
1102 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1104 }
1105 }
07be1b83 1106}
3dab1dad
YO
1107
1108#endif
1109
2e64971a 1110
786e8c11
YO
1111/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112 startbranch: the first branch in the whole branch sequence
1113 first : start branch of sequence of branch-exact nodes.
1114 May be the same as startbranch
1115 last : Thing following the last branch.
1116 May be the same as tail.
1117 tail : item following the branch sequence
1118 count : words in the sequence
1119 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120 depth : indent depth
3dab1dad 1121
786e8c11 1122Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1123
786e8c11
YO
1124A trie is an N'ary tree where the branches are determined by digital
1125decomposition of the key. IE, at the root node you look up the 1st character and
1126follow that branch repeat until you find the end of the branches. Nodes can be
1127marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1128
786e8c11 1129 /he|she|his|hers/
72f13be8 1130
786e8c11
YO
1131would convert into the following structure. Numbers represent states, letters
1132following numbers represent valid transitions on the letter from that state, if
1133the number is in square brackets it represents an accepting state, otherwise it
1134will be in parenthesis.
07be1b83 1135
786e8c11
YO
1136 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1137 | |
1138 | (2)
1139 | |
1140 (1) +-i->(6)-+-s->[7]
1141 |
1142 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1143
786e8c11
YO
1144 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1145
1146This shows that when matching against the string 'hers' we will begin at state 1
1147read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150single traverse. We store a mapping from accepting to state to which word was
1151matched, and then when we have multiple possibilities we try to complete the
1152rest of the regex in the order in which they occured in the alternation.
1153
1154The only prior NFA like behaviour that would be changed by the TRIE support is
1155the silent ignoring of duplicate alternations which are of the form:
1156
1157 / (DUPE|DUPE) X? (?{ ... }) Y /x
1158
4b714af6 1159Thus EVAL blocks following a trie may be called a different number of times with
786e8c11
YO
1160and without the optimisation. With the optimisations dupes will be silently
1161ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162the following demonstrates:
1163
1164 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1165
1166which prints out 'word' three times, but
1167
1168 'words'=~/(word|word|word)(?{ print $1 })S/
1169
1170which doesnt print it out at all. This is due to other optimisations kicking in.
1171
1172Example of what happens on a structural level:
1173
1174The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1175
1176 1: CURLYM[1] {1,32767}(18)
1177 5: BRANCH(8)
1178 6: EXACT <ac>(16)
1179 8: BRANCH(11)
1180 9: EXACT <ad>(16)
1181 11: BRANCH(14)
1182 12: EXACT <ab>(16)
1183 16: SUCCEED(0)
1184 17: NOTHING(18)
1185 18: END(0)
1186
1187This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188and should turn into:
1189
1190 1: CURLYM[1] {1,32767}(18)
1191 5: TRIE(16)
1192 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1193 <ac>
1194 <ad>
1195 <ab>
1196 16: SUCCEED(0)
1197 17: NOTHING(18)
1198 18: END(0)
1199
1200Cases where tail != last would be like /(?foo|bar)baz/:
1201
1202 1: BRANCH(4)
1203 2: EXACT <foo>(8)
1204 4: BRANCH(7)
1205 5: EXACT <bar>(8)
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211and would end up looking like:
1212
1213 1: TRIE(8)
1214 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1215 <foo>
1216 <bar>
1217 7: TAIL(8)
1218 8: EXACT <baz>(10)
1219 10: END(0)
1220
1221 d = uvuni_to_utf8_flags(d, uv, 0);
1222
1223is the recommended Unicode-aware way of saying
1224
1225 *(d++) = uv;
1226*/
1227
1e2e3d02 1228#define TRIE_STORE_REVCHAR \
786e8c11 1229 STMT_START { \
73031816
NC
1230 if (UTF) { \
1231 SV *zlopp = newSV(2); \
88c9ea1e
CB
1232 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1233 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1234 SvCUR_set(zlopp, kapow - flrbbbbb); \
1235 SvPOK_on(zlopp); \
1236 SvUTF8_on(zlopp); \
1237 av_push(revcharmap, zlopp); \
1238 } else { \
6bdeddd2 1239 char ooooff = (char)uvc; \
73031816
NC
1240 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1241 } \
1242 } STMT_END
786e8c11
YO
1243
1244#define TRIE_READ_CHAR STMT_START { \
1245 wordlen++; \
1246 if ( UTF ) { \
1247 if ( folder ) { \
1248 if ( foldlen > 0 ) { \
1249 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 scan += len; \
1252 len = 0; \
1253 } else { \
1254 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1256 foldlen -= UNISKIP( uvc ); \
1257 scan = foldbuf + UNISKIP( uvc ); \
1258 } \
1259 } else { \
1260 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1261 } \
1262 } else { \
1263 uvc = (U32)*uc; \
1264 len = 1; \
1265 } \
1266} STMT_END
1267
1268
1269
1270#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1271 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1272 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1273 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1274 } \
1275 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1277 TRIE_LIST_CUR( state )++; \
1278} STMT_END
07be1b83 1279
786e8c11
YO
1280#define TRIE_LIST_NEW(state) STMT_START { \
1281 Newxz( trie->states[ state ].trans.list, \
1282 4, reg_trie_trans_le ); \
1283 TRIE_LIST_CUR( state ) = 1; \
1284 TRIE_LIST_LEN( state ) = 4; \
1285} STMT_END
07be1b83 1286
786e8c11
YO
1287#define TRIE_HANDLE_WORD(state) STMT_START { \
1288 U16 dupe= trie->states[ state ].wordnum; \
1289 regnode * const noper_next = regnext( noper ); \
1290 \
786e8c11
YO
1291 DEBUG_r({ \
1292 /* store the word for dumping */ \
1293 SV* tmp; \
1294 if (OP(noper) != NOTHING) \
740cce10 1295 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1296 else \
740cce10 1297 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1298 av_push( trie_words, tmp ); \
786e8c11
YO
1299 }); \
1300 \
1301 curword++; \
2e64971a
DM
1302 trie->wordinfo[curword].prev = 0; \
1303 trie->wordinfo[curword].len = wordlen; \
1304 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1305 \
1306 if ( noper_next < tail ) { \
1307 if (!trie->jump) \
c944940b 1308 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1309 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1310 if (!jumper) \
1311 jumper = noper_next; \
1312 if (!nextbranch) \
1313 nextbranch= regnext(cur); \
1314 } \
1315 \
1316 if ( dupe ) { \
2e64971a
DM
1317 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1318 /* chain, so that when the bits of chain are later */\
1319 /* linked together, the dups appear in the chain */\
1320 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1322 } else { \
1323 /* we haven't inserted this word yet. */ \
1324 trie->states[ state ].wordnum = curword; \
1325 } \
1326} STMT_END
07be1b83 1327
3dab1dad 1328
786e8c11
YO
1329#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1330 ( ( base + charid >= ucharcount \
1331 && base + charid < ubound \
1332 && state == trie->trans[ base - ucharcount + charid ].check \
1333 && trie->trans[ base - ucharcount + charid ].next ) \
1334 ? trie->trans[ base - ucharcount + charid ].next \
1335 : ( state==1 ? special : 0 ) \
1336 )
3dab1dad 1337
786e8c11
YO
1338#define MADE_TRIE 1
1339#define MADE_JUMP_TRIE 2
1340#define MADE_EXACT_TRIE 4
3dab1dad 1341
a3621e74 1342STATIC I32
786e8c11 1343S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1344{
27da23d5 1345 dVAR;
a3621e74
YO
1346 /* first pass, loop through and scan words */
1347 reg_trie_data *trie;
55eed653 1348 HV *widecharmap = NULL;
2b8b4781 1349 AV *revcharmap = newAV();
a3621e74 1350 regnode *cur;
9f7f3913 1351 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1352 STRLEN len = 0;
1353 UV uvc = 0;
1354 U16 curword = 0;
1355 U32 next_alloc = 0;
786e8c11
YO
1356 regnode *jumper = NULL;
1357 regnode *nextbranch = NULL;
7f69552c 1358 regnode *convert = NULL;
2e64971a 1359 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1360 /* we just use folder as a flag in utf8 */
1e696034 1361 const U8 * folder = NULL;
a3621e74 1362
2b8b4781
NC
1363#ifdef DEBUGGING
1364 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1365 AV *trie_words = NULL;
1366 /* along with revcharmap, this only used during construction but both are
1367 * useful during debugging so we store them in the struct when debugging.
8e11feef 1368 */
2b8b4781
NC
1369#else
1370 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1371 STRLEN trie_charcount=0;
3dab1dad 1372#endif
2b8b4781 1373 SV *re_trie_maxbuff;
a3621e74 1374 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1375
1376 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1377#ifndef DEBUGGING
1378 PERL_UNUSED_ARG(depth);
1379#endif
a3621e74 1380
1e696034
KW
1381 switch (flags) {
1382 case EXACTFU: folder = PL_fold_latin1; break;
1383 case EXACTF: folder = PL_fold; break;
1384 case EXACTFL: folder = PL_fold_locale; break;
1385 }
1386
c944940b 1387 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1388 trie->refcount = 1;
3dab1dad 1389 trie->startstate = 1;
786e8c11 1390 trie->wordcount = word_count;
f8fc2ecf 1391 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1392 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1393 if (!(UTF && folder))
c944940b 1394 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1395 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1397
a3621e74 1398 DEBUG_r({
2b8b4781 1399 trie_words = newAV();
a3621e74 1400 });
a3621e74 1401
0111c4fd 1402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1403 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1405 }
3dab1dad
YO
1406 DEBUG_OPTIMISE_r({
1407 PerlIO_printf( Perl_debug_log,
786e8c11 1408 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1409 (int)depth * 2 + 2, "",
1410 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1411 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1412 (int)depth);
3dab1dad 1413 });
7f69552c
YO
1414
1415 /* Find the node we are going to overwrite */
1416 if ( first == startbranch && OP( last ) != BRANCH ) {
1417 /* whole branch chain */
1418 convert = first;
1419 } else {
1420 /* branch sub-chain */
1421 convert = NEXTOPER( first );
1422 }
1423
a3621e74
YO
1424 /* -- First loop and Setup --
1425
1426 We first traverse the branches and scan each word to determine if it
1427 contains widechars, and how many unique chars there are, this is
1428 important as we have to build a table with at least as many columns as we
1429 have unique chars.
1430
1431 We use an array of integers to represent the character codes 0..255
38a44b82 1432 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1433 native representation of the character value as the key and IV's for the
1434 coded index.
1435
1436 *TODO* If we keep track of how many times each character is used we can
1437 remap the columns so that the table compression later on is more
3b753521 1438 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1439 middle and the least common are on the outside. IMO this would be better
1440 than a most to least common mapping as theres a decent chance the most
1441 common letter will share a node with the least common, meaning the node
1442 will not be compressable. With a middle is most common approach the worst
1443 case is when we have the least common nodes twice.
1444
1445 */
1446
a3621e74 1447 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1448 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1449 const U8 *uc = (U8*)STRING( noper );
a28509cc 1450 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1451 STRLEN foldlen = 0;
1452 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1453 const U8 *scan = (U8*)NULL;
07be1b83 1454 U32 wordlen = 0; /* required init */
02daf0ab
YO
1455 STRLEN chars = 0;
1456 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1457
3dab1dad
YO
1458 if (OP(noper) == NOTHING) {
1459 trie->minlen= 0;
1460 continue;
1461 }
02daf0ab
YO
1462 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464 regardless of encoding */
1465
a3621e74 1466 for ( ; uc < e ; uc += len ) {
3dab1dad 1467 TRIE_CHARCOUNT(trie)++;
a3621e74 1468 TRIE_READ_CHAR;
3dab1dad 1469 chars++;
a3621e74
YO
1470 if ( uvc < 256 ) {
1471 if ( !trie->charmap[ uvc ] ) {
1472 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1473 if ( folder )
1474 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1475 TRIE_STORE_REVCHAR;
a3621e74 1476 }
02daf0ab 1477 if ( set_bit ) {
62012aee
KW
1478 /* store the codepoint in the bitmap, and its folded
1479 * equivalent. */
02daf0ab 1480 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1481
1482 /* store the folded codepoint */
1483 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1484
1485 if ( !UTF ) {
1486 /* store first byte of utf8 representation of
acdf4139
KW
1487 variant codepoints */
1488 if (! UNI_IS_INVARIANT(uvc)) {
1489 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1490 }
1491 }
02daf0ab
YO
1492 set_bit = 0; /* We've done our bit :-) */
1493 }
a3621e74
YO
1494 } else {
1495 SV** svpp;
55eed653
NC
1496 if ( !widecharmap )
1497 widecharmap = newHV();
a3621e74 1498
55eed653 1499 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1500
1501 if ( !svpp )
e4584336 1502 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1503
1504 if ( !SvTRUE( *svpp ) ) {
1505 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1506 TRIE_STORE_REVCHAR;
a3621e74
YO
1507 }
1508 }
1509 }
3dab1dad
YO
1510 if( cur == first ) {
1511 trie->minlen=chars;
1512 trie->maxlen=chars;
1513 } else if (chars < trie->minlen) {
1514 trie->minlen=chars;
1515 } else if (chars > trie->maxlen) {
1516 trie->maxlen=chars;
1517 }
1518
a3621e74
YO
1519 } /* end first pass */
1520 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1521 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1522 (int)depth * 2 + 2,"",
55eed653 1523 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1524 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1525 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1526 );
a3621e74
YO
1527
1528 /*
1529 We now know what we are dealing with in terms of unique chars and
1530 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1531 representation using a flat table will take. If it's over a reasonable
1532 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1533 conservative but potentially much slower representation using an array
1534 of lists.
1535
1536 At the end we convert both representations into the same compressed
1537 form that will be used in regexec.c for matching with. The latter
1538 is a form that cannot be used to construct with but has memory
1539 properties similar to the list form and access properties similar
1540 to the table form making it both suitable for fast searches and
1541 small enough that its feasable to store for the duration of a program.
1542
1543 See the comment in the code where the compressed table is produced
1544 inplace from the flat tabe representation for an explanation of how
1545 the compression works.
1546
1547 */
1548
1549
2e64971a
DM
1550 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1551 prev_states[1] = 0;
1552
3dab1dad 1553 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1554 /*
1555 Second Pass -- Array Of Lists Representation
1556
1557 Each state will be represented by a list of charid:state records
1558 (reg_trie_trans_le) the first such element holds the CUR and LEN
1559 points of the allocated array. (See defines above).
1560
1561 We build the initial structure using the lists, and then convert
1562 it into the compressed table form which allows faster lookups
1563 (but cant be modified once converted).
a3621e74
YO
1564 */
1565
a3621e74
YO
1566 STRLEN transcount = 1;
1567
1e2e3d02
YO
1568 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1569 "%*sCompiling trie using list compiler\n",
1570 (int)depth * 2 + 2, ""));
446bd890 1571
c944940b
JH
1572 trie->states = (reg_trie_state *)
1573 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1574 sizeof(reg_trie_state) );
a3621e74
YO
1575 TRIE_LIST_NEW(1);
1576 next_alloc = 2;
1577
1578 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1579
c445ea15
AL
1580 regnode * const noper = NEXTOPER( cur );
1581 U8 *uc = (U8*)STRING( noper );
1582 const U8 * const e = uc + STR_LEN( noper );
1583 U32 state = 1; /* required init */
1584 U16 charid = 0; /* sanity init */
1585 U8 *scan = (U8*)NULL; /* sanity init */
1586 STRLEN foldlen = 0; /* required init */
07be1b83 1587 U32 wordlen = 0; /* required init */
c445ea15
AL
1588 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1589
3dab1dad 1590 if (OP(noper) != NOTHING) {
786e8c11 1591 for ( ; uc < e ; uc += len ) {
c445ea15 1592
786e8c11 1593 TRIE_READ_CHAR;
c445ea15 1594
786e8c11
YO
1595 if ( uvc < 256 ) {
1596 charid = trie->charmap[ uvc ];
c445ea15 1597 } else {
55eed653 1598 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1599 if ( !svpp ) {
1600 charid = 0;
1601 } else {
1602 charid=(U16)SvIV( *svpp );
1603 }
c445ea15 1604 }
786e8c11
YO
1605 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1606 if ( charid ) {
a3621e74 1607
786e8c11
YO
1608 U16 check;
1609 U32 newstate = 0;
a3621e74 1610
786e8c11
YO
1611 charid--;
1612 if ( !trie->states[ state ].trans.list ) {
1613 TRIE_LIST_NEW( state );
c445ea15 1614 }
786e8c11
YO
1615 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1616 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1617 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1618 break;
1619 }
1620 }
1621 if ( ! newstate ) {
1622 newstate = next_alloc++;
2e64971a 1623 prev_states[newstate] = state;
786e8c11
YO
1624 TRIE_LIST_PUSH( state, charid, newstate );
1625 transcount++;
1626 }
1627 state = newstate;
1628 } else {
1629 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1630 }
a28509cc 1631 }
c445ea15 1632 }
3dab1dad 1633 TRIE_HANDLE_WORD(state);
a3621e74
YO
1634
1635 } /* end second pass */
1636
1e2e3d02
YO
1637 /* next alloc is the NEXT state to be allocated */
1638 trie->statecount = next_alloc;
c944940b
JH
1639 trie->states = (reg_trie_state *)
1640 PerlMemShared_realloc( trie->states,
1641 next_alloc
1642 * sizeof(reg_trie_state) );
a3621e74 1643
3dab1dad 1644 /* and now dump it out before we compress it */
2b8b4781
NC
1645 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1646 revcharmap, next_alloc,
1647 depth+1)
1e2e3d02 1648 );
a3621e74 1649
c944940b
JH
1650 trie->trans = (reg_trie_trans *)
1651 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1652 {
1653 U32 state;
a3621e74
YO
1654 U32 tp = 0;
1655 U32 zp = 0;
1656
1657
1658 for( state=1 ; state < next_alloc ; state ++ ) {
1659 U32 base=0;
1660
1661 /*
1662 DEBUG_TRIE_COMPILE_MORE_r(
1663 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1664 );
1665 */
1666
1667 if (trie->states[state].trans.list) {
1668 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1669 U16 maxid=minid;
a28509cc 1670 U16 idx;
a3621e74
YO
1671
1672 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1673 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1674 if ( forid < minid ) {
1675 minid=forid;
1676 } else if ( forid > maxid ) {
1677 maxid=forid;
1678 }
a3621e74
YO
1679 }
1680 if ( transcount < tp + maxid - minid + 1) {
1681 transcount *= 2;
c944940b
JH
1682 trie->trans = (reg_trie_trans *)
1683 PerlMemShared_realloc( trie->trans,
446bd890
NC
1684 transcount
1685 * sizeof(reg_trie_trans) );
a3621e74
YO
1686 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1687 }
1688 base = trie->uniquecharcount + tp - minid;
1689 if ( maxid == minid ) {
1690 U32 set = 0;
1691 for ( ; zp < tp ; zp++ ) {
1692 if ( ! trie->trans[ zp ].next ) {
1693 base = trie->uniquecharcount + zp - minid;
1694 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1695 trie->trans[ zp ].check = state;
1696 set = 1;
1697 break;
1698 }
1699 }
1700 if ( !set ) {
1701 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1702 trie->trans[ tp ].check = state;
1703 tp++;
1704 zp = tp;
1705 }
1706 } else {
1707 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1708 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1709 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1710 trie->trans[ tid ].check = state;
1711 }
1712 tp += ( maxid - minid + 1 );
1713 }
1714 Safefree(trie->states[ state ].trans.list);
1715 }
1716 /*
1717 DEBUG_TRIE_COMPILE_MORE_r(
1718 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1719 );
1720 */
1721 trie->states[ state ].trans.base=base;
1722 }
cc601c31 1723 trie->lasttrans = tp + 1;
a3621e74
YO
1724 }
1725 } else {
1726 /*
1727 Second Pass -- Flat Table Representation.
1728
1729 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1730 We know that we will need Charcount+1 trans at most to store the data
1731 (one row per char at worst case) So we preallocate both structures
1732 assuming worst case.
1733
1734 We then construct the trie using only the .next slots of the entry
1735 structs.
1736
3b753521 1737 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1738 make compression both faster and easier by keeping track of how many non
1739 zero fields are in the node.
1740
1741 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1742 transition.
1743
1744 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1745 number representing the first entry of the node, and state as a
1746 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1747 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1748 are 2 entrys per node. eg:
1749
1750 A B A B
1751 1. 2 4 1. 3 7
1752 2. 0 3 3. 0 5
1753 3. 0 0 5. 0 0
1754 4. 0 0 7. 0 0
1755
1756 The table is internally in the right hand, idx form. However as we also
1757 have to deal with the states array which is indexed by nodenum we have to
1758 use TRIE_NODENUM() to convert.
1759
1760 */
1e2e3d02
YO
1761 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1762 "%*sCompiling trie using table compiler\n",
1763 (int)depth * 2 + 2, ""));
3dab1dad 1764
c944940b
JH
1765 trie->trans = (reg_trie_trans *)
1766 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1767 * trie->uniquecharcount + 1,
1768 sizeof(reg_trie_trans) );
1769 trie->states = (reg_trie_state *)
1770 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1771 sizeof(reg_trie_state) );
a3621e74
YO
1772 next_alloc = trie->uniquecharcount + 1;
1773
3dab1dad 1774
a3621e74
YO
1775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1776
c445ea15 1777 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1778 const U8 *uc = (U8*)STRING( noper );
1779 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1780
1781 U32 state = 1; /* required init */
1782
1783 U16 charid = 0; /* sanity init */
1784 U32 accept_state = 0; /* sanity init */
1785 U8 *scan = (U8*)NULL; /* sanity init */
1786
1787 STRLEN foldlen = 0; /* required init */
07be1b83 1788 U32 wordlen = 0; /* required init */
a3621e74
YO
1789 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1790
3dab1dad 1791 if ( OP(noper) != NOTHING ) {
786e8c11 1792 for ( ; uc < e ; uc += len ) {
a3621e74 1793
786e8c11 1794 TRIE_READ_CHAR;
a3621e74 1795
786e8c11
YO
1796 if ( uvc < 256 ) {
1797 charid = trie->charmap[ uvc ];
1798 } else {
55eed653 1799 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1800 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1801 }
786e8c11
YO
1802 if ( charid ) {
1803 charid--;
1804 if ( !trie->trans[ state + charid ].next ) {
1805 trie->trans[ state + charid ].next = next_alloc;
1806 trie->trans[ state ].check++;
2e64971a
DM
1807 prev_states[TRIE_NODENUM(next_alloc)]
1808 = TRIE_NODENUM(state);
786e8c11
YO
1809 next_alloc += trie->uniquecharcount;
1810 }
1811 state = trie->trans[ state + charid ].next;
1812 } else {
1813 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1814 }
1815 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1816 }
a3621e74 1817 }
3dab1dad
YO
1818 accept_state = TRIE_NODENUM( state );
1819 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1820
1821 } /* end second pass */
1822
3dab1dad 1823 /* and now dump it out before we compress it */
2b8b4781
NC
1824 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1825 revcharmap,
1826 next_alloc, depth+1));
a3621e74 1827
a3621e74
YO
1828 {
1829 /*
1830 * Inplace compress the table.*
1831
1832 For sparse data sets the table constructed by the trie algorithm will
1833 be mostly 0/FAIL transitions or to put it another way mostly empty.
1834 (Note that leaf nodes will not contain any transitions.)
1835
1836 This algorithm compresses the tables by eliminating most such
1837 transitions, at the cost of a modest bit of extra work during lookup:
1838
1839 - Each states[] entry contains a .base field which indicates the
1840 index in the state[] array wheres its transition data is stored.
1841
3b753521 1842 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1843
1844 - If .base is nonzero then charid is added to it to find an entry in
1845 the trans array.
1846
1847 -If trans[states[state].base+charid].check!=state then the
1848 transition is taken to be a 0/Fail transition. Thus if there are fail
1849 transitions at the front of the node then the .base offset will point
1850 somewhere inside the previous nodes data (or maybe even into a node
1851 even earlier), but the .check field determines if the transition is
1852 valid.
1853
786e8c11 1854 XXX - wrong maybe?
a3621e74 1855 The following process inplace converts the table to the compressed
3b753521 1856 table: We first do not compress the root node 1,and mark all its
a3621e74 1857 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1858 allows us to do a DFA construction from the compressed table later,
1859 and ensures that any .base pointers we calculate later are greater
1860 than 0.
a3621e74
YO
1861
1862 - We set 'pos' to indicate the first entry of the second node.
1863
1864 - We then iterate over the columns of the node, finding the first and
1865 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1866 and set the .check pointers accordingly, and advance pos
1867 appropriately and repreat for the next node. Note that when we copy
1868 the next pointers we have to convert them from the original
1869 NODEIDX form to NODENUM form as the former is not valid post
1870 compression.
1871
1872 - If a node has no transitions used we mark its base as 0 and do not
1873 advance the pos pointer.
1874
1875 - If a node only has one transition we use a second pointer into the
1876 structure to fill in allocated fail transitions from other states.
1877 This pointer is independent of the main pointer and scans forward
1878 looking for null transitions that are allocated to a state. When it
1879 finds one it writes the single transition into the "hole". If the
786e8c11 1880 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1881
1882 - Once compressed we can Renew/realloc the structures to release the
1883 excess space.
1884
1885 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1886 specifically Fig 3.47 and the associated pseudocode.
1887
1888 demq
1889 */
a3b680e6 1890 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1891 U32 state, charid;
a3621e74 1892 U32 pos = 0, zp=0;
1e2e3d02 1893 trie->statecount = laststate;
a3621e74
YO
1894
1895 for ( state = 1 ; state < laststate ; state++ ) {
1896 U8 flag = 0;
a28509cc
AL
1897 const U32 stateidx = TRIE_NODEIDX( state );
1898 const U32 o_used = trie->trans[ stateidx ].check;
1899 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1900 trie->trans[ stateidx ].check = 0;
1901
1902 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1903 if ( flag || trie->trans[ stateidx + charid ].next ) {
1904 if ( trie->trans[ stateidx + charid ].next ) {
1905 if (o_used == 1) {
1906 for ( ; zp < pos ; zp++ ) {
1907 if ( ! trie->trans[ zp ].next ) {
1908 break;
1909 }
1910 }
1911 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1912 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913 trie->trans[ zp ].check = state;
1914 if ( ++zp > pos ) pos = zp;
1915 break;
1916 }
1917 used--;
1918 }
1919 if ( !flag ) {
1920 flag = 1;
1921 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1922 }
1923 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1924 trie->trans[ pos ].check = state;
1925 pos++;
1926 }
1927 }
1928 }
cc601c31 1929 trie->lasttrans = pos + 1;
c944940b
JH
1930 trie->states = (reg_trie_state *)
1931 PerlMemShared_realloc( trie->states, laststate
1932 * sizeof(reg_trie_state) );
a3621e74 1933 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1934 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1935 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1936 (int)depth * 2 + 2,"",
1937 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1938 (IV)next_alloc,
1939 (IV)pos,
a3621e74
YO
1940 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1941 );
1942
1943 } /* end table compress */
1944 }
1e2e3d02
YO
1945 DEBUG_TRIE_COMPILE_MORE_r(
1946 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1947 (int)depth * 2 + 2, "",
1948 (UV)trie->statecount,
1949 (UV)trie->lasttrans)
1950 );
cc601c31 1951 /* resize the trans array to remove unused space */
c944940b
JH
1952 trie->trans = (reg_trie_trans *)
1953 PerlMemShared_realloc( trie->trans, trie->lasttrans
1954 * sizeof(reg_trie_trans) );
a3621e74 1955
3b753521 1956 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1957 U8 nodetype =(U8)(flags & 0xFF);
1958 char *str=NULL;
786e8c11 1959
07be1b83 1960#ifdef DEBUGGING
e62cc96a 1961 regnode *optimize = NULL;
7122b237
YO
1962#ifdef RE_TRACK_PATTERN_OFFSETS
1963
b57a0404
JH
1964 U32 mjd_offset = 0;
1965 U32 mjd_nodelen = 0;
7122b237
YO
1966#endif /* RE_TRACK_PATTERN_OFFSETS */
1967#endif /* DEBUGGING */
a3621e74 1968 /*
3dab1dad
YO
1969 This means we convert either the first branch or the first Exact,
1970 depending on whether the thing following (in 'last') is a branch
1971 or not and whther first is the startbranch (ie is it a sub part of
1972 the alternation or is it the whole thing.)
3b753521 1973 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1974 the whole branch sequence, including the first.
a3621e74 1975 */
3dab1dad 1976 /* Find the node we are going to overwrite */
7f69552c 1977 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1978 /* branch sub-chain */
3dab1dad 1979 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1980#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1981 DEBUG_r({
1982 mjd_offset= Node_Offset((convert));
1983 mjd_nodelen= Node_Length((convert));
1984 });
7122b237 1985#endif
7f69552c 1986 /* whole branch chain */
7122b237
YO
1987 }
1988#ifdef RE_TRACK_PATTERN_OFFSETS
1989 else {
7f69552c
YO
1990 DEBUG_r({
1991 const regnode *nop = NEXTOPER( convert );
1992 mjd_offset= Node_Offset((nop));
1993 mjd_nodelen= Node_Length((nop));
1994 });
07be1b83
YO
1995 }
1996 DEBUG_OPTIMISE_r(
1997 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1998 (int)depth * 2 + 2, "",
786e8c11 1999 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2000 );
7122b237 2001#endif
3dab1dad
YO
2002 /* But first we check to see if there is a common prefix we can
2003 split out as an EXACT and put in front of the TRIE node. */
2004 trie->startstate= 1;
55eed653 2005 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2006 U32 state;
1e2e3d02 2007 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2008 U32 ofs = 0;
8e11feef
RGS
2009 I32 idx = -1;
2010 U32 count = 0;
2011 const U32 base = trie->states[ state ].trans.base;
a3621e74 2012
3dab1dad 2013 if ( trie->states[state].wordnum )
8e11feef 2014 count = 1;
a3621e74 2015
8e11feef 2016 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2017 if ( ( base + ofs >= trie->uniquecharcount ) &&
2018 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2019 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2020 {
3dab1dad 2021 if ( ++count > 1 ) {
2b8b4781 2022 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2023 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2024 if ( state == 1 ) break;
3dab1dad
YO
2025 if ( count == 2 ) {
2026 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2027 DEBUG_OPTIMISE_r(
8e11feef
RGS
2028 PerlIO_printf(Perl_debug_log,
2029 "%*sNew Start State=%"UVuf" Class: [",
2030 (int)depth * 2 + 2, "",
786e8c11 2031 (UV)state));
be8e71aa 2032 if (idx >= 0) {
2b8b4781 2033 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2034 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2035
3dab1dad 2036 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2039 DEBUG_OPTIMISE_r(
f1f66076 2040 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2041 );
8e11feef
RGS
2042 }
2043 }
2044 TRIE_BITMAP_SET(trie,*ch);
2045 if ( folder )
2046 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2047 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2048 }
2049 idx = ofs;
2050 }
3dab1dad
YO
2051 }
2052 if ( count == 1 ) {
2b8b4781 2053 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2054 STRLEN len;
2055 char *ch = SvPV( *tmp, len );
de734bd5
A
2056 DEBUG_OPTIMISE_r({
2057 SV *sv=sv_newmortal();
8e11feef
RGS
2058 PerlIO_printf( Perl_debug_log,
2059 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2060 (int)depth * 2 + 2, "",
de734bd5
A
2061 (UV)state, (UV)idx,
2062 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2063 PL_colors[0], PL_colors[1],
2064 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2065 PERL_PV_ESCAPE_FIRSTCHAR
2066 )
2067 );
2068 });
3dab1dad
YO
2069 if ( state==1 ) {
2070 OP( convert ) = nodetype;
2071 str=STRING(convert);
2072 STR_LEN(convert)=0;
2073 }
c490c714
YO
2074 STR_LEN(convert) += len;
2075 while (len--)
de734bd5 2076 *str++ = *ch++;
8e11feef 2077 } else {
f9049ba1 2078#ifdef DEBUGGING
8e11feef
RGS
2079 if (state>1)
2080 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2081#endif
8e11feef
RGS
2082 break;
2083 }
2084 }
2e64971a 2085 trie->prefixlen = (state-1);
3dab1dad 2086 if (str) {
8e11feef 2087 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2088 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2089 trie->startstate = state;
07be1b83
YO
2090 trie->minlen -= (state - 1);
2091 trie->maxlen -= (state - 1);
33809eae
JH
2092#ifdef DEBUGGING
2093 /* At least the UNICOS C compiler choked on this
2094 * being argument to DEBUG_r(), so let's just have
2095 * it right here. */
2096 if (
2097#ifdef PERL_EXT_RE_BUILD
2098 1
2099#else
2100 DEBUG_r_TEST
2101#endif
2102 ) {
2103 regnode *fix = convert;
2104 U32 word = trie->wordcount;
2105 mjd_nodelen++;
2106 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2107 while( ++fix < n ) {
2108 Set_Node_Offset_Length(fix, 0, 0);
2109 }
2110 while (word--) {
2111 SV ** const tmp = av_fetch( trie_words, word, 0 );
2112 if (tmp) {
2113 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2114 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2115 else
2116 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2117 }
2118 }
2119 }
2120#endif
8e11feef
RGS
2121 if (trie->maxlen) {
2122 convert = n;
2123 } else {
3dab1dad 2124 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2125 DEBUG_r(optimize= n);
3dab1dad
YO
2126 }
2127 }
2128 }
a5ca303d
YO
2129 if (!jumper)
2130 jumper = last;
3dab1dad 2131 if ( trie->maxlen ) {
8e11feef
RGS
2132 NEXT_OFF( convert ) = (U16)(tail - convert);
2133 ARG_SET( convert, data_slot );
786e8c11
YO
2134 /* Store the offset to the first unabsorbed branch in
2135 jump[0], which is otherwise unused by the jump logic.
2136 We use this when dumping a trie and during optimisation. */
2137 if (trie->jump)
7f69552c 2138 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2139
6c48061a
YO
2140 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2141 * and there is a bitmap
2142 * and the first "jump target" node we found leaves enough room
2143 * then convert the TRIE node into a TRIEC node, with the bitmap
2144 * embedded inline in the opcode - this is hypothetically faster.
2145 */
2146 if ( !trie->states[trie->startstate].wordnum
2147 && trie->bitmap
2148 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2149 {
2150 OP( convert ) = TRIEC;
2151 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2152 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2153 trie->bitmap= NULL;
2154 } else
2155 OP( convert ) = TRIE;
a3621e74 2156
3dab1dad
YO
2157 /* store the type in the flags */
2158 convert->flags = nodetype;
a5ca303d
YO
2159 DEBUG_r({
2160 optimize = convert
2161 + NODE_STEP_REGNODE
2162 + regarglen[ OP( convert ) ];
2163 });
2164 /* XXX We really should free up the resource in trie now,
2165 as we won't use them - (which resources?) dmq */
3dab1dad 2166 }
a3621e74 2167 /* needed for dumping*/
e62cc96a 2168 DEBUG_r(if (optimize) {
07be1b83 2169 regnode *opt = convert;
bcdf7404 2170
e62cc96a 2171 while ( ++opt < optimize) {
07be1b83
YO
2172 Set_Node_Offset_Length(opt,0,0);
2173 }
786e8c11
YO
2174 /*
2175 Try to clean up some of the debris left after the
2176 optimisation.
a3621e74 2177 */
786e8c11 2178 while( optimize < jumper ) {
07be1b83 2179 mjd_nodelen += Node_Length((optimize));
a3621e74 2180 OP( optimize ) = OPTIMIZED;
07be1b83 2181 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2182 optimize++;
2183 }
07be1b83 2184 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2185 });
2186 } /* end node insert */
2e64971a
DM
2187
2188 /* Finish populating the prev field of the wordinfo array. Walk back
2189 * from each accept state until we find another accept state, and if
2190 * so, point the first word's .prev field at the second word. If the
2191 * second already has a .prev field set, stop now. This will be the
2192 * case either if we've already processed that word's accept state,
3b753521
FN
2193 * or that state had multiple words, and the overspill words were
2194 * already linked up earlier.
2e64971a
DM
2195 */
2196 {
2197 U16 word;
2198 U32 state;
2199 U16 prev;
2200
2201 for (word=1; word <= trie->wordcount; word++) {
2202 prev = 0;
2203 if (trie->wordinfo[word].prev)
2204 continue;
2205 state = trie->wordinfo[word].accept;
2206 while (state) {
2207 state = prev_states[state];
2208 if (!state)
2209 break;
2210 prev = trie->states[state].wordnum;
2211 if (prev)
2212 break;
2213 }
2214 trie->wordinfo[word].prev = prev;
2215 }
2216 Safefree(prev_states);
2217 }
2218
2219
2220 /* and now dump out the compressed format */
2221 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2222
55eed653 2223 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2224#ifdef DEBUGGING
2225 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2226 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2227#else
2228 SvREFCNT_dec(revcharmap);
07be1b83 2229#endif
786e8c11
YO
2230 return trie->jump
2231 ? MADE_JUMP_TRIE
2232 : trie->startstate>1
2233 ? MADE_EXACT_TRIE
2234 : MADE_TRIE;
2235}
2236
2237STATIC void
2238S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2239{
3b753521 2240/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2241
2242 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2243 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2244 ISBN 0-201-10088-6
2245
2246 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2247 suffix of the current state's 'word' that is also a proper prefix of another word in our
2248 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2249 the DFA not to have to restart after its tried and failed a word at a given point, it
2250 simply continues as though it had been matching the other word in the first place.
2251 Consider
2252 'abcdgu'=~/abcdefg|cdgu/
2253 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2254 fail, which would bring us to the state representing 'd' in the second word where we would
2255 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2256 */
2257 /* add a fail transition */
3251b653
NC
2258 const U32 trie_offset = ARG(source);
2259 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2260 U32 *q;
2261 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2262 const U32 numstates = trie->statecount;
786e8c11
YO
2263 const U32 ubound = trie->lasttrans + ucharcount;
2264 U32 q_read = 0;
2265 U32 q_write = 0;
2266 U32 charid;
2267 U32 base = trie->states[ 1 ].trans.base;
2268 U32 *fail;
2269 reg_ac_data *aho;
2270 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2271 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2272
2273 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2274#ifndef DEBUGGING
2275 PERL_UNUSED_ARG(depth);
2276#endif
2277
2278
2279 ARG_SET( stclass, data_slot );
c944940b 2280 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2281 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2282 aho->trie=trie_offset;
446bd890
NC
2283 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2284 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2285 Newxz( q, numstates, U32);
c944940b 2286 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2287 aho->refcount = 1;
2288 fail = aho->fail;
2289 /* initialize fail[0..1] to be 1 so that we always have
2290 a valid final fail state */
2291 fail[ 0 ] = fail[ 1 ] = 1;
2292
2293 for ( charid = 0; charid < ucharcount ; charid++ ) {
2294 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2295 if ( newstate ) {
2296 q[ q_write ] = newstate;
2297 /* set to point at the root */
2298 fail[ q[ q_write++ ] ]=1;
2299 }
2300 }
2301 while ( q_read < q_write) {
2302 const U32 cur = q[ q_read++ % numstates ];
2303 base = trie->states[ cur ].trans.base;
2304
2305 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2306 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2307 if (ch_state) {
2308 U32 fail_state = cur;
2309 U32 fail_base;
2310 do {
2311 fail_state = fail[ fail_state ];
2312 fail_base = aho->states[ fail_state ].trans.base;
2313 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2314
2315 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2316 fail[ ch_state ] = fail_state;
2317 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2318 {
2319 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2320 }
2321 q[ q_write++ % numstates] = ch_state;
2322 }
2323 }
2324 }
2325 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2326 when we fail in state 1, this allows us to use the
2327 charclass scan to find a valid start char. This is based on the principle
2328 that theres a good chance the string being searched contains lots of stuff
2329 that cant be a start char.
2330 */
2331 fail[ 0 ] = fail[ 1 ] = 0;
2332 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sStclass Failtable (%"UVuf" states): 0",
2335 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2336 );
786e8c11
YO
2337 for( q_read=1; q_read<numstates; q_read++ ) {
2338 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2339 }
2340 PerlIO_printf(Perl_debug_log, "\n");
2341 });
2342 Safefree(q);
2343 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2344}
2345
786e8c11 2346
a3621e74 2347/*
5d1c421c
JH
2348 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2349 * These need to be revisited when a newer toolchain becomes available.
2350 */
2351#if defined(__sparc64__) && defined(__GNUC__)
2352# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2353# undef SPARC64_GCC_WORKAROUND
2354# define SPARC64_GCC_WORKAROUND 1
2355# endif
2356#endif
2357
07be1b83 2358#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2359 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2360 SV * const mysv=sv_newmortal(); \
2361 regnode *Next = regnext(scan); \
2362 regprop(RExC_rx, mysv, scan); \
7f69552c 2363 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2364 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2365 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2366 }});
07be1b83 2367
1de06328
YO
2368
2369
2370
2371
07be1b83
YO
2372#define JOIN_EXACT(scan,min,flags) \
2373 if (PL_regkind[OP(scan)] == EXACT) \
2374 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2375
be8e71aa 2376STATIC U32
07be1b83
YO
2377S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2378 /* Merge several consecutive EXACTish nodes into one. */
2379 regnode *n = regnext(scan);
2380 U32 stringok = 1;
2381 regnode *next = scan + NODE_SZ_STR(scan);
2382 U32 merged = 0;
2383 U32 stopnow = 0;
2384#ifdef DEBUGGING
2385 regnode *stop = scan;
72f13be8 2386 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2387#else
d47053eb
RGS
2388 PERL_UNUSED_ARG(depth);
2389#endif
7918f24d
NC
2390
2391 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2392#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2393 PERL_UNUSED_ARG(flags);
2394 PERL_UNUSED_ARG(val);
07be1b83 2395#endif
07be1b83
YO
2396 DEBUG_PEEP("join",scan,depth);
2397
2398 /* Skip NOTHING, merge EXACT*. */
2399 while (n &&
2400 ( PL_regkind[OP(n)] == NOTHING ||
2401 (stringok && (OP(n) == OP(scan))))
2402 && NEXT_OFF(n)
2403 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2404
2405 if (OP(n) == TAIL || n > next)
2406 stringok = 0;
2407 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2408 DEBUG_PEEP("skip:",n,depth);
2409 NEXT_OFF(scan) += NEXT_OFF(n);
2410 next = n + NODE_STEP_REGNODE;
2411#ifdef DEBUGGING
2412 if (stringok)
2413 stop = n;
2414#endif
2415 n = regnext(n);
2416 }
2417 else if (stringok) {
786e8c11 2418 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2419 regnode * const nnext = regnext(n);
2420
2421 DEBUG_PEEP("merg",n,depth);
2422
2423 merged++;
2424 if (oldl + STR_LEN(n) > U8_MAX)
2425 break;
2426 NEXT_OFF(scan) += NEXT_OFF(n);
2427 STR_LEN(scan) += STR_LEN(n);
2428 next = n + NODE_SZ_STR(n);
2429 /* Now we can overwrite *n : */
2430 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2431#ifdef DEBUGGING
2432 stop = next - 1;
2433#endif
2434 n = nnext;
2435 if (stopnow) break;
2436 }
2437
d47053eb
RGS
2438#ifdef EXPERIMENTAL_INPLACESCAN
2439 if (flags && !NEXT_OFF(n)) {
2440 DEBUG_PEEP("atch", val, depth);
2441 if (reg_off_by_arg[OP(n)]) {
2442 ARG_SET(n, val - n);
2443 }
2444 else {
2445 NEXT_OFF(n) = val - n;
2446 }
2447 stopnow = 1;
2448 }
07be1b83
YO
2449#endif
2450 }
ced7f090
KW
2451#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2452#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2453#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2454#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2455
2456 if (UTF
2457 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2458 && ( STR_LEN(scan) >= 6 ) )
2459 {
07be1b83
YO
2460 /*
2461 Two problematic code points in Unicode casefolding of EXACT nodes:
2462
2463 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2464 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2465
2466 which casefold to
2467
2468 Unicode UTF-8
2469
2470 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2471 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2472
2473 This means that in case-insensitive matching (or "loose matching",
2474 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2475 length of the above casefolded versions) can match a target string
2476 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2477 This would rather mess up the minimum length computation.
2478
2479 What we'll do is to look for the tail four bytes, and then peek
2480 at the preceding two bytes to see whether we need to decrease
2481 the minimum length by four (six minus two).
2482
2483 Thanks to the design of UTF-8, there cannot be false matches:
2484 A sequence of valid UTF-8 bytes cannot be a subsequence of
2485 another valid sequence of UTF-8 bytes.
2486
2487 */
2488 char * const s0 = STRING(scan), *s, *t;
2489 char * const s1 = s0 + STR_LEN(scan) - 1;
2490 char * const s2 = s1 - 4;
e294cc5d
JH
2491#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2492 const char t0[] = "\xaf\x49\xaf\x42";
2493#else
07be1b83 2494 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2495#endif
07be1b83
YO
2496 const char * const t1 = t0 + 3;
2497
2498 for (s = s0 + 2;
2499 s < s2 && (t = ninstr(s, s1, t0, t1));
2500 s = t + 4) {
e294cc5d
JH
2501#ifdef EBCDIC
2502 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2503 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2504#else
07be1b83
YO
2505 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2506 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2507#endif
07be1b83
YO
2508 *min -= 4;
2509 }
2510 }
2511
2512#ifdef DEBUGGING
2513 /* Allow dumping */
2514 n = scan + NODE_SZ_STR(scan);
2515 while (n <= stop) {
2516 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2517 OP(n) = OPTIMIZED;
2518 NEXT_OFF(n) = 0;
2519 }
2520 n++;
2521 }
2522#endif
2523 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2524 return stopnow;
2525}
2526
653099ff
GS
2527/* REx optimizer. Converts nodes into quickier variants "in place".
2528 Finds fixed substrings. */
2529
a0288114 2530/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2531 to the position after last scanned or to NULL. */
2532
40d049e4
YO
2533#define INIT_AND_WITHP \
2534 assert(!and_withp); \
2535 Newx(and_withp,1,struct regnode_charclass_class); \
2536 SAVEFREEPV(and_withp)
07be1b83 2537
b515a41d
YO
2538/* this is a chain of data about sub patterns we are processing that
2539 need to be handled seperately/specially in study_chunk. Its so
2540 we can simulate recursion without losing state. */
2541struct scan_frame;
2542typedef struct scan_frame {
2543 regnode *last; /* last node to process in this frame */
2544 regnode *next; /* next node to process when last is reached */
2545 struct scan_frame *prev; /*previous frame*/
2546 I32 stop; /* what stopparen do we use */
2547} scan_frame;
2548
304ee84b
YO
2549
2550#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2551
e1d1eefb
YO
2552#define CASE_SYNST_FNC(nAmE) \
2553case nAmE: \
2554 if (flags & SCF_DO_STCLASS_AND) { \
2555 for (value = 0; value < 256; value++) \
2556 if (!is_ ## nAmE ## _cp(value)) \
2557 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2558 } \
2559 else { \
2560 for (value = 0; value < 256; value++) \
2561 if (is_ ## nAmE ## _cp(value)) \
2562 ANYOF_BITMAP_SET(data->start_class, value); \
2563 } \
2564 break; \
2565case N ## nAmE: \
2566 if (flags & SCF_DO_STCLASS_AND) { \
2567 for (value = 0; value < 256; value++) \
2568 if (is_ ## nAmE ## _cp(value)) \
2569 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2570 } \
2571 else { \
2572 for (value = 0; value < 256; value++) \
2573 if (!is_ ## nAmE ## _cp(value)) \
2574 ANYOF_BITMAP_SET(data->start_class, value); \
2575 } \
2576 break
2577
2578
2579
76e3520e 2580STATIC I32
40d049e4 2581S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2582 I32 *minlenp, I32 *deltap,
40d049e4
YO
2583 regnode *last,
2584 scan_data_t *data,
2585 I32 stopparen,
2586 U8* recursed,
2587 struct regnode_charclass_class *and_withp,
2588 U32 flags, U32 depth)
c277df42
IZ
2589 /* scanp: Start here (read-write). */
2590 /* deltap: Write maxlen-minlen here. */
2591 /* last: Stop before this one. */
40d049e4
YO
2592 /* data: string data about the pattern */
2593 /* stopparen: treat close N as END */
2594 /* recursed: which subroutines have we recursed into */
2595 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2596{
97aff369 2597 dVAR;
c277df42
IZ
2598 I32 min = 0, pars = 0, code;
2599 regnode *scan = *scanp, *next;
2600 I32 delta = 0;
2601 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2602 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2603 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2604 scan_data_t data_fake;
a3621e74 2605 SV *re_trie_maxbuff = NULL;
786e8c11 2606 regnode *first_non_open = scan;
e2e6a0f1 2607 I32 stopmin = I32_MAX;
8aa23a47 2608 scan_frame *frame = NULL;
a3621e74 2609 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2610
7918f24d
NC
2611 PERL_ARGS_ASSERT_STUDY_CHUNK;
2612
13a24bad 2613#ifdef DEBUGGING
40d049e4 2614 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2615#endif
40d049e4 2616
786e8c11 2617 if ( depth == 0 ) {
40d049e4 2618 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2619 first_non_open=regnext(first_non_open);
2620 }
2621
b81d288d 2622
8aa23a47
YO
2623 fake_study_recurse:
2624 while ( scan && OP(scan) != END && scan < last ){
2625 /* Peephole optimizer: */
304ee84b 2626 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2627 DEBUG_PEEP("Peep",scan,depth);
2628 JOIN_EXACT(scan,&min,0);
2629
2630 /* Follow the next-chain of the current node and optimize
2631 away all the NOTHINGs from it. */
2632 if (OP(scan) != CURLYX) {
2633 const int max = (reg_off_by_arg[OP(scan)]
2634 ? I32_MAX
2635 /* I32 may be smaller than U16 on CRAYs! */
2636 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2637 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2638 int noff;
2639 regnode *n = scan;
2640
2641 /* Skip NOTHING and LONGJMP. */
2642 while ((n = regnext(n))
2643 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2644 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2645 && off + noff < max)
2646 off += noff;
2647 if (reg_off_by_arg[OP(scan)])
2648 ARG(scan) = off;
2649 else
2650 NEXT_OFF(scan) = off;
2651 }
a3621e74 2652
c277df42 2653
8aa23a47
YO
2654
2655 /* The principal pseudo-switch. Cannot be a switch, since we
2656 look into several different things. */
2657 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2658 || OP(scan) == IFTHEN) {
2659 next = regnext(scan);
2660 code = OP(scan);
2661 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2662
2663 if (OP(next) == code || code == IFTHEN) {
2664 /* NOTE - There is similar code to this block below for handling
2665 TRIE nodes on a re-study. If you change stuff here check there
2666 too. */
2667 I32 max1 = 0, min1 = I32_MAX, num = 0;
2668 struct regnode_charclass_class accum;
2669 regnode * const startbranch=scan;
2670
2671 if (flags & SCF_DO_SUBSTR)
304ee84b 2672 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2673 if (flags & SCF_DO_STCLASS)
2674 cl_init_zero(pRExC_state, &accum);
2675
2676 while (OP(scan) == code) {
2677 I32 deltanext, minnext, f = 0, fake;
2678 struct regnode_charclass_class this_class;
2679
2680 num++;
2681 data_fake.flags = 0;
2682 if (data) {
2683 data_fake.whilem_c = data->whilem_c;
2684 data_fake.last_closep = data->last_closep;
2685 }
2686 else
2687 data_fake.last_closep = &fake;
58e23c8d
YO
2688
2689 data_fake.pos_delta = delta;
8aa23a47
YO
2690 next = regnext(scan);
2691 scan = NEXTOPER(scan);
2692 if (code != BRANCH)
c277df42 2693 scan = NEXTOPER(scan);
8aa23a47
YO
2694 if (flags & SCF_DO_STCLASS) {
2695 cl_init(pRExC_state, &this_class);
2696 data_fake.start_class = &this_class;
2697 f = SCF_DO_STCLASS_AND;
58e23c8d 2698 }
8aa23a47
YO
2699 if (flags & SCF_WHILEM_VISITED_POS)
2700 f |= SCF_WHILEM_VISITED_POS;
2701
2702 /* we suppose the run is continuous, last=next...*/
2703 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2704 next, &data_fake,
2705 stopparen, recursed, NULL, f,depth+1);
2706 if (min1 > minnext)
2707 min1 = minnext;
2708 if (max1 < minnext + deltanext)
2709 max1 = minnext + deltanext;
2710 if (deltanext == I32_MAX)
2711 is_inf = is_inf_internal = 1;
2712 scan = next;
2713 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2714 pars++;
2715 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2716 if ( stopmin > minnext)
2717 stopmin = min + min1;
2718 flags &= ~SCF_DO_SUBSTR;
2719 if (data)
2720 data->flags |= SCF_SEEN_ACCEPT;
2721 }
2722 if (data) {
2723 if (data_fake.flags & SF_HAS_EVAL)
2724 data->flags |= SF_HAS_EVAL;
2725 data->whilem_c = data_fake.whilem_c;
3dab1dad 2726 }
8aa23a47
YO
2727 if (flags & SCF_DO_STCLASS)
2728 cl_or(pRExC_state, &accum, &this_class);
2729 }
2730 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2731 min1 = 0;
2732 if (flags & SCF_DO_SUBSTR) {
2733 data->pos_min += min1;
2734 data->pos_delta += max1 - min1;
2735 if (max1 != min1 || is_inf)
2736 data->longest = &(data->longest_float);
2737 }
2738 min += min1;
2739 delta += max1 - min1;
2740 if (flags & SCF_DO_STCLASS_OR) {
2741 cl_or(pRExC_state, data->start_class, &accum);
2742 if (min1) {
2743 cl_and(data->start_class, and_withp);
2744 flags &= ~SCF_DO_STCLASS;
653099ff 2745 }
8aa23a47
YO
2746 }
2747 else if (flags & SCF_DO_STCLASS_AND) {
2748 if (min1) {
2749 cl_and(data->start_class, &accum);
2750 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2751 }
8aa23a47
YO
2752 else {
2753 /* Switch to OR mode: cache the old value of
2754 * data->start_class */
2755 INIT_AND_WITHP;
2756 StructCopy(data->start_class, and_withp,
2757 struct regnode_charclass_class);
2758 flags &= ~SCF_DO_STCLASS_AND;
2759 StructCopy(&accum, data->start_class,
2760 struct regnode_charclass_class);
2761 flags |= SCF_DO_STCLASS_OR;
2762 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2763 }
8aa23a47 2764 }
a3621e74 2765
8aa23a47
YO
2766 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2767 /* demq.
a3621e74 2768
8aa23a47
YO
2769 Assuming this was/is a branch we are dealing with: 'scan' now
2770 points at the item that follows the branch sequence, whatever
2771 it is. We now start at the beginning of the sequence and look
2772 for subsequences of
a3621e74 2773
8aa23a47
YO
2774 BRANCH->EXACT=>x1
2775 BRANCH->EXACT=>x2
2776 tail
a3621e74 2777
8aa23a47 2778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2779
8aa23a47
YO
2780 If we can find such a subseqence we need to turn the first
2781 element into a trie and then add the subsequent branch exact
2782 strings to the trie.
a3621e74 2783
8aa23a47 2784 We have two cases
a3621e74 2785
3b753521 2786 1. patterns where the whole set of branches can be converted.
a3621e74 2787
8aa23a47 2788 2. patterns where only a subset can be converted.
a3621e74 2789
8aa23a47
YO
2790 In case 1 we can replace the whole set with a single regop
2791 for the trie. In case 2 we need to keep the start and end
3b753521 2792 branches so
a3621e74 2793
8aa23a47
YO
2794 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2795 becomes BRANCH TRIE; BRANCH X;
786e8c11 2796
8aa23a47
YO
2797 There is an additional case, that being where there is a
2798 common prefix, which gets split out into an EXACT like node
2799 preceding the TRIE node.
a3621e74 2800
8aa23a47
YO
2801 If x(1..n)==tail then we can do a simple trie, if not we make
2802 a "jump" trie, such that when we match the appropriate word
2803 we "jump" to the appopriate tail node. Essentailly we turn
2804 a nested if into a case structure of sorts.
b515a41d 2805
8aa23a47
YO
2806 */
2807
2808 int made=0;
2809 if (!re_trie_maxbuff) {
2810 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2811 if (!SvIOK(re_trie_maxbuff))
2812 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2813 }
2814 if ( SvIV(re_trie_maxbuff)>=0 ) {
2815 regnode *cur;
2816 regnode *first = (regnode *)NULL;
2817 regnode *last = (regnode *)NULL;
2818 regnode *tail = scan;
2819 U8 optype = 0;
2820 U32 count=0;
a3621e74
YO
2821
2822#ifdef DEBUGGING
8aa23a47 2823 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2824#endif
8aa23a47
YO
2825 /* var tail is used because there may be a TAIL
2826 regop in the way. Ie, the exacts will point to the
2827 thing following the TAIL, but the last branch will
2828 point at the TAIL. So we advance tail. If we
2829 have nested (?:) we may have to move through several
2830 tails.
2831 */
2832
2833 while ( OP( tail ) == TAIL ) {
2834 /* this is the TAIL generated by (?:) */
2835 tail = regnext( tail );
2836 }
a3621e74 2837
8aa23a47
YO
2838
2839 DEBUG_OPTIMISE_r({
2840 regprop(RExC_rx, mysv, tail );
2841 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2842 (int)depth * 2 + 2, "",
2843 "Looking for TRIE'able sequences. Tail node is: ",
2844 SvPV_nolen_const( mysv )
2845 );
2846 });
2847
2848 /*
2849
2850 step through the branches, cur represents each
2851 branch, noper is the first thing to be matched
2852 as part of that branch and noper_next is the
2853 regnext() of that node. if noper is an EXACT
2854 and noper_next is the same as scan (our current
2855 position in the regex) then the EXACT branch is
2856 a possible optimization target. Once we have
2857 two or more consequetive such branches we can
2858 create a trie of the EXACT's contents and stich
2859 it in place. If the sequence represents all of
2860 the branches we eliminate the whole thing and
2861 replace it with a single TRIE. If it is a
2862 subsequence then we need to stitch it in. This
2863 means the first branch has to remain, and needs
2864 to be repointed at the item on the branch chain
2865 following the last branch optimized. This could
2866 be either a BRANCH, in which case the
2867 subsequence is internal, or it could be the
2868 item following the branch sequence in which
2869 case the subsequence is at the end.
2870
2871 */
2872
2873 /* dont use tail as the end marker for this traverse */
2874 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2875 regnode * const noper = NEXTOPER( cur );
b515a41d 2876#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2877 regnode * const noper_next = regnext( noper );
b515a41d
YO
2878#endif
2879
8aa23a47
YO
2880 DEBUG_OPTIMISE_r({
2881 regprop(RExC_rx, mysv, cur);
2882 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2883 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2884
2885 regprop(RExC_rx, mysv, noper);
2886 PerlIO_printf( Perl_debug_log, " -> %s",
2887 SvPV_nolen_const(mysv));
2888
2889 if ( noper_next ) {
2890 regprop(RExC_rx, mysv, noper_next );
2891 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2892 SvPV_nolen_const(mysv));
2893 }
2894 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2895 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2896 });
2897 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2898 : PL_regkind[ OP( noper ) ] == EXACT )
2899 || OP(noper) == NOTHING )
786e8c11 2900#ifdef NOJUMPTRIE
8aa23a47 2901 && noper_next == tail
786e8c11 2902#endif
8aa23a47
YO
2903 && count < U16_MAX)
2904 {
2905 count++;
2906 if ( !first || optype == NOTHING ) {
2907 if (!first) first = cur;
2908 optype = OP( noper );
2909 } else {
2910 last = cur;
2911 }
2912 } else {
a0a388a1 2913/*
0abd0d78
YO
2914 Currently we do not believe that the trie logic can
2915 handle case insensitive matching properly when the
2916 pattern is not unicode (thus forcing unicode semantics).
2917
2918 If/when this is fixed the following define can be swapped
2919 in below to fully enable trie logic.
2920
a0a388a1 2921#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2922
2923*/
2924#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2925
a0a388a1 2926 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2927 make_trie( pRExC_state,
2928 startbranch, first, cur, tail, count,
2929 optype, depth+1 );
2930 }
2931 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2932#ifdef NOJUMPTRIE
8aa23a47 2933 && noper_next == tail
786e8c11 2934#endif
8aa23a47
YO
2935 ){
2936 count = 1;
2937 first = cur;
2938 optype = OP( noper );
2939 } else {
2940 count = 0;
2941 first = NULL;
2942 optype = 0;
2943 }
2944 last = NULL;
2945 }
2946 }
2947 DEBUG_OPTIMISE_r({
2948 regprop(RExC_rx, mysv, cur);
2949 PerlIO_printf( Perl_debug_log,
2950 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2951 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2952
2953 });
a0a388a1
YO
2954
2955 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2956 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2957#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2958 if ( ((made == MADE_EXACT_TRIE &&
2959 startbranch == first)
2960 || ( first_non_open == first )) &&
2961 depth==0 ) {
2962 flags |= SCF_TRIE_RESTUDY;
2963 if ( startbranch == first
2964 && scan == tail )
2965 {
2966 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2967 }
2968 }
3dab1dad 2969#endif
8aa23a47
YO
2970 }
2971 }
2972
2973 } /* do trie */
2974
653099ff 2975 }
8aa23a47
YO
2976 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2977 scan = NEXTOPER(NEXTOPER(scan));
2978 } else /* single branch is optimized. */
2979 scan = NEXTOPER(scan);
2980 continue;
2981 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2982 scan_frame *newframe = NULL;
2983 I32 paren;
2984 regnode *start;
2985 regnode *end;
2986
2987 if (OP(scan) != SUSPEND) {
2988 /* set the pointer */
2989 if (OP(scan) == GOSUB) {
2990 paren = ARG(scan);
2991 RExC_recurse[ARG2L(scan)] = scan;
2992 start = RExC_open_parens[paren-1];
2993 end = RExC_close_parens[paren-1];
2994 } else {
2995 paren = 0;
f8fc2ecf 2996 start = RExC_rxi->program + 1;
8aa23a47
YO
2997 end = RExC_opend;
2998 }
2999 if (!recursed) {
3000 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3001 SAVEFREEPV(recursed);
3002 }
3003 if (!PAREN_TEST(recursed,paren+1)) {
3004 PAREN_SET(recursed,paren+1);
3005 Newx(newframe,1,scan_frame);
3006 } else {
3007 if (flags & SCF_DO_SUBSTR) {
304ee84b 3008 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3009 data->longest = &(data->longest_float);
3010 }
3011 is_inf = is_inf_internal = 1;
3012 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3013 cl_anything(pRExC_state, data->start_class);
3014 flags &= ~SCF_DO_STCLASS;
3015 }
3016 } else {
3017 Newx(newframe,1,scan_frame);
3018 paren = stopparen;
3019 start = scan+2;
3020 end = regnext(scan);
3021 }
3022 if (newframe) {
3023 assert(start);
3024 assert(end);
3025 SAVEFREEPV(newframe);
3026 newframe->next = regnext(scan);
3027 newframe->last = last;
3028 newframe->stop = stopparen;
3029 newframe->prev = frame;
3030
3031 frame = newframe;
3032 scan = start;
3033 stopparen = paren;
3034 last = end;
3035
3036 continue;
3037 }
3038 }
3039 else if (OP(scan) == EXACT) {
3040 I32 l = STR_LEN(scan);
3041 UV uc;
3042 if (UTF) {
3043 const U8 * const s = (U8*)STRING(scan);
3044 l = utf8_length(s, s + l);
3045 uc = utf8_to_uvchr(s, NULL);
3046 } else {
3047 uc = *((U8*)STRING(scan));
3048 }
3049 min += l;
3050 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3051 /* The code below prefers earlier match for fixed
3052 offset, later match for variable offset. */
3053 if (data->last_end == -1) { /* Update the start info. */
3054 data->last_start_min = data->pos_min;
3055 data->last_start_max = is_inf
3056 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3057 }
8aa23a47
YO
3058 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3059 if (UTF)
3060 SvUTF8_on(data->last_found);
3061 {
3062 SV * const sv = data->last_found;
3063 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3064 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3065 if (mg && mg->mg_len >= 0)
3066 mg->mg_len += utf8_length((U8*)STRING(scan),
3067 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3068 }
8aa23a47
YO
3069 data->last_end = data->pos_min + l;
3070 data->pos_min += l; /* As in the first entry. */
3071 data->flags &= ~SF_BEFORE_EOL;
3072 }
3073 if (flags & SCF_DO_STCLASS_AND) {
3074 /* Check whether it is compatible with what we know already! */
3075 int compat = 1;
3076
54251c2e
KW
3077
3078 /* If compatibile, we or it in below. It is compatible if is
3079 * in the bitmp and either 1) its bit or its fold is set, or 2)
3080 * it's for a locale. Even if there isn't unicode semantics
3081 * here, at runtime there may be because of matching against a
3082 * utf8 string, so accept a possible false positive for
3083 * latin1-range folds */
8aa23a47
YO
3084 if (uc >= 0x100 ||
3085 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3086 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3087 && (!(data->start_class->flags & ANYOF_FOLD)
54251c2e 3088 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3089 )
3090 compat = 0;
3091 ANYOF_CLASS_ZERO(data->start_class);
3092 ANYOF_BITMAP_ZERO(data->start_class);
3093 if (compat)
3094 ANYOF_BITMAP_SET(data->start_class, uc);
3095 data->start_class->flags &= ~ANYOF_EOS;
3096 if (uc < 0x100)
3097 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3098 }
3099 else if (flags & SCF_DO_STCLASS_OR) {
3100 /* false positive possible if the class is case-folded */
3101 if (uc < 0x100)
3102 ANYOF_BITMAP_SET(data->start_class, uc);
3103 else
3104 data->start_class->flags |= ANYOF_UNICODE_ALL;
3105 data->start_class->flags &= ~ANYOF_EOS;
3106 cl_and(data->start_class, and_withp);
3107 }
3108 flags &= ~SCF_DO_STCLASS;
3109 }
3110 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3111 I32 l = STR_LEN(scan);
3112 UV uc = *((U8*)STRING(scan));
3113
3114 /* Search for fixed substrings supports EXACT only. */
3115 if (flags & SCF_DO_SUBSTR) {
3116 assert(data);
304ee84b 3117 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3118 }
3119 if (UTF) {
3120 const U8 * const s = (U8 *)STRING(scan);
3121 l = utf8_length(s, s + l);
3122 uc = utf8_to_uvchr(s, NULL);
3123 }
3124 min += l;
3125 if (flags & SCF_DO_SUBSTR)
3126 data->pos_min += l;
3127 if (flags & SCF_DO_STCLASS_AND) {
3128 /* Check whether it is compatible with what we know already! */
3129 int compat = 1;
8aa23a47 3130 if (uc >= 0x100 ||
54251c2e
KW
3131 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3132 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3133 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3134 {
8aa23a47 3135 compat = 0;
54251c2e 3136 }
8aa23a47
YO
3137 ANYOF_CLASS_ZERO(data->start_class);
3138 ANYOF_BITMAP_ZERO(data->start_class);
3139 if (compat) {
3140 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3141 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47 3142 data->start_class->flags |= ANYOF_FOLD;
970c8436 3143 if (OP(scan) == EXACTFL) {
8aa23a47 3144 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3145 }
3146 else {
3147
54251c2e
KW
3148 /* Also set the other member of the fold pair. In case
3149 * that unicode semantics is called for at runtime, use
3150 * the full latin1 fold. (Can't do this for locale,
3151 * because not known until runtime */
3152 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3153 }
653099ff 3154 }
8aa23a47
YO
3155 }
3156 else if (flags & SCF_DO_STCLASS_OR) {
3157 if (data->start_class->flags & ANYOF_FOLD) {
3158 /* false positive possible if the class is case-folded.
3159 Assume that the locale settings are the same... */
970c8436 3160 if (uc < 0x100) {
1aa99e6b 3161 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3162 if (OP(scan) != EXACTFL) {
3163
3164 /* And set the other member of the fold pair, but
3165 * can't do that in locale because not known until
3166 * run-time */
3167 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3168 PL_fold_latin1[uc]);
970c8436
KW
3169 }
3170 }
653099ff
GS
3171 data->start_class->flags &= ~ANYOF_EOS;
3172 }
8aa23a47 3173 cl_and(data->start_class, and_withp);
653099ff 3174 }
8aa23a47
YO
3175 flags &= ~SCF_DO_STCLASS;
3176 }
e52fc539 3177 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3178 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3179 I32 f = flags, pos_before = 0;
3180 regnode * const oscan = scan;
3181 struct regnode_charclass_class this_class;
3182 struct regnode_charclass_class *oclass = NULL;
3183 I32 next_is_eval = 0;
3184
3185 switch (PL_regkind[OP(scan)]) {
3186 case WHILEM: /* End of (?:...)* . */
3187 scan = NEXTOPER(scan);
3188 goto finish;
3189 case PLUS:
3190 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3191 next = NEXTOPER(scan);
3192 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3193 mincount = 1;
3194 maxcount = REG_INFTY;
3195 next = regnext(scan);
3196 scan = NEXTOPER(scan);
3197 goto do_curly;
3198 }
3199 }
3200 if (flags & SCF_DO_SUBSTR)
3201 data->pos_min++;
3202 min++;
3203 /* Fall through. */
3204 case STAR:
3205 if (flags & SCF_DO_STCLASS) {
3206 mincount = 0;
3207 maxcount = REG_INFTY;
3208 next = regnext(scan);
3209 scan = NEXTOPER(scan);
3210 goto do_curly;
3211 }
3212 is_inf = is_inf_internal = 1;
3213 scan = regnext(scan);
c277df42 3214 if (flags & SCF_DO_SUBSTR) {
304ee84b 3215 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3216 data->longest = &(data->longest_float);
c277df42 3217 }
8aa23a47
YO
3218 goto optimize_curly_tail;
3219 case CURLY:
3220 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3221 && (scan->flags == stopparen))
3222 {
3223 mincount = 1;
3224 maxcount = 1;
3225 } else {
3226 mincount = ARG1(scan);
3227 maxcount = ARG2(scan);
653099ff 3228 }
8aa23a47
YO
3229 next = regnext(scan);
3230 if (OP(scan) == CURLYX) {
3231 I32 lp = (data ? *(data->last_closep) : 0);
3232 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3233 }
8aa23a47
YO
3234 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3235 next_is_eval = (OP(scan) == EVAL);
3236 do_curly:
3237 if (flags & SCF_DO_SUBSTR) {
304ee84b 3238 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3239 pos_before = data->pos_min;
b45f050a 3240 }
8aa23a47
YO
3241 if (data) {
3242 fl = data->flags;
3243 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3244 if (is_inf)
3245 data->flags |= SF_IS_INF;
3246 }
3247 if (flags & SCF_DO_STCLASS) {
3248 cl_init(pRExC_state, &this_class);
3249 oclass = data->start_class;
3250 data->start_class = &this_class;
3251 f |= SCF_DO_STCLASS_AND;
3252 f &= ~SCF_DO_STCLASS_OR;
3253 }
779bcb7d
NC
3254 /* Exclude from super-linear cache processing any {n,m}
3255 regops for which the combination of input pos and regex
3256 pos is not enough information to determine if a match
3257 will be possible.
3258
3259 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3260 regex pos at the \s*, the prospects for a match depend not
3261 only on the input position but also on how many (bar\s*)
3262 repeats into the {4,8} we are. */
3263 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3264 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3265
8aa23a47
YO
3266 /* This will finish on WHILEM, setting scan, or on NULL: */
3267 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3268 last, data, stopparen, recursed, NULL,
3269 (mincount == 0
3270 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3271
8aa23a47
YO
3272 if (flags & SCF_DO_STCLASS)
3273 data->start_class = oclass;
3274 if (mincount == 0 || minnext == 0) {
3275 if (flags & SCF_DO_STCLASS_OR) {
3276 cl_or(pRExC_state, data->start_class, &this_class);
3277 }
3278 else if (flags & SCF_DO_STCLASS_AND) {
3279 /* Switch to OR mode: cache the old value of
3280 * data->start_class */
3281 INIT_AND_WITHP;
3282 StructCopy(data->start_class, and_withp,
3283 struct regnode_charclass_class);
3284 flags &= ~SCF_DO_STCLASS_AND;
3285 StructCopy(&this_class, data->start_class,
3286 struct regnode_charclass_class);
3287 flags |= SCF_DO_STCLASS_OR;
3288 data->start_class->flags |= ANYOF_EOS;
3289 }
3290 } else { /* Non-zero len */
3291 if (flags & SCF_DO_STCLASS_OR) {
3292 cl_or(pRExC_state, data->start_class, &this_class);
3293 cl_and(data->start_class, and_withp);
3294 }
3295 else if (flags & SCF_DO_STCLASS_AND)
3296 cl_and(data->start_class, &this_class);
3297 flags &= ~SCF_DO_STCLASS;
3298 }
3299 if (!scan) /* It was not CURLYX, but CURLY. */
3300 scan = next;
3301 if ( /* ? quantifier ok, except for (?{ ... }) */
3302 (next_is_eval || !(mincount == 0 && maxcount == 1))
3303 && (minnext == 0) && (deltanext == 0)
3304 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3305 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3306 {
668c081a
NC
3307 ckWARNreg(RExC_parse,
3308 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3309 }
3310
3311 min += minnext * mincount;
3312 is_inf_internal |= ((maxcount == REG_INFTY
3313 && (minnext + deltanext) > 0)
3314 || deltanext == I32_MAX);
3315 is_inf |= is_inf_internal;
3316 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3317
3318 /* Try powerful optimization CURLYX => CURLYN. */
3319 if ( OP(oscan) == CURLYX && data
3320 && data->flags & SF_IN_PAR
3321 && !(data->flags & SF_HAS_EVAL)
3322 && !deltanext && minnext == 1 ) {
3323 /* Try to optimize to CURLYN. */
3324 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3325 regnode * const nxt1 = nxt;
497b47a8 3326#ifdef DEBUGGING
8aa23a47 3327 regnode *nxt2;
497b47a8 3328#endif
c277df42 3329
8aa23a47
YO
3330 /* Skip open. */
3331 nxt = regnext(nxt);
e52fc539 3332 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3333 && !(PL_regkind[OP(nxt)] == EXACT
3334 && STR_LEN(nxt) == 1))
3335 goto nogo;
497b47a8 3336#ifdef DEBUGGING
8aa23a47 3337 nxt2 = nxt;
497b47a8 3338#endif
8aa23a47
YO
3339 nxt = regnext(nxt);
3340 if (OP(nxt) != CLOSE)
3341 goto nogo;
3342 if (RExC_open_parens) {
3343 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3344 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3345 }
3346 /* Now we know that nxt2 is the only contents: */
3347 oscan->flags = (U8)ARG(nxt);
3348 OP(oscan) = CURLYN;
3349 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3350
c277df42 3351#ifdef DEBUGGING
8aa23a47 3352 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3353 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3354 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3355 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3356 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3357 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3358#endif
8aa23a47
YO
3359 }
3360 nogo:
3361
3362 /* Try optimization CURLYX => CURLYM. */
3363 if ( OP(oscan) == CURLYX && data
3364 && !(data->flags & SF_HAS_PAR)
3365 && !(data->flags & SF_HAS_EVAL)
3366 && !deltanext /* atom is fixed width */
3367 && minnext != 0 /* CURLYM can't handle zero width */
3368 ) {
3369 /* XXXX How to optimize if data == 0? */
3370 /* Optimize to a simpler form. */
3371 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3372 regnode *nxt2;
3373
3374 OP(oscan) = CURLYM;
3375 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3376 && (OP(nxt2) != WHILEM))
3377 nxt = nxt2;
3378 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3379 /* Need to optimize away parenths. */
b3c0965f 3380 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3381 /* Set the parenth number. */
3382 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3383
8aa23a47
YO
3384 oscan->flags = (U8)ARG(nxt);
3385 if (RExC_open_parens) {
3386 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3387 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3388 }
8aa23a47
YO
3389 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3390 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3391
c277df42 3392#ifdef DEBUGGING
8aa23a47
YO
3393 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3394 OP(nxt + 1) = OPTIMIZED; /* was count. */
3395 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3396 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3397#endif
c277df42 3398#if 0
8aa23a47
YO
3399 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3400 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3401 if (nnxt == nxt) {
3402 if (reg_off_by_arg[OP(nxt1)])
3403 ARG_SET(nxt1, nxt2 - nxt1);
3404 else if (nxt2 - nxt1 < U16_MAX)
3405 NEXT_OFF(nxt1) = nxt2 - nxt1;
3406 else
3407 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3408 }
8aa23a47 3409 nxt1 = nnxt;
c277df42 3410 }
5d1c421c 3411#endif
8aa23a47
YO
3412 /* Optimize again: */
3413 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3414 NULL, stopparen, recursed, NULL, 0,depth+1);
3415 }
3416 else
3417 oscan->flags = 0;
3418 }
3419 else if ((OP(oscan) == CURLYX)
3420 && (flags & SCF_WHILEM_VISITED_POS)
3421 /* See the comment on a similar expression above.
3b753521 3422 However, this time it's not a subexpression
8aa23a47
YO
3423 we care about, but the expression itself. */
3424 && (maxcount == REG_INFTY)
3425 && data && ++data->whilem_c < 16) {
3426 /* This stays as CURLYX, we can put the count/of pair. */
3427 /* Find WHILEM (as in regexec.c) */
3428 regnode *nxt = oscan + NEXT_OFF(oscan);
3429
3430 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3431 nxt += ARG(nxt);
3432 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3433 | (RExC_whilem_seen << 4)); /* On WHILEM */
3434 }
3435 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3436 pars++;
3437 if (flags & SCF_DO_SUBSTR) {
3438 SV *last_str = NULL;
3439 int counted = mincount != 0;
a0ed51b3 3440
8aa23a47
YO
3441 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3442#if defined(SPARC64_GCC_WORKAROUND)
3443 I32 b = 0;
3444 STRLEN l = 0;
3445 const char *s = NULL;
3446 I32 old = 0;
b515a41d 3447
8aa23a47
YO
3448 if (pos_before >= data->last_start_min)
3449 b = pos_before;
3450 else
3451 b = data->last_start_min;
b515a41d 3452
8aa23a47
YO
3453 l = 0;
3454 s = SvPV_const(data->last_found, l);
3455 old = b - data->last_start_min;
3456
3457#else
3458 I32 b = pos_before >= data->last_start_min
3459 ? pos_before : data->last_start_min;
3460 STRLEN l;
3461 const char * const s = SvPV_const(data->last_found, l);
3462 I32 old = b - data->last_start_min;
3463#endif
3464
3465 if (UTF)
3466 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3467 l -= old;
3468 /* Get the added string: */
740cce10 3469 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3470 if (deltanext == 0 && pos_before == b) {
3471 /* What was added is a constant string */
3472 if (mincount > 1) {
3473 SvGROW(last_str, (mincount * l) + 1);
3474 repeatcpy(SvPVX(last_str) + l,
3475 SvPVX_const(last_str), l, mincount - 1);
3476 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3477 /* Add additional parts. */
3478 SvCUR_set(data->last_found,
3479 SvCUR(data->last_found) - l);
3480 sv_catsv(data->last_found, last_str);
3481 {
3482 SV * sv = data->last_found;
3483 MAGIC *mg =
3484 SvUTF8(sv) && SvMAGICAL(sv) ?
3485 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3486 if (mg && mg->mg_len >= 0)
bd94e887 3487 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3488 }
8aa23a47 3489 data->last_end += l * (mincount - 1);
b515a41d 3490 }
8aa23a47
YO
3491 } else {
3492 /* start offset must point into the last copy */
3493 data->last_start_min += minnext * (mincount - 1);
3494 data->last_start_max += is_inf ? I32_MAX
3495 : (maxcount - 1) * (minnext + data->pos_delta);
3496 }
c277df42 3497 }
8aa23a47
YO
3498 /* It is counted once already... */
3499 data->pos_min += minnext * (mincount - counted);
3500 data->pos_delta += - counted * deltanext +
3501 (minnext + deltanext) * maxcount - minnext * mincount;
3502 if (mincount != maxcount) {
3503 /* Cannot extend fixed substrings found inside
3504 the group. */
304ee84b 3505 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3506 if (mincount && last_str) {
3507 SV * const sv = data->last_found;
3508 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3509 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3510
3511 if (mg)
3512 mg->mg_len = -1;
3513 sv_setsv(sv, last_str);
3514 data->last_end = data->pos_min;
3515 data->last_start_min =
3516 data->pos_min - CHR_SVLEN(last_str);
3517 data->last_start_max = is_inf
3518 ? I32_MAX
3519 : data->pos_min + data->pos_delta
3520 - CHR_SVLEN(last_str);
3521 }
3522 data->longest = &(data->longest_float);
3523 }
3524 SvREFCNT_dec(last_str);
c277df42 3525 }
8aa23a47
YO
3526 if (data && (fl & SF_HAS_EVAL))
3527 data->flags |= SF_HAS_EVAL;
3528 optimize_curly_tail:
3529 if (OP(oscan) != CURLYX) {
3530 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3531 && NEXT_OFF(next))
3532 NEXT_OFF(oscan) += NEXT_OFF(next);
3533 }
3534 continue;
3535 default: /* REF and CLUMP only? */
3536 if (flags & SCF_DO_SUBSTR) {
304ee84b 3537 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3538 data->longest = &(data->longest_float);
3539 }
3540 is_inf = is_inf_internal = 1;
3541 if (flags & SCF_DO_STCLASS_OR)
3542 cl_anything(pRExC_state, data->start_class);
3543 flags &= ~SCF_DO_STCLASS;
3544 break;
c277df42 3545 }
8aa23a47 3546 }
e1d1eefb
YO
3547 else if (OP(scan) == LNBREAK) {
3548 if (flags & SCF_DO_STCLASS) {
3549 int value = 0;
3550 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3551 if (flags & SCF_DO_STCLASS_AND) {
3552 for (value = 0; value < 256; value++)
e64b1bd1 3553 if (!is_VERTWS_cp(value))
b9a59e08
KW
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3555 }
3556 else {
e1d1eefb 3557 for (value = 0; value < 256; value++)
e64b1bd1 3558 if (is_VERTWS_cp(value))
b9a59e08
KW
3559 ANYOF_BITMAP_SET(data->start_class, value);
3560 }
e1d1eefb
YO
3561 if (flags & SCF_DO_STCLASS_OR)
3562 cl_and(data->start_class, and_withp);
3563 flags &= ~SCF_DO_STCLASS;
3564 }
3565 min += 1;
f9a79580 3566 delta += 1;
e1d1eefb
YO
3567 if (flags & SCF_DO_SUBSTR) {
3568 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3569 data->pos_min += 1;
f9a79580 3570 data->pos_delta += 1;
e1d1eefb
YO
3571 data->longest = &(data->longest_float);
3572 }
e1d1eefb 3573 }
f9a79580 3574 else if (OP(scan) == FOLDCHAR) {
ced7f090 3575 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3576 flags &= ~SCF_DO_STCLASS;
3577 min += 1;
3578 delta += d;
3579 if (flags & SCF_DO_SUBSTR) {
3580 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3581 data->pos_min += 1;
3582 data->pos_delta += d;
3583 data->longest = &(data->longest_float);
3584 }
3585 }
e52fc539 3586 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3587 int value = 0;
653099ff 3588
8aa23a47 3589 if (flags & SCF_DO_SUBSTR) {
304ee84b 3590 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3591 data->pos_min++;
3592 }
3593 min++;
3594 if (flags & SCF_DO_STCLASS) {
3595 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3596
8aa23a47
YO
3597 /* Some of the logic below assumes that switching
3598 locale on will only add false positives. */
3599 switch (PL_regkind[OP(scan)]) {
3600 case SANY:
3601 default:
3602 do_default:
3603 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3604 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3605 cl_anything(pRExC_state, data->start_class);
3606 break;
3607 case REG_ANY:
3608 if (OP(scan) == SANY)
3609 goto do_default;
3610 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3611 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
cefafd73
KW
3612 || ((data->start_class->flags & ANYOF_CLASS)
3613 && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
8aa23a47 3614 cl_anything(pRExC_state, data->start_class);
653099ff 3615 }
8aa23a47
YO
3616 if (flags & SCF_DO_STCLASS_AND || !value)
3617 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3618 break;
3619 case ANYOF:
3620 if (flags & SCF_DO_STCLASS_AND)
3621 cl_and(data->start_class,
3622 (struct regnode_charclass_class*)scan);
653099ff 3623 else
8aa23a47
YO
3624 cl_or(pRExC_state, data->start_class,
3625 (struct regnode_charclass_class*)scan);
3626 break;
3627 case ALNUM:
3628 if (flags & SCF_DO_STCLASS_AND) {
3629 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3630 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3631 if (FLAGS(scan) & USE_UNI) {
3632 for (value = 0; value < 256; value++) {
3633 if (!isWORDCHAR_L1(value)) {
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 }
3636 }
3637 } else {
3638 for (value = 0; value < 256; value++) {
3639 if (!isALNUM(value)) {
3640 ANYOF_BITMAP_CLEAR(data->start_class, value);
3641 }
3642 }
3643 }
8aa23a47 3644 }
653099ff 3645 }
8aa23a47
YO
3646 else {
3647 if (data->start_class->flags & ANYOF_LOCALE)
3648 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3649 else if (FLAGS(scan) & USE_UNI) {
3650 for (value = 0; value < 256; value++) {
3651 if (isWORDCHAR_L1(value)) {
3652 ANYOF_BITMAP_SET(data->start_class, value);
3653 }
3654 }
3655 } else {
3656 for (value = 0; value < 256; value++) {
3657 if (isALNUM(value)) {
3658 ANYOF_BITMAP_SET(data->start_class, value);
3659 }
3660 }
3661 }
8aa23a47
YO
3662 }
3663 break;
3664 case ALNUML:
3665 if (flags & SCF_DO_STCLASS_AND) {
3666 if (data->start_class->flags & ANYOF_LOCALE)
3667 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3668 }
3669 else {
3670 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3671 data->start_class->flags |= ANYOF_LOCALE;
3672 }
3673 break;
3674 case NALNUM:
3675 if (flags & SCF_DO_STCLASS_AND) {
3676 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3677 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3678 if (FLAGS(scan) & USE_UNI) {
3679 for (value = 0; value < 256; value++) {
3680 if (isWORDCHAR_L1(value)) {
3681 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 }
3683 }
3684 } else {
3685 for (value = 0; value < 256; value++) {
3686 if (isALNUM(value)) {
3687 ANYOF_BITMAP_CLEAR(data->start_class, value);
3688 }
3689 }
3690 }
653099ff
GS
3691 }
3692 }
8aa23a47
YO
3693 else {
3694 if (data->start_class->flags & ANYOF_LOCALE)
3695 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696 else {
3697 for (value = 0; value < 256; value++)
3698 if (!isALNUM(value))
b9a59e08 3699 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3700 }
653099ff 3701 }
8aa23a47
YO
3702 break;
3703 case NALNUML:
3704 if (flags & SCF_DO_STCLASS_AND) {
3705 if (data->start_class->flags & ANYOF_LOCALE)
3706 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3707 }
8aa23a47
YO
3708 else {
3709 data->start_class->flags |= ANYOF_LOCALE;
3710 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3711 }
3712 break;
3713 case SPACE:
3714 if (flags & SCF_DO_STCLASS_AND) {
3715 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3716 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3717 if (FLAGS(scan) & USE_UNI) {
3718 for (value = 0; value < 256; value++) {
3719 if (!isSPACE_L1(value)) {
3720 ANYOF_BITMAP_CLEAR(data->start_class, value);
3721 }
3722 }
3723 } else {
3724 for (value = 0; value < 256; value++) {
3725 if (!isSPACE(value)) {
3726 ANYOF_BITMAP_CLEAR(data->start_class, value);
3727 }
3728 }
3729 }
653099ff
GS
3730 }
3731 }
8aa23a47 3732 else {
a12cf05f 3733 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3734 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3735 }
3736 else if (FLAGS(scan) & USE_UNI) {
3737 for (value = 0; value < 256; value++) {
3738 if (isSPACE_L1(value)) {
3739 ANYOF_BITMAP_SET(data->start_class, value);
3740 }
3741 }
3742 } else {
3743 for (value = 0; value < 256; value++) {
3744 if (isSPACE(value)) {
3745 ANYOF_BITMAP_SET(data->start_class, value);
3746 }
3747 }
8aa23a47 3748 }
653099ff 3749 }
8aa23a47
YO
3750 break;
3751 case SPACEL:
3752 if (flags & SCF_DO_STCLASS_AND) {
3753 if (data->start_class->flags & ANYOF_LOCALE)
3754 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3755 }
3756 else {
3757 data->start_class->flags |= ANYOF_LOCALE;
3758 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3759 }
3760 break;
3761 case NSPACE:
3762 if (flags & SCF_DO_STCLASS_AND) {
3763 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3765 if (FLAGS(scan) & USE_UNI) {
3766 for (value = 0; value < 256; value++) {
3767 if (isSPACE_L1(value)) {
3768 ANYOF_BITMAP_CLEAR(data->start_class, value);
3769 }
3770 }
3771 } else {
3772 for (value = 0; value < 256; value++) {
3773 if (isSPACE(value)) {
3774 ANYOF_BITMAP_CLEAR(data->start_class, value);
3775 }
3776 }
3777 }
653099ff 3778 }
8aa23a47
YO
3779 }
3780 else {
3781 if (data->start_class->flags & ANYOF_LOCALE)
3782 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3783 else if (FLAGS(scan) & USE_UNI) {
3784 for (value = 0; value < 256; value++) {
3785 if (!isSPACE_L1(value)) {
3786 ANYOF_BITMAP_SET(data->start_class, value);
3787 }
3788 }
3789 }
3790 else {
3791 for (value = 0; value < 256; value++) {
3792 if (!isSPACE(value)) {
3793 ANYOF_BITMAP_SET(data->start_class, value);
3794 }
3795 }
3796 }
653099ff 3797 }
8aa23a47
YO
3798 break;
3799 case NSPACEL:
3800 if (flags & SCF_DO_STCLASS_AND) {
3801 if (data->start_class->flags & ANYOF_LOCALE) {
3802 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3803 for (value = 0; value < 256; value++)
3804 if (!isSPACE(value))
3805 ANYOF_BITMAP_CLEAR(data->start_class, value);
3806 }
653099ff 3807 }
8aa23a47
YO
3808 else {
3809 data->start_class->flags |= ANYOF_LOCALE;
3810 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3811 }
3812 break;
3813 case DIGIT:
3814 if (flags & SCF_DO_STCLASS_AND) {
3815 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3816 for (value = 0; value < 256; value++)
3817 if (!isDIGIT(value))
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819 }
3820 else {
3821 if (data->start_class->flags & ANYOF_LOCALE)
3822 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823 else {
3824 for (value = 0; value < 256; value++)
3825 if (isDIGIT(value))
b9a59e08 3826 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3827 }
3828 }
3829 break;
3830 case NDIGIT:
3831 if (flags & SCF_DO_STCLASS_AND) {
3832 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3833 for (value = 0; value < 256; value++)
3834 if (isDIGIT(value))
3835 ANYOF_BITMAP_CLEAR(data->start_class, value);
3836 }
3837 else {
3838 if (data->start_class->flags & ANYOF_LOCALE)
3839 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (!isDIGIT(value))
b9a59e08 3843 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3844 }
3845 }
8aa23a47 3846 break;
e1d1eefb
YO
3847 CASE_SYNST_FNC(VERTWS);
3848 CASE_SYNST_FNC(HORIZWS);
3849
8aa23a47
YO
3850 }
3851 if (flags & SCF_DO_STCLASS_OR)
3852 cl_and(data->start_class, and_withp);
3853 flags &= ~SCF_DO_STCLASS;
3854 }
3855 }
3856 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3857 data->flags |= (OP(scan) == MEOL
3858 ? SF_BEFORE_MEOL
3859 : SF_BEFORE_SEOL);
3860 }
3861 else if ( PL_regkind[OP(scan)] == BRANCHJ
3862 /* Lookbehind, or need to calculate parens/evals/stclass: */
3863 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3864 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3865 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3866 || OP(scan) == UNLESSM )
3867 {
3868 /* Negative Lookahead/lookbehind
3869 In this case we can't do fixed string optimisation.
3870 */
1de06328 3871
8aa23a47
YO
3872 I32 deltanext, minnext, fake = 0;
3873 regnode *nscan;
3874 struct regnode_charclass_class intrnl;
3875 int f = 0;
1de06328 3876
8aa23a47
YO
3877 data_fake.flags = 0;
3878 if (data) {
3879 data_fake.whilem_c = data->whilem_c;
3880 data_fake.last_closep = data->last_closep;
c277df42 3881 }
8aa23a47
YO
3882 else
3883 data_fake.last_closep = &fake;
58e23c8d 3884 data_fake.pos_delta = delta;
8aa23a47
YO
3885 if ( flags & SCF_DO_STCLASS && !scan->flags
3886 && OP(scan) == IFMATCH ) { /* Lookahead */
3887 cl_init(pRExC_state, &intrnl);
3888 data_fake.start_class = &intrnl;
3889 f |= SCF_DO_STCLASS_AND;
3890 }
3891 if (flags & SCF_WHILEM_VISITED_POS)
3892 f |= SCF_WHILEM_VISITED_POS;
3893 next = regnext(scan);
3894 nscan = NEXTOPER(NEXTOPER(scan));
3895 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3896 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3897 if (scan->flags) {
3898 if (deltanext) {
58e23c8d 3899 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3900 }
3901 else if (minnext > (I32)U8_MAX) {
58e23c8d 3902 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3903 }
3904 scan->flags = (U8)minnext;
3905 }
3906 if (data) {
3907 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908 pars++;
3909 if (data_fake.flags & SF_HAS_EVAL)
3910 data->flags |= SF_HAS_EVAL;
3911 data->whilem_c = data_fake.whilem_c;
3912 }
3913 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3914 if (flags & SCF_DO_STCLASS_OR) {
3915 /* OR before, AND after: ideally we would recurse with
3916 * data_fake to get the AND applied by study of the
3917 * remainder of the pattern, and then derecurse;
3918 * *** HACK *** for now just treat as "no information".
3919 * See [perl #56690].
3920 */
3921 cl_init(pRExC_state, data->start_class);
3922 } else {
3923 /* AND before and after: combine and continue */
3924 const int was = (data->start_class->flags & ANYOF_EOS);
3925
3926 cl_and(data->start_class, &intrnl);
3927 if (was)
3928 data->start_class->flags |= ANYOF_EOS;
3929 }
8aa23a47 3930 }
cb434fcc 3931 }
8aa23a47
YO
3932#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933 else {
3934 /* Positive Lookahead/lookbehind
3935 In this case we can do fixed string optimisation,
3936 but we must be careful about it. Note in the case of
3937 lookbehind the positions will be offset by the minimum
3938 length of the pattern, something we won't know about
3939 until after the recurse.
3940 */
3941 I32 deltanext, fake = 0;
3942 regnode *nscan;
3943 struct regnode_charclass_class intrnl;
3944 int f = 0;
3945 /* We use SAVEFREEPV so that when the full compile
3946 is finished perl will clean up the allocated
3b753521 3947 minlens when it's all done. This way we don't
8aa23a47
YO
3948 have to worry about freeing them when we know
3949 they wont be used, which would be a pain.
3950 */
3951 I32 *minnextp;
3952 Newx( minnextp, 1, I32 );
3953 SAVEFREEPV(minnextp);
3954
3955 if (data) {
3956 StructCopy(data, &data_fake, scan_data_t);
3957 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3958 f |= SCF_DO_SUBSTR;
3959 if (scan->flags)
304ee84b 3960 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3961 data_fake.last_found=newSVsv(data->last_found);
3962 }
3963 }
3964 else
3965 data_fake.last_closep = &fake;
3966 data_fake.flags = 0;
58e23c8d 3967 data_fake.pos_delta = delta;
8aa23a47
YO
3968 if (is_inf)
3969 data_fake.flags |= SF_IS_INF;
3970 if ( flags & SCF_DO_STCLASS && !scan->flags
3971 && OP(scan) == IFMATCH ) { /* Lookahead */
3972 cl_init(pRExC_state, &intrnl);
3973 data_fake.start_class = &intrnl;
3974 f |= SCF_DO_STCLASS_AND;
3975 }
3976 if (flags & SCF_WHILEM_VISITED_POS)
3977 f |= SCF_WHILEM_VISITED_POS;
3978 next = regnext(scan);
3979 nscan = NEXTOPER(NEXTOPER(scan));
3980
3981 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3982 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3983 if (scan->flags) {
3984 if (deltanext) {
58e23c8d 3985 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3986 }
3987 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3988 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3989 }
3990 scan->flags = (U8)*minnextp;
3991 }
3992
3993 *minnextp += min;
3994
3995 if (f & SCF_DO_STCLASS_AND) {
3996 const int was = (data->start_class->flags & ANYOF_EOS);
3997
3998 cl_and(data->start_class, &intrnl);
3999 if (was)
4000 data->start_class->flags |= ANYOF_EOS;
4001 }
4002 if (data) {
4003 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004 pars++;
4005 if (data_fake.flags & SF_HAS_EVAL)
4006 data->flags |= SF_HAS_EVAL;
4007 data->whilem_c = data_fake.whilem_c;
4008 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4009 if (RExC_rx->minlen<*minnextp)
4010 RExC_rx->minlen=*minnextp;
304ee84b 4011 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4012 SvREFCNT_dec(data_fake.last_found);
4013
4014 if ( data_fake.minlen_fixed != minlenp )
4015 {
4016 data->offset_fixed= data_fake.offset_fixed;
4017 data->minlen_fixed= data_fake.minlen_fixed;
4018 data->lookbehind_fixed+= scan->flags;
4019 }
4020 if ( data_fake.minlen_float != minlenp )
4021 {
4022 data->minlen_float= data_fake.minlen_float;
4023 data->offset_float_min=data_fake.offset_float_min;
4024 data->offset_float_max=data_fake.offset_float_max;
4025 data->lookbehind_float+= scan->flags;
4026 }
4027 }
4028 }
4029
4030
40d049e4 4031 }
8aa23a47
YO
4032#endif
4033 }
4034 else if (OP(scan) == OPEN) {
4035 if (stopparen != (I32)ARG(scan))
4036 pars++;
4037 }
4038 else if (OP(scan) == CLOSE) {
4039 if (stopparen == (I32)ARG(scan)) {
4040 break;
4041 }
4042 if ((I32)ARG(scan) == is_par) {
4043 next = regnext(scan);
b515a41d 4044
8aa23a47
YO
4045 if ( next && (OP(next) != WHILEM) && next < last)
4046 is_par = 0; /* Disable optimization */
40d049e4 4047 }
8aa23a47
YO
4048 if (data)
4049 *(data->last_closep) = ARG(scan);
4050 }
4051 else if (OP(scan) == EVAL) {
c277df42
IZ
4052 if (data)
4053 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4054 }
4055 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4056 if (flags & SCF_DO_SUBSTR) {
304ee84b 4057 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4058 flags &= ~SCF_DO_SUBSTR;
40d049e4 4059 }
8aa23a47
YO
4060 if (data && OP(scan)==ACCEPT) {
4061 data->flags |= SCF_SEEN_ACCEPT;
4062 if (stopmin > min)
4063 stopmin = min;
e2e6a0f1 4064 }
8aa23a47
YO
4065 }
4066 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067 {
0f5d15d6 4068 if (flags & SCF_DO_SUBSTR) {
304ee84b 4069 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4070 data->longest = &(data->longest_float);
4071 }
4072 is_inf = is_inf_internal = 1;
653099ff 4073 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4074 cl_anything(pRExC_state, data->start_class);
96776eda 4075 flags &= ~SCF_DO_STCLASS;
8aa23a47 4076 }
58e23c8d 4077 else if (OP(scan) == GPOS) {
bbe252da 4078 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4079 !(delta || is_inf || (data && data->pos_delta)))
4080 {
bbe252da
YO
4081 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4082 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4083 if (RExC_rx->gofs < (U32)min)
4084 RExC_rx->gofs = min;
4085 } else {
bbe252da 4086 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4087 RExC_rx->gofs = 0;
4088 }
4089 }
786e8c11 4090#ifdef TRIE_STUDY_OPT
40d049e4 4091#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4092 else if (PL_regkind[OP(scan)] == TRIE) {
4093 /* NOTE - There is similar code to this block above for handling
4094 BRANCH nodes on the initial study. If you change stuff here
4095 check there too. */
4096 regnode *trie_node= scan;
4097 regnode *tail= regnext(scan);
f8fc2ecf 4098 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4099 I32 max1 = 0, min1 = I32_MAX;
4100 struct regnode_charclass_class accum;
4101
4102 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4103 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4104 if (flags & SCF_DO_STCLASS)
4105 cl_init_zero(pRExC_state, &accum);
4106
4107 if (!trie->jump) {
4108 min1= trie->minlen;
4109 max1= trie->maxlen;
4110 } else {
4111 const regnode *nextbranch= NULL;
4112 U32 word;
4113
4114 for ( word=1 ; word <= trie->wordcount ; word++)
4115 {
4116 I32 deltanext=0, minnext=0, f = 0, fake;
4117 struct regnode_charclass_class this_class;
4118
4119 data_fake.flags = 0;
4120 if (data) {
4121 data_fake.whilem_c = data->whilem_c;
4122 data_fake.last_closep = data->last_closep;
4123 }
4124 else
4125 data_fake.last_closep = &fake;
58e23c8d 4126 data_fake.pos_delta = delta;
8aa23a47
YO
4127 if (flags & SCF_DO_STCLASS) {
4128 cl_init(pRExC_state, &this_class);
4129 data_fake.start_class = &this_class;
4130 f = SCF_DO_STCLASS_AND;
4131 }
4132 if (flags & SCF_WHILEM_VISITED_POS)
4133 f |= SCF_WHILEM_VISITED_POS;
4134
4135 if (trie->jump[word]) {
4136 if (!nextbranch)
4137 nextbranch = trie_node + trie->jump[0];
4138 scan= trie_node + trie->jump[word];
4139 /* We go from the jump point to the branch that follows
4140 it. Note this means we need the vestigal unused branches
4141 even though they arent otherwise used.
4142 */
4143 minnext = study_chunk(pRExC_state, &scan, minlenp,
4144 &deltanext, (regnode *)nextbranch, &data_fake,
4145 stopparen, recursed, NULL, f,depth+1);
4146 }
4147 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4148 nextbranch= regnext((regnode*)nextbranch);
4149
4150 if (min1 > (I32)(minnext + trie->minlen))
4151 min1 = minnext + trie->minlen;
4152 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4153 max1 = minnext + deltanext + trie->maxlen;
4154 if (deltanext == I32_MAX)
4155 is_inf = is_inf_internal = 1;
4156
4157 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158 pars++;
4159 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4160 if ( stopmin > min + min1)
4161 stopmin = min + min1;
4162 flags &= ~SCF_DO_SUBSTR;
4163 if (data)
4164 data->flags |= SCF_SEEN_ACCEPT;
4165 }
4166 if (data) {
4167 if (data_fake.flags & SF_HAS_EVAL)
4168 data->flags |= SF_HAS_EVAL;
4169 data->whilem_c = data_fake.whilem_c;
4170 }
4171 if (flags & SCF_DO_STCLASS)
4172 cl_or(pRExC_state, &accum, &this_class);
4173 }
4174 }
4175 if (flags & SCF_DO_SUBSTR) {
4176 data->pos_min += min1;
4177 data->pos_delta += max1 - min1;
4178 if (max1 != min1 || is_inf)
4179 data->longest = &(data->longest_float);
4180 }
4181 min += min1;
4182 delta += max1 - min1;
4183 if (flags & SCF_DO_STCLASS_OR) {
4184 cl_or(pRExC_state, data->start_class, &accum);
4185 if (min1) {
4186 cl_and(data->start_class, and_withp);
4187 flags &= ~SCF_DO_STCLASS;
4188 }
4189 }
4190 else if (flags & SCF_DO_STCLASS_AND) {
4191 if (min1) {
4192 cl_and(data->start_class, &accum);
4193 flags &= ~SCF_DO_STCLASS;
4194 }
4195 else {
4196 /* Switch to OR mode: cache the old value of
4197 * data->start_class */
4198 INIT_AND_WITHP;
4199 StructCopy(data->start_class, and_withp,
4200 struct regnode_charclass_class);
4201 flags &= ~SCF_DO_STCLASS_AND;
4202 StructCopy(&accum, data->start_class,
4203 struct regnode_charclass_class);
4204 flags |= SCF_DO_STCLASS_OR;
4205 data->start_class->flags |= ANYOF_EOS;
4206 }
4207 }
4208 scan= tail;
4209 continue;
4210 }
786e8c11 4211#else
8aa23a47 4212 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4213 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4214 U8*bang=NULL;
4215
4216 min += trie->minlen;
4217 delta += (trie->maxlen - trie->minlen);
4218 flags &= ~SCF_DO_STCLASS; /* xxx */
4219 if (flags & SCF_DO_SUBSTR) {
304ee84b 4220 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4221 data->pos_min += trie->minlen;
4222 data->pos_delta += (trie->maxlen - trie->minlen);
4223 if (trie->maxlen != trie->minlen)
4224 data->longest = &(data->longest_float);
4225 }
4226 if (trie->jump) /* no more substrings -- for now /grr*/
4227 flags &= ~SCF_DO_SUBSTR;
b515a41d 4228 }
8aa23a47
YO
4229#endif /* old or new */
4230#endif /* TRIE_STUDY_OPT */
e1d1eefb 4231
8aa23a47
YO
4232 /* Else: zero-length, ignore. */
4233 scan = regnext(scan);
4234 }
4235 if (frame) {
4236 last = frame->last;
4237 scan = frame->next;
4238 stopparen = frame->stop;
4239 frame = frame->prev;
4240 goto fake_study_recurse;
c277df42
IZ
4241 }
4242
4243 finish:
8aa23a47 4244 assert(!frame);
304ee84b 4245 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4246
c277df42 4247 *scanp = scan;
aca2d497 4248 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4249 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4250 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4251 if (is_par > (I32)U8_MAX)
c277df42
IZ
4252 is_par = 0;
4253 if (is_par && pars==1 && data) {
4254 data->flags |= SF_IN_PAR;
4255 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4256 }
4257 else if (pars && data) {
c277df42
IZ
4258 data->flags |= SF_HAS_PAR;
4259 data->flags &= ~SF_IN_PAR;
4260 }
653099ff 4261 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4262 cl_and(data->start_class, and_withp);
786e8c11
YO
4263 if (flags & SCF_TRIE_RESTUDY)
4264 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4265
304ee84b 4266 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4267
e2e6a0f1 4268 return min < stopmin ? min : stopmin;
c277df42
IZ
4269}
4270
2eccd3b2
NC
4271STATIC U32
4272S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4273{
4a4e7719
NC
4274 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275
7918f24d
NC
4276 PERL_ARGS_ASSERT_ADD_DATA;
4277
4a4e7719
NC
4278 Renewc(RExC_rxi->data,
4279 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4280 char, struct reg_data);
4281 if(count)
f8fc2ecf 4282 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4283 else
f8fc2ecf 4284 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4285 RExC_rxi->data->count = count + n;
4286 Copy(s, RExC_rxi->data->what + count, n, U8);
4287 return count;
c277df42
IZ
4288}
4289
f8149455 4290/*XXX: todo make this not included in a non debugging perl */
76234dfb 4291#ifndef PERL_IN_XSUB_RE
d88dccdf 4292void
864dbfa3 4293Perl_reginitcolors(pTHX)
d88dccdf 4294{
97aff369 4295 dVAR;
1df70142 4296 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4297 if (s) {
1df70142
AL
4298 char *t = savepv(s);
4299 int i = 0;
4300 PL_colors[0] = t;
d88dccdf 4301 while (++i < 6) {
1df70142
AL
4302 t = strchr(t, '\t');
4303 if (t) {
4304 *t = '\0';
4305 PL_colors[i] = ++t;
d88dccdf
IZ
4306 }
4307 else
1df70142 4308 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4309 }
4310 } else {
1df70142 4311 int i = 0;
b81d288d 4312 while (i < 6)
06b5626a 4313 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4314 }
4315 PL_colorset = 1;
4316}
76234dfb 4317#endif
8615cb43 4318
07be1b83 4319
786e8c11
YO
4320#ifdef TRIE_STUDY_OPT
4321#define CHECK_RESTUDY_GOTO \
4322 if ( \
4323 (data.flags & SCF_TRIE_RESTUDY) \
4324 && ! restudied++ \
4325 ) goto reStudy
4326#else
4327#define CHECK_RESTUDY_GOTO
4328#endif
f9f4320a 4329
a687059c 4330/*
e50aee73 4331 - pregcomp - compile a regular expression into internal code
a687059c
LW
4332 *
4333 * We can't allocate space until we know how big the compiled form will be,
4334 * but we can't compile it (and thus know how big it is) until we've got a
4335 * place to put the code. So we cheat: we compile it twice, once with code
4336 * generation turned off and size counting turned on, and once "for real".
4337 * This also means that we don't allocate space until we are sure that the
4338 * thing really will compile successfully, and we never have to move the
4339 * code and thus invalidate pointers into it. (Note that it has to be in
4340 * one piece because free() must be able to free it all.) [NB: not true in perl]
4341 *
4342 * Beware that the optimization-preparation code in here knows about some
4343 * of the structure of the compiled regexp. [I'll say.]
4344 */
b9b4dddf
YO
4345
4346
4347
f9f4320a 4348#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4349#define RE_ENGINE_PTR &PL_core_reg_engine
4350#else
f9f4320a
YO
4351extern const struct regexp_engine my_reg_engine;
4352#define RE_ENGINE_PTR &my_reg_engine
4353#endif
6d5c990f
RGS
4354
4355#ifndef PERL_IN_XSUB_RE
3ab4a224 4356REGEXP *
1593ad57 4357Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4358{
97aff369 4359 dVAR;
6d5c990f 4360 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4361
4362 PERL_ARGS_ASSERT_PREGCOMP;
4363
f9f4320a
YO
4364 /* Dispatch a request to compile a regexp to correct
4365 regexp engine. */
f9f4320a
YO
4366 if (table) {
4367 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4368 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4369 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4370 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371 DEBUG_COMPILE_r({
8d8756e7 4372 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4373 SvIV(*ptr));
4374 });
3ab4a224 4375 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4376 }
b9b4dddf 4377 }
3ab4a224 4378 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4379}
6d5c990f 4380#endif
2a5d9b1d 4381
3ab4a224 4382REGEXP *
29b09c41 4383Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4384{
4385 dVAR;
288b8c02
NC
4386 REGEXP *rx;
4387 struct regexp *r;
f8fc2ecf 4388 register regexp_internal *ri;
3ab4a224 4389 STRLEN plen;
5d51ce98
KW
4390 char *exp;
4391 char* xend;
c277df42 4392 regnode *scan;
a0d0e21e 4393 I32 flags;
a0d0e21e 4394 I32 minlen = 0;
29b09c41 4395 U32 pm_flags;
e7f38d0f
YO
4396
4397 /* these are all flags - maybe they should be turned
4398 * into a single int with different bit masks */
4399 I32 sawlookahead = 0;
a0d0e21e
LW
4400 I32 sawplus = 0;
4401 I32 sawopen = 0;
29b09c41 4402 bool used_setjump = FALSE;
e7f38d0f 4403
bbd61b5f
KW
4404 U8 jump_ret = 0;
4405 dJMPENV;
2c2d71f5 4406 scan_data_t data;
830247a4 4407 RExC_state_t RExC_state;
be8e71aa 4408 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4409#ifdef TRIE_STUDY_OPT
5d51ce98 4410 int restudied;
07be1b83
YO
4411 RExC_state_t copyRExC_state;
4412#endif
2a5d9b1d 4413 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4414
4415 PERL_ARGS_ASSERT_RE_COMPILE;
4416
6d5c990f 4417 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4418
29b09c41 4419 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
7b597bb8 4420
d6bd454d 4421 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4422 /* Longjmp back to here if have to switch in midstream to utf8 */
4423 if (! RExC_orig_utf8) {
4424 JMPENV_PUSH(jump_ret);
29b09c41 4425 used_setjump = TRUE;
bbd61b5f
KW
4426 }
4427
5d51ce98 4428 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4429 exp = SvPV(pattern, plen);
4430 xend = exp + plen;
4431 /* ignore the utf8ness if the pattern is 0 length */
4432 if (plen == 0) {
4433 RExC_utf8 = RExC_orig_utf8 = 0;
4434 }
4435
5d51ce98
KW
4436 DEBUG_COMPILE_r({
4437 SV *dsv= sv_newmortal();
4438 RE_PV_QUOTED_DECL(s, RExC_utf8,
4439 dsv, exp, plen, 60);
4440 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4441 PL_colors[4],PL_colors[5],s);
4442 });
4443 }
4444 else { /* longjumped back */
bbd61b5f
KW
4445 STRLEN len = plen;
4446
5d51ce98
KW
4447 /* If the cause for the longjmp was other than changing to utf8, pop
4448 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4449 if (jump_ret != UTF8_LONGJMP) {
4450 JMPENV_POP;
4451 JMPENV_JUMP(jump_ret);
4452 }
4453
595598ee
KW
4454 GET_RE_DEBUG_FLAGS;
4455
bbd61b5f
KW
4456 /* It's possible to write a regexp in ascii that represents Unicode
4457 codepoints outside of the byte range, such as via \x{100}. If we
4458 detect such a sequence we have to convert the entire pattern to utf8
4459 and then recompile, as our sizing calculation will have been based
4460 on 1 byte == 1 character, but we will need to use utf8 to encode
4461 at least some part of the pattern, and therefore must convert the whole
4462 thing.
4463 -- dmq */
4464 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4465 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4466 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4467 xend = exp + len;
4468 RExC_orig_utf8 = RExC_utf8 = 1;
4469 SAVEFREEPV(exp);
4470 }
4471
5d51ce98
KW
4472#ifdef TRIE_STUDY_OPT
4473 restudied = 0;
4474#endif
4475
29b09c41
KW
4476 /* Set to use unicode semantics if the pattern is in utf8 and has the
4477 * 'dual' charset specified, as it means unicode when utf8 */
4478 pm_flags = orig_pm_flags;
4479 if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4480 pm_flags |= RXf_PMf_UNICODE;
4481 }
4482
02daf0ab 4483 RExC_precomp = exp;
c737faaf 4484 RExC_flags = pm_flags;
830247a4 4485 RExC_sawback = 0;
bbce6d69 4486
830247a4
IZ
4487 RExC_seen = 0;
4488 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4489 RExC_seen_evals = 0;
4490 RExC_extralen = 0;
c277df42 4491
bbce6d69 4492 /* First pass: determine size, legality. */
830247a4 4493 RExC_parse = exp;
fac92740 4494 RExC_start = exp;
830247a4
IZ
4495 RExC_end = xend;
4496 RExC_naughty = 0;
4497 RExC_npar = 1;
e2e6a0f1 4498 RExC_nestroot = 0;
830247a4
IZ
4499 RExC_size = 0L;
4500 RExC_emit = &PL_regdummy;
4501 RExC_whilem_seen = 0;
40d049e4
YO
4502 RExC_open_parens = NULL;
4503 RExC_close_parens = NULL;
4504 RExC_opend = NULL;
81714fb9 4505 RExC_paren_names = NULL;
1f1031fe
YO
4506#ifdef DEBUGGING
4507 RExC_paren_name_list = NULL;
4508#endif
40d049e4
YO
4509 RExC_recurse = NULL;
4510 RExC_recurse_count = 0;
81714fb9 4511
85ddcde9
JH
4512#if 0 /* REGC() is (currently) a NOP at the first pass.
4513 * Clever compilers notice this and complain. --jhi */
830247a4 4514 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4515#endif
3dab1dad
YO
4516 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4517 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4518 RExC_precomp = NULL;
a0d0e21e
LW
4519 return(NULL);
4520 }
bbd61b5f 4521
29b09c41
KW
4522 /* Here, finished first pass. Get rid of any added setjmp */
4523 if (used_setjump) {
bbd61b5f 4524 JMPENV_POP;
02daf0ab 4525 }
07be1b83 4526 DEBUG_PARSE_r({
81714fb9
YO
4527 PerlIO_printf(Perl_debug_log,
4528 "Required size %"IVdf" nodes\n"
4529 "Starting second pass (creation)\n",
4530 (IV)RExC_size);
07be1b83
YO
4531 RExC_lastnum=0;
4532 RExC_lastparse=NULL;
4533 });
c277df42
IZ
4534 /* Small enough for pointer-storage convention?
4535 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4536 if (RExC_size >= 0x10000L && RExC_extralen)
4537 RExC_size += RExC_extralen;
c277df42 4538 else
830247a4
IZ
4539 RExC_extralen = 0;
4540 if (RExC_whilem_seen > 15)
4541 RExC_whilem_seen = 15;
a0d0e21e 4542
f9f4320a
YO
4543 /* Allocate space and zero-initialize. Note, the two step process
4544 of zeroing when in debug mode, thus anything assigned has to
4545 happen after that */
d2f13c59 4546 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4547 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4548 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4549 char, regexp_internal);
4550 if ( r == NULL || ri == NULL )
b45f050a 4551 FAIL("Regexp out of space");
0f79a09d
GS
4552#ifdef DEBUGGING
4553 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4554 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4555#else
f8fc2ecf
YO
4556 /* bulk initialize base fields with 0. */
4557 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4558#endif
58e23c8d
YO
4559
4560 /* non-zero initialization begins here */
f8fc2ecf 4561 RXi_SET( r, ri );
f9f4320a 4562 r->engine= RE_ENGINE_PTR;
c737faaf 4563 r->extflags = pm_flags;
bcdf7404 4564 {
f7819f85 4565 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4566 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4567
4568 /* The caret is output if there are any defaults: if not all the STD
4569 * flags are set, or if no character set specifier is needed */
4570 bool has_default =
4571 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4572 || ! has_charset);
bcdf7404 4573 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4574 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4575 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4576 const char *fptr = STD_PAT_MODS; /*"msix"*/
4577 char *p;
fb85c044 4578 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4579 * on. If more precision is desired, we could do a population count of
4580 * the flags set. This could be done with a small lookup table, or by
4581 * shifting, masking and adding, or even, when available, assembly
4582 * language for a machine-language population count.
4583 * We never output a minus, as all those are defaults, so are
4584 * covered by the caret */
fb85c044 4585 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4586 + has_default /* If needs a caret */
4587 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4588 + (sizeof(STD_PAT_MODS) - 1)
4589 + (sizeof("(?:)") - 1);
4590
c5ea2ffa 4591 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4592 SvPOK_on(rx);
8f6ae13c 4593 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4594 *p++='('; *p++='?';
9de15fec
KW
4595
4596 /* If a default, cover it using the caret */
c5ea2ffa 4597 if (has_default) {
85508812 4598 *p++= DEFAULT_PAT_MOD;
fb85c044 4599 }
c5ea2ffa
KW
4600 if (has_charset) {
4601 if (r->extflags & RXf_PMf_LOCALE) {
4602 *p++ = LOCALE_PAT_MOD;
4603 } else {
4604 *p++ = UNICODE_PAT_MOD;
4605 }
9de15fec 4606 }
f7819f85
A
4607 if (has_p)
4608 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4609 {
bcdf7404 4610 char ch;
bcdf7404
YO
4611 while((ch = *fptr++)) {
4612 if(reganch & 1)
4613 *p++ = ch;
bcdf7404
YO
4614 reganch >>= 1;
4615 }
bcdf7404
YO
4616 }
4617
28d8d7f4 4618 *p++ = ':';
bb661a58 4619 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4620 assert ((RX_WRAPPED(rx) - p) < 16);
4621 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4622 p += plen;
bcdf7404 4623 if (has_runon)
28d8d7f4
YO
4624 *p++ = '\n';
4625 *p++ = ')';
4626 *p = 0;
fb85c044 4627 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4628 }
4629
bbe252da 4630 r->intflags = 0;
830247a4 4631 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4632
6bda09f9 4633 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4634 Newxz(RExC_open_parens, RExC_npar,regnode *);
4635 SAVEFREEPV(RExC_open_parens);
4636 Newxz(RExC_close_parens,RExC_npar,regnode *);
4637 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4638 }
4639
4640 /* Useful during FAIL. */
7122b237
YO
4641#ifdef RE_TRACK_PATTERN_OFFSETS
4642 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4643 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4644 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4645 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4646 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4647#endif
4648 SetProgLen(ri,RExC_size);
288b8c02 4649 RExC_rx_sv = rx;
830247a4 4650 RExC_rx = r;
f8fc2ecf 4651 RExC_rxi = ri;
bbce6d69 4652
4653 /* Second pass: emit code. */
c737faaf 4654 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4655 RExC_parse = exp;
4656 RExC_end = xend;
4657 RExC_naughty = 0;
4658 RExC_npar = 1;
f8fc2ecf
YO
4659 RExC_emit_start = ri->program;
4660 RExC_emit = ri->program;
3b57cd43
YO
4661 RExC_emit_bound = ri->program + RExC_size + 1;
4662
2cd61cdb 4663 /* Store the count of eval-groups for security checks: */
f8149455 4664 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4665 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4666 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4667 ReREFCNT_dec(rx);
a0d0e21e 4668 return(NULL);
80757612 4669 }
07be1b83
YO
4670 /* XXXX To minimize changes to RE engine we always allocate
4671 3-units-long substrs field. */
4672 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4673 if (RExC_recurse_count) {
4674 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4675 SAVEFREEPV(RExC_recurse);
4676 }
a0d0e21e 4677
07be1b83 4678reStudy:
e7f38d0f 4679 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4680 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4681
07be1b83 4682#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4683 if (!restudied) {
4684 StructCopy(&zero_scan_data, &data, scan_data_t);
4685 copyRExC_state = RExC_state;
4686 } else {
5d458dd8 4687 U32 seen=RExC_seen;
07be1b83 4688 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4689
4690 RExC_state = copyRExC_state;
4691 if (seen & REG_TOP_LEVEL_BRANCHES)
4692 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4693 else
4694 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4695 if (data.last_found) {
07be1b83 4696 SvREFCNT_dec(data.longest_fixed);
07be1b83 4697 SvREFCNT_dec(data.longest_float);
07be1b83 4698 SvREFCNT_dec(data.last_found);
1de06328 4699 }
40d049e4 4700 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4701 }
40d049e4
YO
4702#else
4703 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4704#endif
fc8cd66c 4705
a0d0e21e 4706 /* Dig out information for optimizations. */
f7819f85 4707 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4708 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4709
a0ed51b3 4710 if (UTF)
8f6ae13c 4711 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4712 ri->regstclass = NULL;
830247a4 4713 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4714 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4715 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4716
1de06328
YO
4717 /* testing for BRANCH here tells us whether there is "must appear"
4718 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4719 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4720 I32 fake;
c5254dd6 4721 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4722 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4723 int stclass_flag;
07be1b83 4724 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4725 regnode *first= scan;
4726 regnode *first_next= regnext(first);
639081d6
YO
4727 /*
4728 * Skip introductions and multiplicators >= 1
4729 * so that we can extract the 'meat' of the pattern that must
4730 * match in the large if() sequence following.
4731 * NOTE that EXACT is NOT covered here, as it is normally
4732 * picked up by the optimiser separately.
4733 *
4734 * This is unfortunate as the optimiser isnt handling lookahead
4735 * properly currently.
4736 *
4737 */
a0d0e21e 4738 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4739 /* An OR of *one* alternative - should not happen now. */
5339e136 4740 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4741 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4742 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4743 (OP(first) == PLUS) ||
4744 (OP(first) == MINMOD) ||
653099ff 4745 /* An {n,m} with n>0 */
5339e136
YO
4746 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4747 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4748 {
639081d6
YO
4749 /*
4750 * the only op that could be a regnode is PLUS, all the rest
4751 * will be regnode_1 or regnode_2.
4752 *
4753 */
a0d0e21e
LW
4754 if (OP(first) == PLUS)
4755 sawplus = 1;
4756 else
3dab1dad 4757 first += regarglen[OP(first)];
639081d6
YO
4758
4759 first = NEXTOPER(first);
5339e136 4760 first_next= regnext(first);
a687059c
LW
4761 }
4762
a0d0e21e
LW
4763 /* Starting-point info. */
4764 again:
786e8c11 4765 DEBUG_PEEP("first:",first,0);
07be1b83 4766 /* Ignore EXACT as we deal with it later. */
3dab1dad 4767 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4768 if (OP(first) == EXACT)
6f207bd3 4769 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4770 else
f8fc2ecf 4771 ri->regstclass = first;
b3c9acc1 4772 }
07be1b83 4773#ifdef TRIE_STCLASS
786e8c11 4774 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4775 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4776 {
786e8c11 4777 regnode *trie_op;
07be1b83 4778 /* this can happen only on restudy */
786e8c11 4779 if ( OP(first) == TRIE ) {
c944940b 4780 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4781 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4782 StructCopy(first,trieop,struct regnode_1);
4783 trie_op=(regnode *)trieop;
4784 } else {
c944940b 4785 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4786 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4787 StructCopy(first,trieop,struct regnode_charclass);
4788 trie_op=(regnode *)trieop;
4789 }
1de06328 4790 OP(trie_op)+=2;
786e8c11 4791 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4792 ri->regstclass = trie_op;
07be1b83
YO
4793 }
4794#endif
e52fc539 4795 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4796 ri->regstclass = first;
3dab1dad
YO
4797 else if (PL_regkind[OP(first)] == BOUND ||
4798 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4799 ri->regstclass = first;
3dab1dad 4800 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4801 r->extflags |= (OP(first) == MBOL
4802 ? RXf_ANCH_MBOL
cad2e5aa 4803 : (OP(first) == SBOL
bbe252da
YO
4804 ? RXf_ANCH_SBOL
4805 : RXf_ANCH_BOL));
a0d0e21e 4806 first = NEXTOPER(first);
774d564b 4807 goto again;
4808 }
4809 else if (OP(first) == GPOS) {
bbe252da 4810 r->extflags |= RXf_ANCH_GPOS;
774d564b 4811 first = NEXTOPER(first);
4812 goto again;
a0d0e21e 4813 }
cf2a2b69
YO
4814 else if ((!sawopen || !RExC_sawback) &&
4815 (OP(first) == STAR &&
3dab1dad 4816 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4817 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4818 {
4819 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4820 const int type =
4821 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4822 ? RXf_ANCH_MBOL
4823 : RXf_ANCH_SBOL;
4824 r->extflags |= type;
4825 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4826 first = NEXTOPER(first);
774d564b 4827 goto again;
a0d0e21e 4828 }
e7f38d0f 4829 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4830 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4831 /* x+ must match at the 1st pos of run of x's */
bbe252da 4832 r->intflags |= PREGf_SKIP;
a0d0e21e 4833
c277df42 4834 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4835#ifdef TRIE_STUDY_OPT
81714fb9 4836 DEBUG_PARSE_r(
be8e71aa
YO
4837 if (!restudied)
4838 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4839 (IV)(first - scan + 1))
4840 );
4841#else
81714fb9 4842 DEBUG_PARSE_r(
be8e71aa
YO
4843 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4844 (IV)(first - scan + 1))
4845 );
4846#endif
4847
4848
a0d0e21e
LW
4849 /*
4850 * If there's something expensive in the r.e., find the
4851 * longest literal string that must appear and make it the
4852 * regmust. Resolve ties in favor of later strings, since
4853 * the regstart check works with the beginning of the r.e.
4854 * and avoiding duplication strengthens checking. Not a
4855 * strong reason, but sufficient in the absence of others.
4856 * [Now we resolve ties in favor of the earlier string if
c277df42 4857 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4858 * earlier string may buy us something the later one won't.]
4859 */
de8c5301 4860
396482e1
GA
4861 data.longest_fixed = newSVpvs("");
4862 data.longest_float = newSVpvs("");
4863 data.last_found = newSVpvs("");
c277df42
IZ
4864 data.longest = &(data.longest_fixed);
4865 first = scan;
f8fc2ecf 4866 if (!ri->regstclass) {
830247a4 4867 cl_init(pRExC_state, &ch_class);
653099ff
GS
4868 data.start_class = &ch_class;
4869 stclass_flag = SCF_DO_STCLASS_AND;
4870 } else /* XXXX Check for BOUND? */
4871 stclass_flag = 0;
cb434fcc 4872 data.last_closep = &last_close;
de8c5301 4873
1de06328 4874 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4875 &data, -1, NULL, NULL,
4876 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4877
07be1b83 4878
786e8c11
YO
4879 CHECK_RESTUDY_GOTO;
4880
4881
830247a4 4882 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4883 && data.last_start_min == 0 && data.last_end > 0
830247a4 4884 && !RExC_seen_zerolen
2bf803e2 4885 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4886 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4887 r->extflags |= RXf_CHECK_ALL;
304ee84b 4888 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4889 SvREFCNT_dec(data.last_found);
4890
1de06328
YO
4891 /* Note that code very similar to this but for anchored string
4892 follows immediately below, changes may need to be made to both.
4893 Be careful.
4894 */
a0ed51b3 4895 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4896 if (longest_float_length
c277df42
IZ
4897 || (data.flags & SF_FL_BEFORE_EOL
4898 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4899 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4900 {
1182767e 4901 I32 t,ml;
cf93c79d 4902
1de06328 4903 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4904 && data.offset_fixed == data.offset_float_min
4905 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4906 goto remove_float; /* As in (a)+. */
4907
1de06328
YO
4908 /* copy the information about the longest float from the reg_scan_data
4909 over to the program. */
33b8afdf
JH
4910 if (SvUTF8(data.longest_float)) {
4911 r->float_utf8 = data.longest_float;
c445ea15 4912 r->float_substr = NULL;
33b8afdf
JH
4913 } else {
4914 r->float_substr = data.longest_float;
c445ea15 4915 r->float_utf8 = NULL;
33b8afdf 4916 }
1de06328
YO
4917 /* float_end_shift is how many chars that must be matched that
4918 follow this item. We calculate it ahead of time as once the
4919 lookbehind offset is added in we lose the ability to correctly
4920 calculate it.*/
4921 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4922 : (I32)longest_float_length;
1de06328
YO
4923 r->float_end_shift = ml - data.offset_float_min
4924 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4925 + data.lookbehind_float;
4926 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4927 r->float_max_offset = data.offset_float_max;
1182767e 4928 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4929 r->float_max_offset -= data.lookbehind_float;
4930
cf93c79d
IZ
4931 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4932 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4933 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4934 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4935 }
4936 else {
aca2d497 4937 remove_float:
c445ea15 4938 r->float_substr = r->float_utf8 = NULL;
c277df42 4939 SvREFCNT_dec(data.longest_float);
c5254dd6 4940 longest_float_length = 0;
a0d0e21e 4941 }
c277df42 4942
1de06328
YO
4943 /* Note that code very similar to this but for floating string
4944 is immediately above, changes may need to be made to both.
4945 Be careful.
4946 */
a0ed51b3 4947 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4948 if (longest_fixed_length
c277df42
IZ
4949 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4950 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4951 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4952 {
1182767e 4953 I32 t,ml;
cf93c79d 4954
1de06328
YO
4955 /* copy the information about the longest fixed
4956 from the reg_scan_data over to the program. */
33b8afdf
JH
4957 if (SvUTF8(data.longest_fixed)) {
4958 r->anchored_utf8 = data.longest_fixed;
c445ea15 4959 r->anchored_substr = NULL;
33b8afdf
JH
4960 } else {
4961 r->anchored_substr = data.longest_fixed;
c445ea15 4962 r->anchored_utf8 = NULL;
33b8afdf 4963 }
1de06328
YO
4964 /* fixed_end_shift is how many chars that must be matched that
4965 follow this item. We calculate it ahead of time as once the
4966 lookbehind offset is added in we lose the ability to correctly
4967 calculate it.*/
4968 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4969 : (I32)longest_fixed_length;
1de06328
YO
4970 r->anchored_end_shift = ml - data.offset_fixed
4971 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4972 + data.lookbehind_fixed;
4973 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4974
cf93c79d
IZ
4975 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4976 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4977 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4978 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4979 }
4980 else {
c445ea15 4981 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4982 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4983 longest_fixed_length = 0;
a0d0e21e 4984 }
f8fc2ecf
YO
4985 if (ri->regstclass
4986 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4987 ri->regstclass = NULL;
33b8afdf
JH
4988 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4989 && stclass_flag
653099ff 4990 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4991 && !cl_is_anything(data.start_class))
4992 {
2eccd3b2 4993 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4994
f8fc2ecf 4995 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4996 struct regnode_charclass_class);
4997 StructCopy(data.start_class,
f8fc2ecf 4998 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4999 struct regnode_charclass_class);
f8fc2ecf 5000 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5001 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5002 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5003 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5004 PerlIO_printf(Perl_debug_log,
a0288114 5005 "synthetic stclass \"%s\".\n",
3f7c398e 5006 SvPVX_const(sv));});
653099ff 5007 }
c277df42
IZ
5008
5009 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5010 if (longest_fixed_length > longest_float_length) {
1de06328 5011 r->check_end_shift = r->anchored_end_shift;
c277df42 5012 r->check_substr = r->anchored_substr;
33b8afdf 5013 r->check_utf8 = r->anchored_utf8;
c277df42 5014 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5015 if (r->extflags & RXf_ANCH_SINGLE)
5016 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5017 }
5018 else {
1de06328 5019 r->check_end_shift = r->float_end_shift;
c277df42 5020 r->check_substr = r->float_substr;
33b8afdf 5021 r->check_utf8 = r->float_utf8;
1de06328
YO
5022 r->check_offset_min = r->float_min_offset;
5023 r->check_offset_max = r->float_max_offset;
a0d0e21e 5024 }
30382c73
IZ
5025 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5026 This should be changed ASAP! */
bbe252da
YO
5027 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5028 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5029 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5030 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5031 }
1de06328
YO
5032 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5033 if ( (STRLEN)minlen < longest_float_length )
5034 minlen= longest_float_length;
5035 if ( (STRLEN)minlen < longest_fixed_length )
5036 minlen= longest_fixed_length;
5037 */
a0ed51b3
LW
5038 }
5039 else {
c277df42
IZ
5040 /* Several toplevels. Best we can is to set minlen. */
5041 I32 fake;
653099ff 5042 struct regnode_charclass_class ch_class;
cb434fcc 5043 I32 last_close = 0;
c277df42 5044
5d458dd8 5045 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5046
f8fc2ecf 5047 scan = ri->program + 1;
830247a4 5048 cl_init(pRExC_state, &ch_class);
653099ff 5049 data.start_class = &ch_class;
cb434fcc 5050 data.last_closep = &last_close;
07be1b83 5051
de8c5301 5052
1de06328 5053 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5054 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5055
786e8c11 5056 CHECK_RESTUDY_GOTO;
07be1b83 5057
33b8afdf 5058 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5059 = r->float_substr = r->float_utf8 = NULL;
653099ff 5060 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5061 && !cl_is_anything(data.start_class))
5062 {
2eccd3b2 5063 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5064
f8fc2ecf 5065 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5066 struct regnode_charclass_class);
5067 StructCopy(data.start_class,
f8fc2ecf 5068 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5069 struct regnode_charclass_class);
f8fc2ecf 5070 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5071 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5072 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5073 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5074 PerlIO_printf(Perl_debug_log,
a0288114 5075 "synthetic stclass \"%s\".\n",
3f7c398e 5076 SvPVX_const(sv));});
653099ff 5077 }
a0d0e21e
LW
5078 }
5079
1de06328
YO
5080 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5081 the "real" pattern. */
cf9788e3
RGS
5082 DEBUG_OPTIMISE_r({
5083 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5084 (IV)minlen, (IV)r->minlen);
cf9788e3 5085 });
de8c5301 5086 r->minlenret = minlen;
1de06328
YO
5087 if (r->minlen < minlen)
5088 r->minlen = minlen;
5089
b81d288d 5090 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5091 r->extflags |= RXf_GPOS_SEEN;
830247a4 5092 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5093 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5094 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5095 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5096 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5097 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5098 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5099 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5100 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5101 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5102 if (RExC_paren_names)
85fbaab2 5103 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5104 else
5daac39c 5105 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5106
7bd1e614 5107#ifdef STUPID_PATTERN_CHECKS
5509d87a 5108 if (RX_PRELEN(rx) == 0)
640f820d 5109 r->extflags |= RXf_NULL;
5509d87a 5110 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5111 /* XXX: this should happen BEFORE we compile */
5112 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5113 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5114 r->extflags |= RXf_WHITE;
5509d87a 5115 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5116 r->extflags |= RXf_START_ONLY;
f1b875a0 5117#else
5509d87a 5118 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5119 /* XXX: this should happen BEFORE we compile */
5120 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5121 else {
5122 regnode *first = ri->program + 1;
39aa8307
JH
5123 U8 fop = OP(first);
5124 U8 nop = OP(NEXTOPER(first));
7bd1e614 5125
640f820d
AB
5126 if (PL_regkind[fop] == NOTHING && nop == END)
5127 r->extflags |= RXf_NULL;
5128 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5129 r->extflags |= RXf_START_ONLY;
5130 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5131 r->extflags |= RXf_WHITE;
5132 }
f1b875a0 5133#endif
1f1031fe
YO
5134#ifdef DEBUGGING
5135 if (RExC_paren_names) {
af534a04 5136 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5137 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5138 } else
1f1031fe 5139#endif
cde0cee5 5140 ri->name_list_idx = 0;
1f1031fe 5141
40d049e4
YO
5142 if (RExC_recurse_count) {
5143 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5144 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5145 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5146 }
5147 }
f0ab9afb 5148 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5149 /* assume we don't need to swap parens around before we match */
5150
be8e71aa
YO
5151 DEBUG_DUMP_r({
5152 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5153 regdump(r);
5154 });
7122b237
YO
5155#ifdef RE_TRACK_PATTERN_OFFSETS
5156 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5157 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5158 U32 i;
5159 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5160 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5161 for (i = 1; i <= len; i++) {
7122b237 5162 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5163 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5164 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5165 }
5166 PerlIO_printf(Perl_debug_log, "\n");
5167 });
7122b237 5168#endif
288b8c02 5169 return rx;
a687059c
LW
5170}
5171
f9f4320a 5172#undef RE_ENGINE_PTR
3dab1dad 5173
93b32b6d 5174
81714fb9 5175SV*
192b9cd1
AB
5176Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5177 const U32 flags)
5178{
7918f24d
NC
5179 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5180
192b9cd1
AB
5181 PERL_UNUSED_ARG(value);
5182
f1b875a0 5183 if (flags & RXapif_FETCH) {
192b9cd1 5184 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5185 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5186 Perl_croak_no_modify(aTHX);
192b9cd1 5187 return NULL;
f1b875a0 5188 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5189 return reg_named_buff_exists(rx, key, flags)
5190 ? &PL_sv_yes
5191 : &PL_sv_no;
f1b875a0 5192 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5193 return reg_named_buff_all(rx, flags);
f1b875a0 5194 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5195 return reg_named_buff_scalar(rx, flags);
5196 } else {
5197 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5198 return NULL;
5199 }
5200}
5201
5202SV*
5203Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5204 const U32 flags)
5205{
7918f24d 5206 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5207 PERL_UNUSED_ARG(lastkey);
5208
f1b875a0 5209 if (flags & RXapif_FIRSTKEY)
192b9cd1 5210 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5211 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5212 return reg_named_buff_nextkey(rx, flags);
5213 else {
5214 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5215 return NULL;
5216 }
5217}
5218
5219SV*
288b8c02
NC
5220Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5221 const U32 flags)
81714fb9 5222{
44a2ac75
YO
5223 AV *retarray = NULL;
5224 SV *ret;
288b8c02 5225 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5226
5227 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5228
f1b875a0 5229 if (flags & RXapif_ALL)
44a2ac75 5230 retarray=newAV();
93b32b6d 5231
5daac39c
NC
5232 if (rx && RXp_PAREN_NAMES(rx)) {
5233 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5234 if (he_str) {
5235 IV i;
5236 SV* sv_dat=HeVAL(he_str);
5237 I32 *nums=(I32*)SvPVX(sv_dat);
5238 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5239 if ((I32)(rx->nparens) >= nums[i]
5240 && rx->offs[nums[i]].start != -1
5241 && rx->offs[nums[i]].end != -1)
93b32b6d 5242 {
49d7dfbc 5243 ret = newSVpvs("");
288b8c02 5244 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5245 if (!retarray)
5246 return ret;
5247 } else {
5248 ret = newSVsv(&PL_sv_undef);
5249 }
ec83ea38 5250 if (retarray)
93b32b6d 5251 av_push(retarray, ret);
81714fb9 5252 }
93b32b6d 5253 if (retarray)
ad64d0ec 5254 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5255 }
5256 }
5257 return NULL;
5258}
5259
5260bool
288b8c02 5261Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5262 const U32 flags)
5263{
288b8c02 5264 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5265
5266 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5267
5daac39c 5268 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5269 if (flags & RXapif_ALL) {
5daac39c 5270 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5271 } else {
288b8c02 5272 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5273 if (sv) {
5274 SvREFCNT_dec(sv);
192b9cd1
AB
5275 return TRUE;
5276 } else {
5277 return FALSE;
5278 }
5279 }
5280 } else {
5281 return FALSE;
5282 }
5283}
5284
5285SV*
288b8c02 5286Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5287{
288b8c02 5288 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5289
5290 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5291
5daac39c
NC
5292 if ( rx && RXp_PAREN_NAMES(rx) ) {
5293 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5294
288b8c02 5295 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5296 } else {
5297 return FALSE;
5298 }
192b9cd1
AB
5299}
5300
5301SV*
288b8c02 5302Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5303{
288b8c02 5304 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5305 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5306
5307 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5308
5daac39c
NC
5309 if (rx && RXp_PAREN_NAMES(rx)) {
5310 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5311 HE *temphe;
5312 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5313 IV i;
5314 IV parno = 0;
5315 SV* sv_dat = HeVAL(temphe);
5316 I32 *nums = (I32*)SvPVX(sv_dat);
5317 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5318 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5319 rx->offs[nums[i]].start != -1 &&
5320 rx->offs[nums[i]].end != -1)
5321 {
5322 parno = nums[i];
5323 break;
5324 }
5325 }
f1b875a0 5326 if (parno || flags & RXapif_ALL) {
a663657d 5327 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5328 }
81714fb9
YO
5329 }
5330 }
44a2ac75
YO
5331 return NULL;
5332}
5333
192b9cd1 5334SV*
288b8c02 5335Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5336{
5337 SV *ret;
5338 AV *av;
5339 I32 length;
288b8c02 5340 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5341
7918f24d
NC
5342 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5343
5daac39c 5344 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5345 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5346 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5347 } else if (flags & RXapif_ONE) {
288b8c02 5348 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5349 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5350 length = av_len(av);
ec83ea38 5351 SvREFCNT_dec(ret);
192b9cd1
AB
5352 return newSViv(length + 1);
5353 } else {
5354 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5355 return NULL;
5356 }
5357 }
5358 return &PL_sv_undef;
5359}
5360
5361SV*
288b8c02 5362Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5363{
288b8c02 5364 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5365 AV *av = newAV();
5366
7918f24d
NC
5367 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5368
5daac39c
NC
5369 if (rx && RXp_PAREN_NAMES(rx)) {
5370 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5371 HE *temphe;
5372 (void)hv_iterinit(hv);
5373 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5374 IV i;
5375 IV parno = 0;
5376 SV* sv_dat = HeVAL(temphe);
5377 I32 *nums = (I32*)SvPVX(sv_dat);
5378 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5379 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5380 rx->offs[nums[i]].start != -1 &&
5381 rx->offs[nums[i]].end != -1)
5382 {
5383 parno = nums[i];
5384 break;
5385 }
5386 }
f1b875a0 5387 if (parno || flags & RXapif_ALL) {
a663657d 5388 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5389 }
5390 }
5391 }
5392
ad64d0ec 5393 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5394}
5395
49d7dfbc 5396void
288b8c02
NC
5397Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5398 SV * const sv)
44a2ac75 5399{
288b8c02 5400 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5401 char *s = NULL;
a9d504c3 5402 I32 i = 0;
44a2ac75 5403 I32 s1, t1;
7918f24d
NC
5404
5405 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5406
cde0cee5
YO
5407 if (!rx->subbeg) {
5408 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5409 return;
cde0cee5
YO
5410 }
5411 else
f1b875a0 5412 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5413 /* $` */
f0ab9afb 5414 i = rx->offs[0].start;
cde0cee5 5415 s = rx->subbeg;
44a2ac75
YO
5416 }
5417 else
f1b875a0 5418 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5419 /* $' */
f0ab9afb
NC
5420 s = rx->subbeg + rx->offs[0].end;
5421 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5422 }
5423 else
5424 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5425 (s1 = rx->offs[paren].start) != -1 &&
5426 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5427 {
5428 /* $& $1 ... */
5429 i = t1 - s1;
5430 s = rx->subbeg + s1;
cde0cee5
YO
5431 } else {
5432 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5433 return;
cde0cee5
YO
5434 }
5435 assert(rx->sublen >= (s - rx->subbeg) + i );
5436 if (i >= 0) {
5437 const int oldtainted = PL_tainted;
5438 TAINT_NOT;
5439 sv_setpvn(sv, s, i);
5440 PL_tainted = oldtainted;
5441 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5442 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5443 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5444 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5445 {
5446 SvUTF8_on(sv);
5447 }
5448 else
5449 SvUTF8_off(sv);
5450 if (PL_tainting) {
07bc277f 5451 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5452 if (SvTYPE(sv) >= SVt_PVMG) {
5453 MAGIC* const mg = SvMAGIC(sv);
5454 MAGIC* mgt;
5455 PL_tainted = 1;
5456 SvMAGIC_set(sv, mg->mg_moremagic);
5457 SvTAINT(sv);
5458 if ((mgt = SvMAGIC(sv))) {
5459 mg->mg_moremagic = mgt;
5460 SvMAGIC_set(sv, mg);
44a2ac75 5461 }
cde0cee5
YO
5462 } else {
5463 PL_tainted = 1;
5464 SvTAINT(sv);
5465 }
5466 } else
5467 SvTAINTED_off(sv);
44a2ac75 5468 }
81714fb9 5469 } else {
44a2ac75 5470 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5471 return;
81714fb9
YO
5472 }
5473}
93b32b6d 5474
2fdbfb4d
AB
5475void
5476Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5477 SV const * const value)
5478{
7918f24d
NC
5479 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5480
2fdbfb4d
AB
5481 PERL_UNUSED_ARG(rx);
5482 PERL_UNUSED_ARG(paren);
5483 PERL_UNUSED_ARG(value);
5484
5485 if (!PL_localizing)
6ad8f254 5486 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5487}
5488
5489I32
288b8c02 5490Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5491 const I32 paren)
5492{
288b8c02 5493 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5494 I32 i;
5495 I32 s1, t1;
5496
7918f24d
NC
5497 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5498
2fdbfb4d
AB
5499 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5500 switch (paren) {
192b9cd1 5501 /* $` / ${^PREMATCH} */
f1b875a0 5502 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5503 if (rx->offs[0].start != -1) {
5504 i = rx->offs[0].start;
5505 if (i > 0) {
5506 s1 = 0;
5507 t1 = i;
5508 goto getlen;
5509 }
5510 }
5511 return 0;
192b9cd1 5512 /* $' / ${^POSTMATCH} */
f1b875a0 5513 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5514 if (rx->offs[0].end != -1) {
5515 i = rx->sublen - rx->offs[0].end;
5516 if (i > 0) {
5517 s1 = rx->offs[0].end;
5518 t1 = rx->sublen;
5519 goto getlen;
5520 }
5521 }
5522 return 0;
192b9cd1
AB
5523 /* $& / ${^MATCH}, $1, $2, ... */
5524 default:
2fdbfb4d
AB
5525 if (paren <= (I32)rx->nparens &&
5526 (s1 = rx->offs[paren].start) != -1 &&
5527 (t1 = rx->offs[paren].end) != -1)
5528 {
5529 i = t1 - s1;
5530 goto getlen;
5531 } else {
5532 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5533 report_uninit((const SV *)sv);
2fdbfb4d
AB
5534 return 0;
5535 }
5536 }
5537 getlen:
07bc277f 5538 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5539 const char * const s = rx->subbeg + s1;
5540 const U8 *ep;
5541 STRLEN el;
5542
5543 i = t1 - s1;
5544 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5545 i = el;
5546 }
5547 return i;
5548}
5549
fe578d7f 5550SV*
49d7dfbc 5551Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5552{
7918f24d 5553 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5554 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5555 if (0)
5556 return NULL;
5557 else
5558 return newSVpvs("Regexp");
fe578d7f 5559}
0a4db386 5560
894be9b7 5561/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5562 * If flags is REG_RSN_RETURN_NULL returns null.
5563 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5564 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5565 * to the parsed name as looked up in the RExC_paren_names hash.
5566 * If there is an error throws a vFAIL().. type exception.
894be9b7 5567 */
0a4db386
YO
5568
5569#define REG_RSN_RETURN_NULL 0
5570#define REG_RSN_RETURN_NAME 1
5571#define REG_RSN_RETURN_DATA 2
5572
894be9b7 5573STATIC SV*
7918f24d
NC
5574S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5575{
894be9b7 5576 char *name_start = RExC_parse;
1f1031fe 5577
7918f24d
NC
5578 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5579
1f1031fe
YO
5580 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5581 /* skip IDFIRST by using do...while */
5582 if (UTF)
5583 do {
5584 RExC_parse += UTF8SKIP(RExC_parse);
5585 } while (isALNUM_utf8((U8*)RExC_parse));
5586 else
5587 do {
5588 RExC_parse++;
5589 } while (isALNUM(*RExC_parse));
894be9b7 5590 }
1f1031fe 5591
0a4db386 5592 if ( flags ) {
59cd0e26
NC
5593 SV* sv_name
5594 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5595 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5596 if ( flags == REG_RSN_RETURN_NAME)
5597 return sv_name;
5598 else if (flags==REG_RSN_RETURN_DATA) {
5599 HE *he_str = NULL;
5600 SV *sv_dat = NULL;
5601 if ( ! sv_name ) /* should not happen*/
5602 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5603 if (RExC_paren_names)
5604 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5605 if ( he_str )
5606 sv_dat = HeVAL(he_str);
5607 if ( ! sv_dat )
5608 vFAIL("Reference to nonexistent named group");
5609 return sv_dat;
5610 }
5611 else {
5612 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5613 }
5614 /* NOT REACHED */
894be9b7 5615 }
0a4db386 5616 return NULL;
894be9b7
YO
5617}
5618
3dab1dad
YO
5619#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5620 int rem=(int)(RExC_end - RExC_parse); \
5621 int cut; \
5622 int num; \
5623 int iscut=0; \
5624 if (rem>10) { \
5625 rem=10; \
5626 iscut=1; \
5627 } \
5628 cut=10-rem; \
5629 if (RExC_lastparse!=RExC_parse) \
5630 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5631 rem, RExC_parse, \
5632 cut + 4, \
5633 iscut ? "..." : "<" \
5634 ); \
5635 else \
5636 PerlIO_printf(Perl_debug_log,"%16s",""); \
5637 \
5638 if (SIZE_ONLY) \
3b57cd43 5639 num = RExC_size + 1; \
3dab1dad
YO
5640 else \
5641 num=REG_NODE_NUM(RExC_emit); \
5642 if (RExC_lastnum!=num) \
0a4db386 5643 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5644 else \
0a4db386 5645 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5646 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5647 (int)((depth*2)), "", \
3dab1dad
YO
5648 (funcname) \
5649 ); \
5650 RExC_lastnum=num; \
5651 RExC_lastparse=RExC_parse; \
5652})
5653
07be1b83
YO
5654
5655
3dab1dad
YO
5656#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5657 DEBUG_PARSE_MSG((funcname)); \
5658 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5659})
6bda09f9
YO
5660#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5661 DEBUG_PARSE_MSG((funcname)); \
5662 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5663})
a687059c
LW
5664/*
5665 - reg - regular expression, i.e. main body or parenthesized thing
5666 *
5667 * Caller must absorb opening parenthesis.
5668 *
5669 * Combining parenthesis handling with the base level of regular expression
5670 * is a trifle forced, but the need to tie the tails of the branches to what
5671 * follows makes it hard to avoid.
5672 */
07be1b83
YO
5673#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5674#ifdef DEBUGGING
5675#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5676#else
5677#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5678#endif
3dab1dad 5679
76e3520e 5680STATIC regnode *
3dab1dad 5681S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5682 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5683{
27da23d5 5684 dVAR;
c277df42
IZ
5685 register regnode *ret; /* Will be the head of the group. */
5686 register regnode *br;
5687 register regnode *lastbr;
cbbf8932 5688 register regnode *ender = NULL;
a0d0e21e 5689 register I32 parno = 0;
cbbf8932 5690 I32 flags;
f7819f85 5691 U32 oregflags = RExC_flags;
6136c704
AL
5692 bool have_branch = 0;
5693 bool is_open = 0;
594d7033
YO
5694 I32 freeze_paren = 0;
5695 I32 after_freeze = 0;
9d1d55b5
JP
5696
5697 /* for (?g), (?gc), and (?o) warnings; warning
5698 about (?c) will warn about (?g) -- japhy */
5699
6136c704
AL
5700#define WASTED_O 0x01
5701#define WASTED_G 0x02
5702#define WASTED_C 0x04
5703#define WASTED_GC (0x02|0x04)
cbbf8932 5704 I32 wastedflags = 0x00;
9d1d55b5 5705
fac92740 5706 char * parse_start = RExC_parse; /* MJD */
a28509cc 5707 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5708
3dab1dad 5709 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5710
5711 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5712 DEBUG_PARSE("reg ");
5713
821b33a5 5714 *flagp = 0; /* Tentatively. */
a0d0e21e 5715
9d1d55b5 5716
a0d0e21e
LW
5717 /* Make an OPEN node, if parenthesized. */
5718 if (paren) {
e2e6a0f1
YO
5719 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5720 char *start_verb = RExC_parse;
5721 STRLEN verb_len = 0;
5722 char *start_arg = NULL;
5723 unsigned char op = 0;
5724 int argok = 1;
5725 int internal_argval = 0; /* internal_argval is only useful if !argok */
5726 while ( *RExC_parse && *RExC_parse != ')' ) {
5727 if ( *RExC_parse == ':' ) {
5728 start_arg = RExC_parse + 1;
5729 break;
5730 }
5731 RExC_parse++;
5732 }
5733 ++start_verb;
5734 verb_len = RExC_parse - start_verb;
5735 if ( start_arg ) {
5736 RExC_parse++;
5737 while ( *RExC_parse && *RExC_parse != ')' )
5738 RExC_parse++;
5739 if ( *RExC_parse != ')' )
5740 vFAIL("Unterminated verb pattern argument");
5741 if ( RExC_parse == start_arg )
5742 start_arg = NULL;
5743 } else {
5744 if ( *RExC_parse != ')' )
5745 vFAIL("Unterminated verb pattern");
5746 }
5d458dd8 5747
e2e6a0f1
YO
5748 switch ( *start_verb ) {
5749 case 'A': /* (*ACCEPT) */
568a785a 5750 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5751 op = ACCEPT;
5752 internal_argval = RExC_nestroot;
5753 }
5754 break;
5755 case 'C': /* (*COMMIT) */
568a785a 5756 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5757 op = COMMIT;
e2e6a0f1
YO
5758 break;
5759 case 'F': /* (*FAIL) */
568a785a 5760 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5761 op = OPFAIL;
5762 argok = 0;
5763 }
5764 break;
5d458dd8
YO
5765 case ':': /* (*:NAME) */
5766 case 'M': /* (*MARK:NAME) */
568a785a 5767 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5768 op = MARKPOINT;
5d458dd8
YO
5769 argok = -1;
5770 }
5771 break;
5772 case 'P': /* (*PRUNE) */
568a785a 5773 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5774 op = PRUNE;
e2e6a0f1 5775 break;
5d458dd8 5776 case 'S': /* (*SKIP) */
568a785a 5777 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5778 op = SKIP;
5779 break;
5780 case 'T': /* (*THEN) */
5781 /* [19:06] <TimToady> :: is then */
568a785a 5782 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5783 op = CUTGROUP;
5784 RExC_seen |= REG_SEEN_CUTGROUP;
5785 }
e2e6a0f1
YO
5786 break;
5787 }
5788 if ( ! op ) {
5789 RExC_parse++;
5790 vFAIL3("Unknown verb pattern '%.*s'",
5791 verb_len, start_verb);
5792 }
5793 if ( argok ) {
5794 if ( start_arg && internal_argval ) {
5795 vFAIL3("Verb pattern '%.*s' may not have an argument",
5796 verb_len, start_verb);
5797 } else if ( argok < 0 && !start_arg ) {
5798 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5799 verb_len, start_verb);
5800 } else {
5801 ret = reganode(pRExC_state, op, internal_argval);
5802 if ( ! internal_argval && ! SIZE_ONLY ) {
5803 if (start_arg) {
5804 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5805 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5806 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5807 ret->flags = 0;
5808 } else {
5809 ret->flags = 1;
5810 }
5811 }
5812 }
5813 if (!internal_argval)
5814 RExC_seen |= REG_SEEN_VERBARG;
5815 } else if ( start_arg ) {
5816 vFAIL3("Verb pattern '%.*s' may not have an argument",
5817 verb_len, start_verb);
5818 } else {
5819 ret = reg_node(pRExC_state, op);
5820 }
5821 nextchar(pRExC_state);
5822 return ret;
5823 } else
fac92740 5824 if (*RExC_parse == '?') { /* (?...) */
6136c704 5825 bool is_logical = 0;
a28509cc 5826 const char * const seqstart = RExC_parse;
fb85c044 5827 bool has_use_defaults = FALSE;
ca9dfc88 5828
830247a4
IZ
5829 RExC_parse++;
5830 paren = *RExC_parse++;
c277df42 5831 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5832 switch (paren) {
894be9b7 5833
1f1031fe
YO
5834 case 'P': /* (?P...) variants for those used to PCRE/Python */
5835 paren = *RExC_parse++;
5836 if ( paren == '<') /* (?P<...>) named capture */
5837 goto named_capture;
5838 else if (paren == '>') { /* (?P>name) named recursion */
5839 goto named_recursion;
5840 }
5841 else if (paren == '=') { /* (?P=...) named backref */
5842 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5843 you change this make sure you change that */
5844 char* name_start = RExC_parse;
5845 U32 num = 0;
5846 SV *sv_dat = reg_scan_name(pRExC_state,
5847 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5848 if (RExC_parse == name_start || *RExC_parse != ')')
5849 vFAIL2("Sequence %.3s... not terminated",parse_start);
5850
5851 if (!SIZE_ONLY) {
5852 num = add_data( pRExC_state, 1, "S" );
5853 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5854 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5855 }
5856 RExC_sawback = 1;
4444fd9f
KW
5857 ret = reganode(pRExC_state,
5858 ((! FOLD)
5859 ? NREF
5860 : (UNI_SEMANTICS)
5861 ? NREFFU
5862 : (LOC)
5863 ? NREFFL
5864 : NREFF),
5865 num);
1f1031fe
YO
5866 *flagp |= HASWIDTH;
5867
5868 Set_Node_Offset(ret, parse_start+1);
5869 Set_Node_Cur_Length(ret); /* MJD */
5870
5871 nextchar(pRExC_state);
5872 return ret;
5873 }
57b84237
YO
5874 RExC_parse++;
5875 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876 /*NOTREACHED*/
5877 case '<': /* (?<...) */
b81d288d 5878 if (*RExC_parse == '!')
c277df42 5879 paren = ',';
0a4db386 5880 else if (*RExC_parse != '=')
1f1031fe 5881 named_capture:
0a4db386 5882 { /* (?<...>) */
81714fb9 5883 char *name_start;
894be9b7 5884 SV *svname;
81714fb9
YO
5885 paren= '>';
5886 case '\'': /* (?'...') */
5887 name_start= RExC_parse;
0a4db386
YO
5888 svname = reg_scan_name(pRExC_state,
5889 SIZE_ONLY ? /* reverse test from the others */
5890 REG_RSN_RETURN_NAME :
5891 REG_RSN_RETURN_NULL);
57b84237
YO
5892 if (RExC_parse == name_start) {
5893 RExC_parse++;
5894 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5895 /*NOTREACHED*/
5896 }
81714fb9
YO
5897 if (*RExC_parse != paren)
5898 vFAIL2("Sequence (?%c... not terminated",
5899 paren=='>' ? '<' : paren);
5900 if (SIZE_ONLY) {
e62cc96a
YO
5901 HE *he_str;
5902 SV *sv_dat = NULL;
894be9b7
YO
5903 if (!svname) /* shouldnt happen */
5904 Perl_croak(aTHX_
5905 "panic: reg_scan_name returned NULL");
81714fb9
YO
5906 if (!RExC_paren_names) {
5907 RExC_paren_names= newHV();
ad64d0ec 5908 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5909#ifdef DEBUGGING
5910 RExC_paren_name_list= newAV();
ad64d0ec 5911 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5912#endif
81714fb9
YO
5913 }
5914 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5915 if ( he_str )
81714fb9 5916 sv_dat = HeVAL(he_str);
e62cc96a 5917 if ( ! sv_dat ) {
81714fb9 5918 /* croak baby croak */
e62cc96a
YO
5919 Perl_croak(aTHX_
5920 "panic: paren_name hash element allocation failed");
5921 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5922 /* (?|...) can mean we have dupes so scan to check
5923 its already been stored. Maybe a flag indicating
5924 we are inside such a construct would be useful,
5925 but the arrays are likely to be quite small, so
5926 for now we punt -- dmq */
5927 IV count = SvIV(sv_dat);
5928 I32 *pv = (I32*)SvPVX(sv_dat);
5929 IV i;
5930 for ( i = 0 ; i < count ; i++ ) {
5931 if ( pv[i] == RExC_npar ) {
5932 count = 0;
5933 break;
5934 }
5935 }
5936 if ( count ) {
5937 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5938 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5939 pv[count] = RExC_npar;
3a92e6ae 5940 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5941 }
81714fb9
YO
5942 } else {
5943 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5944 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5945 SvIOK_on(sv_dat);
3ec35e0f 5946 SvIV_set(sv_dat, 1);
e62cc96a 5947 }
1f1031fe
YO
5948#ifdef DEBUGGING
5949 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5950 SvREFCNT_dec(svname);
5951#endif
e62cc96a 5952
81714fb9
YO
5953 /*sv_dump(sv_dat);*/
5954 }
5955 nextchar(pRExC_state);
5956 paren = 1;
5957 goto capturing_parens;
5958 }
5959 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5960 RExC_parse++;
fac92740 5961 case '=': /* (?=...) */
89c6a13e 5962 RExC_seen_zerolen++;
5c3fa2e7 5963 break;
fac92740 5964 case '!': /* (?!...) */
830247a4 5965 RExC_seen_zerolen++;
e2e6a0f1
YO
5966 if (*RExC_parse == ')') {
5967 ret=reg_node(pRExC_state, OPFAIL);
5968 nextchar(pRExC_state);
5969 return ret;
5970 }
594d7033
YO
5971 break;
5972 case '|': /* (?|...) */
5973 /* branch reset, behave like a (?:...) except that
5974 buffers in alternations share the same numbers */
5975 paren = ':';
5976 after_freeze = freeze_paren = RExC_npar;
5977 break;
fac92740
MJD
5978 case ':': /* (?:...) */
5979 case '>': /* (?>...) */
a0d0e21e 5980 break;
fac92740
MJD
5981 case '$': /* (?$...) */
5982 case '@': /* (?@...) */
8615cb43 5983 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5984 break;
fac92740 5985 case '#': /* (?#...) */
830247a4
IZ
5986 while (*RExC_parse && *RExC_parse != ')')
5987 RExC_parse++;
5988 if (*RExC_parse != ')')
c277df42 5989 FAIL("Sequence (?#... not terminated");
830247a4 5990 nextchar(pRExC_state);
a0d0e21e
LW
5991 *flagp = TRYAGAIN;
5992 return NULL;
894be9b7
YO
5993 case '0' : /* (?0) */
5994 case 'R' : /* (?R) */
5995 if (*RExC_parse != ')')
6bda09f9 5996 FAIL("Sequence (?R) not terminated");
1a147d38 5997 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5998 *flagp |= POSTPONED;
7f69552c
YO
5999 nextchar(pRExC_state);
6000 return ret;
6001 /*notreached*/
894be9b7
YO
6002 { /* named and numeric backreferences */
6003 I32 num;
894be9b7
YO
6004 case '&': /* (?&NAME) */
6005 parse_start = RExC_parse - 1;
1f1031fe 6006 named_recursion:
894be9b7 6007 {
0a4db386
YO
6008 SV *sv_dat = reg_scan_name(pRExC_state,
6009 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6010 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6011 }
6012 goto gen_recurse_regop;
6013 /* NOT REACHED */
542fa716
YO
6014 case '+':
6015 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6016 RExC_parse++;
6017 vFAIL("Illegal pattern");
6018 }
6019 goto parse_recursion;
6020 /* NOT REACHED*/
6021 case '-': /* (?-1) */
6022 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6023 RExC_parse--; /* rewind to let it be handled later */
6024 goto parse_flags;
6025 }
6026 /*FALLTHROUGH */
6bda09f9
YO
6027 case '1': case '2': case '3': case '4': /* (?1) */
6028 case '5': case '6': case '7': case '8': case '9':
6029 RExC_parse--;
542fa716 6030 parse_recursion:
894be9b7
YO
6031 num = atoi(RExC_parse);
6032 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6033 if (*RExC_parse == '-')
6034 RExC_parse++;
6bda09f9
YO
6035 while (isDIGIT(*RExC_parse))
6036 RExC_parse++;
6037 if (*RExC_parse!=')')
6038 vFAIL("Expecting close bracket");
894be9b7
YO
6039
6040 gen_recurse_regop:
542fa716
YO
6041 if ( paren == '-' ) {
6042 /*
6043 Diagram of capture buffer numbering.
6044 Top line is the normal capture buffer numbers
3b753521 6045 Bottom line is the negative indexing as from
542fa716
YO
6046 the X (the (?-2))
6047
6048 + 1 2 3 4 5 X 6 7
6049 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6050 - 5 4 3 2 1 X x x
6051
6052 */
6053 num = RExC_npar + num;
6054 if (num < 1) {
6055 RExC_parse++;
6056 vFAIL("Reference to nonexistent group");
6057 }
6058 } else if ( paren == '+' ) {
6059 num = RExC_npar + num - 1;
6060 }
6061
1a147d38 6062 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6063 if (!SIZE_ONLY) {
6064 if (num > (I32)RExC_rx->nparens) {
6065 RExC_parse++;
6066 vFAIL("Reference to nonexistent group");
6067 }
40d049e4 6068 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6069 RExC_emit++;
226de585 6070 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6071 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6072 } else {
6bda09f9 6073 RExC_size++;
6bda09f9 6074 }
0a4db386 6075 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6076 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6077 Set_Node_Offset(ret, parse_start); /* MJD */
6078
a3b492c3 6079 *flagp |= POSTPONED;
6bda09f9
YO
6080 nextchar(pRExC_state);
6081 return ret;
894be9b7
YO
6082 } /* named and numeric backreferences */
6083 /* NOT REACHED */
6084
fac92740 6085 case '?': /* (??...) */
6136c704 6086 is_logical = 1;
57b84237
YO
6087 if (*RExC_parse != '{') {
6088 RExC_parse++;
6089 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6090 /*NOTREACHED*/
6091 }
a3b492c3 6092 *flagp |= POSTPONED;
830247a4 6093 paren = *RExC_parse++;
0f5d15d6 6094 /* FALL THROUGH */
fac92740 6095 case '{': /* (?{...}) */
c277df42 6096 {
2eccd3b2
NC
6097 I32 count = 1;
6098 U32 n = 0;
c277df42 6099 char c;
830247a4 6100 char *s = RExC_parse;
c277df42 6101
830247a4
IZ
6102 RExC_seen_zerolen++;
6103 RExC_seen |= REG_SEEN_EVAL;
6104 while (count && (c = *RExC_parse)) {
6136c704
AL
6105 if (c == '\\') {
6106 if (RExC_parse[1])
6107 RExC_parse++;
6108 }
b81d288d 6109 else if (c == '{')
c277df42 6110 count++;
b81d288d 6111 else if (c == '}')
c277df42 6112 count--;
830247a4 6113 RExC_parse++;
c277df42 6114 }
6136c704 6115 if (*RExC_parse != ')') {
b81d288d 6116 RExC_parse = s;
b45f050a
JF
6117 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6118 }
c277df42 6119 if (!SIZE_ONLY) {
f3548bdc 6120 PAD *pad;
6136c704
AL
6121 OP_4tree *sop, *rop;
6122 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6123
569233ed
SB
6124 ENTER;
6125 Perl_save_re_context(aTHX);
d59a8b3e 6126 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6127 sop->op_private |= OPpREFCOUNTED;
6128 /* re_dup will OpREFCNT_inc */
6129 OpREFCNT_set(sop, 1);
569233ed 6130 LEAVE;
c277df42 6131
830247a4 6132 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6133 RExC_rxi->data->data[n] = (void*)rop;
6134 RExC_rxi->data->data[n+1] = (void*)sop;
6135 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6136 SvREFCNT_dec(sv);
a0ed51b3 6137 }
e24b16f9 6138 else { /* First pass */
830247a4 6139 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6140 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6141 /* No compiled RE interpolated, has runtime
6142 components ===> unsafe. */
6143 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6144 if (PL_tainting && PL_tainted)
cc6b7395 6145 FAIL("Eval-group in insecure regular expression");
54df2634 6146#if PERL_VERSION > 8
923e4eb5 6147 if (IN_PERL_COMPILETIME)
b5c19bd7 6148 PL_cv_has_eval = 1;
54df2634 6149#endif
c277df42 6150 }
b5c19bd7 6151
830247a4 6152 nextchar(pRExC_state);
6136c704 6153 if (is_logical) {
830247a4 6154 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6155 if (!SIZE_ONLY)
6156 ret->flags = 2;
3dab1dad 6157 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6158 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6159 return ret;
6160 }
ccb2c380
MP
6161 ret = reganode(pRExC_state, EVAL, n);
6162 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6163 Set_Node_Offset(ret, parse_start);
6164 return ret;
c277df42 6165 }
fac92740 6166 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6167 {
0a4db386 6168 int is_define= 0;
fac92740 6169 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6170 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6171 || RExC_parse[1] == '<'
830247a4 6172 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6173 I32 flag;
6174
830247a4 6175 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6176 if (!SIZE_ONLY)
6177 ret->flags = 1;
3dab1dad 6178 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6179 goto insert_if;
b81d288d 6180 }
a0ed51b3 6181 }
0a4db386
YO
6182 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6183 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6184 {
6185 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6186 char *name_start= RExC_parse++;
2eccd3b2 6187 U32 num = 0;
0a4db386
YO
6188 SV *sv_dat=reg_scan_name(pRExC_state,
6189 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6190 if (RExC_parse == name_start || *RExC_parse != ch)
6191 vFAIL2("Sequence (?(%c... not terminated",
6192 (ch == '>' ? '<' : ch));
6193 RExC_parse++;
6194 if (!SIZE_ONLY) {
6195 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6196 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6197 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6198 }
6199 ret = reganode(pRExC_state,NGROUPP,num);
6200 goto insert_if_check_paren;
6201 }
6202 else if (RExC_parse[0] == 'D' &&
6203 RExC_parse[1] == 'E' &&
6204 RExC_parse[2] == 'F' &&
6205 RExC_parse[3] == 'I' &&
6206 RExC_parse[4] == 'N' &&
6207 RExC_parse[5] == 'E')
6208 {
6209 ret = reganode(pRExC_state,DEFINEP,0);
6210 RExC_parse +=6 ;
6211 is_define = 1;
6212 goto insert_if_check_paren;
6213 }
6214 else if (RExC_parse[0] == 'R') {
6215 RExC_parse++;
6216 parno = 0;
6217 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6218 parno = atoi(RExC_parse++);
6219 while (isDIGIT(*RExC_parse))
6220 RExC_parse++;
6221 } else if (RExC_parse[0] == '&') {
6222 SV *sv_dat;
6223 RExC_parse++;
6224 sv_dat = reg_scan_name(pRExC_state,
6225 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6226 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6227 }
1a147d38 6228 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6229 goto insert_if_check_paren;
6230 }
830247a4 6231 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6232 /* (?(1)...) */
6136c704 6233 char c;
830247a4 6234 parno = atoi(RExC_parse++);
c277df42 6235
830247a4
IZ
6236 while (isDIGIT(*RExC_parse))
6237 RExC_parse++;
fac92740 6238 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6239
0a4db386 6240 insert_if_check_paren:
830247a4 6241 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6242 vFAIL("Switch condition not recognized");
c277df42 6243 insert_if:
3dab1dad
YO
6244 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6245 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6246 if (br == NULL)
830247a4 6247 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6248 else
3dab1dad 6249 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6250 c = *nextchar(pRExC_state);
d1b80229
IZ
6251 if (flags&HASWIDTH)
6252 *flagp |= HASWIDTH;
c277df42 6253 if (c == '|') {
0a4db386
YO
6254 if (is_define)
6255 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6256 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6257 regbranch(pRExC_state, &flags, 1,depth+1);
6258 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6259 if (flags&HASWIDTH)
6260 *flagp |= HASWIDTH;
830247a4 6261 c = *nextchar(pRExC_state);
a0ed51b3
LW
6262 }
6263 else
c277df42
IZ
6264 lastbr = NULL;
6265 if (c != ')')
8615cb43 6266 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6267 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6268 REGTAIL(pRExC_state, br, ender);
c277df42 6269 if (lastbr) {
3dab1dad
YO
6270 REGTAIL(pRExC_state, lastbr, ender);
6271 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6272 }
6273 else
3dab1dad 6274 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6275 RExC_size++; /* XXX WHY do we need this?!!
6276 For large programs it seems to be required
6277 but I can't figure out why. -- dmq*/
c277df42 6278 return ret;
a0ed51b3
LW
6279 }
6280 else {
830247a4 6281 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6282 }
6283 }
1b1626e4 6284 case 0:
830247a4 6285 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6286 vFAIL("Sequence (? incomplete");
1b1626e4 6287 break;
85508812
KW
6288 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6289 that follow */
fb85c044
KW
6290 has_use_defaults = TRUE;
6291 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7b98bc43
KW
6292 if (RExC_utf8) { /* But the default for a utf8 pattern is
6293 unicode semantics */
6294 RExC_flags |= RXf_PMf_UNICODE;
6295 }
fb85c044 6296 goto parse_flags;
a0d0e21e 6297 default:
cde0cee5
YO
6298 --RExC_parse;
6299 parse_flags: /* (?i) */
6300 {
6301 U32 posflags = 0, negflags = 0;
6302 U32 *flagsp = &posflags;
9de15fec 6303 bool has_charset_modifier = 0;
cde0cee5
YO
6304
6305 while (*RExC_parse) {
6306 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6307 /* (?g), (?gc) and (?o) are useless here
6308 and must be globally applied -- japhy */
cde0cee5
YO
6309 switch (*RExC_parse) {
6310 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6311 case LOCALE_PAT_MOD:
6312 if (has_charset_modifier || flagsp == &negflags) {
6313 goto fail_modifiers;
6314 }
8cc86590
KW
6315 posflags |= RXf_PMf_LOCALE;
6316 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6317 has_charset_modifier = 1;
6318 break;
6319 case UNICODE_PAT_MOD:
6320 if (has_charset_modifier || flagsp == &negflags) {
6321 goto fail_modifiers;
6322 }
8cc86590
KW
6323 posflags |= RXf_PMf_UNICODE;
6324 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6325 has_charset_modifier = 1;
6326 break;
6327 case DUAL_PAT_MOD:
6328 if (has_use_defaults
6329 || has_charset_modifier
6330 || flagsp == &negflags)
6331 {
6332 goto fail_modifiers;
6333 }
7b98bc43
KW
6334
6335 /* The dual charset means unicode semantics if the
6336 * pattern (or target, not known until runtime) are
6337 * utf8 */
6338 if (RExC_utf8) {
6339 posflags |= RXf_PMf_UNICODE;
6340 negflags |= RXf_PMf_LOCALE;
6341 }
6342 else {
6343 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6344 }
9de15fec
KW
6345 has_charset_modifier = 1;
6346 break;
f7819f85
A
6347 case ONCE_PAT_MOD: /* 'o' */
6348 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6349 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6350 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6351 if (! (wastedflags & wflagbit) ) {
6352 wastedflags |= wflagbit;
6353 vWARN5(
6354 RExC_parse + 1,
6355 "Useless (%s%c) - %suse /%c modifier",
6356 flagsp == &negflags ? "?-" : "?",
6357 *RExC_parse,
6358 flagsp == &negflags ? "don't " : "",
6359 *RExC_parse
6360 );
6361 }
6362 }
cde0cee5
YO
6363 break;
6364
f7819f85 6365 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6366 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6367 if (! (wastedflags & WASTED_C) ) {
6368 wastedflags |= WASTED_GC;
9d1d55b5
JP
6369 vWARN3(
6370 RExC_parse + 1,
6371 "Useless (%sc) - %suse /gc modifier",
6372 flagsp == &negflags ? "?-" : "?",
6373 flagsp == &negflags ? "don't " : ""
6374 );
6375 }
6376 }
cde0cee5 6377 break;
f7819f85 6378 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6379 if (flagsp == &negflags) {
668c081a
NC
6380 if (SIZE_ONLY)
6381 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6382 } else {
6383 *flagsp |= RXf_PMf_KEEPCOPY;
6384 }
6385 break;
6386 case '-':
3b753521 6387 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6388 * if there is a minus, it means will be trying to
6389 * re-specify a default which is an error */
6390 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6391 fail_modifiers:
57b84237
YO
6392 RExC_parse++;
6393 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6394 /*NOTREACHED*/
6395 }
cde0cee5
YO
6396 flagsp = &negflags;
6397 wastedflags = 0; /* reset so (?g-c) warns twice */
6398 break;
6399 case ':':
6400 paren = ':';
6401 /*FALLTHROUGH*/
6402 case ')':
6403 RExC_flags |= posflags;
6404 RExC_flags &= ~negflags;
f7819f85
A
6405 if (paren != ':') {
6406 oregflags |= posflags;
6407 oregflags &= ~negflags;
6408 }
cde0cee5
YO
6409 nextchar(pRExC_state);
6410 if (paren != ':') {
6411 *flagp = TRYAGAIN;
6412 return NULL;
6413 } else {
6414 ret = NULL;
6415 goto parse_rest;
6416 }
6417 /*NOTREACHED*/
6418 default:
cde0cee5
YO
6419 RExC_parse++;
6420 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6421 /*NOTREACHED*/
6422 }
830247a4 6423 ++RExC_parse;
48c036b1 6424 }
cde0cee5 6425 }} /* one for the default block, one for the switch */
a0d0e21e 6426 }
fac92740 6427 else { /* (...) */
81714fb9 6428 capturing_parens:
830247a4
IZ
6429 parno = RExC_npar;
6430 RExC_npar++;
e2e6a0f1 6431
830247a4 6432 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6433 if (!SIZE_ONLY ){
6434 if (!RExC_nestroot)
6435 RExC_nestroot = parno;
c009da3d
YO
6436 if (RExC_seen & REG_SEEN_RECURSE
6437 && !RExC_open_parens[parno-1])
6438 {
e2e6a0f1 6439 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6440 "Setting open paren #%"IVdf" to %d\n",
6441 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6442 RExC_open_parens[parno-1]= ret;
6443 }
6bda09f9 6444 }
fac92740
MJD
6445 Set_Node_Length(ret, 1); /* MJD */
6446 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6447 is_open = 1;
a0d0e21e 6448 }
a0ed51b3 6449 }
fac92740 6450 else /* ! paren */
a0d0e21e 6451 ret = NULL;
cde0cee5
YO
6452
6453 parse_rest:
a0d0e21e 6454 /* Pick up the branches, linking them together. */
fac92740 6455 parse_start = RExC_parse; /* MJD */
3dab1dad 6456 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6457
6458 if (freeze_paren) {
6459 if (RExC_npar > after_freeze)
6460 after_freeze = RExC_npar;
6461 RExC_npar = freeze_paren;
6462 }
6463
fac92740 6464 /* branch_len = (paren != 0); */
2af232bd 6465
a0d0e21e
LW
6466 if (br == NULL)
6467 return(NULL);
830247a4
IZ
6468 if (*RExC_parse == '|') {
6469 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6470 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6471 }
fac92740 6472 else { /* MJD */
6bda09f9 6473 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6474 Set_Node_Length(br, paren != 0);
6475 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6476 }
c277df42
IZ
6477 have_branch = 1;
6478 if (SIZE_ONLY)
830247a4 6479 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6480 }
6481 else if (paren == ':') {
c277df42
IZ
6482 *flagp |= flags&SIMPLE;
6483 }
6136c704 6484 if (is_open) { /* Starts with OPEN. */
3dab1dad 6485 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6486 }
6487 else if (paren != '?') /* Not Conditional */
a0d0e21e 6488 ret = br;
8ae10a67 6489 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6490 lastbr = br;
830247a4
IZ
6491 while (*RExC_parse == '|') {
6492 if (!SIZE_ONLY && RExC_extralen) {
6493 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6494 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6495 }
6496 if (SIZE_ONLY)
830247a4
IZ
6497 RExC_extralen += 2; /* Account for LONGJMP. */
6498 nextchar(pRExC_state);
594d7033
YO
6499 if (freeze_paren) {
6500 if (RExC_npar > after_freeze)
6501 after_freeze = RExC_npar;
6502 RExC_npar = freeze_paren;
6503 }
3dab1dad 6504 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6505
a687059c 6506 if (br == NULL)
a0d0e21e 6507 return(NULL);
3dab1dad 6508 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6509 lastbr = br;
8ae10a67 6510 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6511 }
6512
c277df42
IZ
6513 if (have_branch || paren != ':') {
6514 /* Make a closing node, and hook it on the end. */
6515 switch (paren) {
6516 case ':':
830247a4 6517 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6518 break;
6519 case 1:
830247a4 6520 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6521 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6522 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6523 "Setting close paren #%"IVdf" to %d\n",
6524 (IV)parno, REG_NODE_NUM(ender)));
6525 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6526 if (RExC_nestroot == parno)
6527 RExC_nestroot = 0;
40d049e4 6528 }
fac92740
MJD
6529 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6530 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6531 break;
6532 case '<':
c277df42
IZ
6533 case ',':
6534 case '=':
6535 case '!':
c277df42 6536 *flagp &= ~HASWIDTH;
821b33a5
IZ
6537 /* FALL THROUGH */
6538 case '>':
830247a4 6539 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6540 break;
6541 case 0:
830247a4 6542 ender = reg_node(pRExC_state, END);
40d049e4
YO
6543 if (!SIZE_ONLY) {
6544 assert(!RExC_opend); /* there can only be one! */
6545 RExC_opend = ender;
6546 }
c277df42
IZ
6547 break;
6548 }
eaf3ca90 6549 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6550
9674d46a 6551 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6552 if (depth==1)
6553 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6554
c277df42 6555 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6556 for (br = ret; br; br = regnext(br)) {
6557 const U8 op = PL_regkind[OP(br)];
6558 if (op == BRANCH) {
07be1b83 6559 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6560 }
6561 else if (op == BRANCHJ) {
07be1b83 6562 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6563 }
c277df42
IZ
6564 }
6565 }
a0d0e21e 6566 }
c277df42
IZ
6567
6568 {
e1ec3a88
AL
6569 const char *p;
6570 static const char parens[] = "=!<,>";
c277df42
IZ
6571
6572 if (paren && (p = strchr(parens, paren))) {
eb160463 6573 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6574 int flag = (p - parens) > 1;
6575
6576 if (paren == '>')
6577 node = SUSPEND, flag = 0;
6bda09f9 6578 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6579 Set_Node_Cur_Length(ret);
6580 Set_Node_Offset(ret, parse_start + 1);
c277df42 6581 ret->flags = flag;
07be1b83 6582 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6583 }
a0d0e21e
LW
6584 }
6585
6586 /* Check for proper termination. */
ce3e6498 6587 if (paren) {
e2509266 6588 RExC_flags = oregflags;
830247a4
IZ
6589 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6590 RExC_parse = oregcomp_parse;
380a0633 6591 vFAIL("Unmatched (");
ce3e6498 6592 }
a0ed51b3 6593 }
830247a4
IZ
6594 else if (!paren && RExC_parse < RExC_end) {
6595 if (*RExC_parse == ')') {
6596 RExC_parse++;
380a0633 6597 vFAIL("Unmatched )");
a0ed51b3
LW
6598 }
6599 else
b45f050a 6600 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6601 /* NOTREACHED */
6602 }
594d7033
YO
6603 if (after_freeze)
6604 RExC_npar = after_freeze;
a0d0e21e 6605 return(ret);
a687059c
LW
6606}
6607
6608/*
6609 - regbranch - one alternative of an | operator
6610 *
6611 * Implements the concatenation operator.
6612 */
76e3520e 6613STATIC regnode *
3dab1dad 6614S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6615{
97aff369 6616 dVAR;
c277df42
IZ
6617 register regnode *ret;
6618 register regnode *chain = NULL;
6619 register regnode *latest;
6620 I32 flags = 0, c = 0;
3dab1dad 6621 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6622
6623 PERL_ARGS_ASSERT_REGBRANCH;
6624
3dab1dad 6625 DEBUG_PARSE("brnc");
02daf0ab 6626
b81d288d 6627 if (first)
c277df42
IZ
6628 ret = NULL;
6629 else {
b81d288d 6630 if (!SIZE_ONLY && RExC_extralen)
830247a4 6631 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6632 else {
830247a4 6633 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6634 Set_Node_Length(ret, 1);
6635 }
c277df42
IZ
6636 }
6637
b81d288d 6638 if (!first && SIZE_ONLY)
830247a4 6639 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6640
c277df42 6641 *flagp = WORST; /* Tentatively. */
a0d0e21e 6642
830247a4
IZ
6643 RExC_parse--;
6644 nextchar(pRExC_state);
6645 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6646 flags &= ~TRYAGAIN;
3dab1dad 6647 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6648 if (latest == NULL) {
6649 if (flags & TRYAGAIN)
6650 continue;
6651 return(NULL);
a0ed51b3
LW
6652 }
6653 else if (ret == NULL)
c277df42 6654 ret = latest;
8ae10a67 6655 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6656 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6657 *flagp |= flags&SPSTART;
6658 else {
830247a4 6659 RExC_naughty++;
3dab1dad 6660 REGTAIL(pRExC_state, chain, latest);
a687059c 6661 }
a0d0e21e 6662 chain = latest;
c277df42
IZ
6663 c++;
6664 }
6665 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6666 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6667 if (ret == NULL)
6668 ret = chain;
6669 }
6670 if (c == 1) {
6671 *flagp |= flags&SIMPLE;
a0d0e21e 6672 }
a687059c 6673
d4c19fe8 6674 return ret;
a687059c
LW
6675}
6676
6677/*
6678 - regpiece - something followed by possible [*+?]
6679 *
6680 * Note that the branching code sequences used for ? and the general cases
6681 * of * and + are somewhat optimized: they use the same NOTHING node as
6682 * both the endmarker for their branch list and the body of the last branch.
6683 * It might seem that this node could be dispensed with entirely, but the
6684 * endmarker role is not redundant.
6685 */
76e3520e 6686STATIC regnode *
3dab1dad 6687S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6688{
97aff369 6689 dVAR;
c277df42 6690 register regnode *ret;
a0d0e21e
LW
6691 register char op;
6692 register char *next;
6693 I32 flags;
1df70142 6694 const char * const origparse = RExC_parse;
a0d0e21e 6695 I32 min;
c277df42 6696 I32 max = REG_INFTY;
fac92740 6697 char *parse_start;
10edeb5d 6698 const char *maxpos = NULL;
3dab1dad 6699 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6700
6701 PERL_ARGS_ASSERT_REGPIECE;
6702
3dab1dad 6703 DEBUG_PARSE("piec");
a0d0e21e 6704
3dab1dad 6705 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6706 if (ret == NULL) {
6707 if (flags & TRYAGAIN)
6708 *flagp |= TRYAGAIN;
6709 return(NULL);
6710 }
6711
830247a4 6712 op = *RExC_parse;
a0d0e21e 6713
830247a4 6714 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6715 maxpos = NULL;
fac92740 6716 parse_start = RExC_parse; /* MJD */
830247a4 6717 next = RExC_parse + 1;
a0d0e21e
LW
6718 while (isDIGIT(*next) || *next == ',') {
6719 if (*next == ',') {
6720 if (maxpos)
6721 break;
6722 else
6723 maxpos = next;
a687059c 6724 }
a0d0e21e
LW
6725 next++;
6726 }
6727 if (*next == '}') { /* got one */
6728 if (!maxpos)
6729 maxpos = next;
830247a4
IZ
6730 RExC_parse++;
6731 min = atoi(RExC_parse);
a0d0e21e
LW
6732 if (*maxpos == ',')
6733 maxpos++;
6734 else
830247a4 6735 maxpos = RExC_parse;
a0d0e21e
LW
6736 max = atoi(maxpos);
6737 if (!max && *maxpos != '0')
c277df42
IZ
6738 max = REG_INFTY; /* meaning "infinity" */
6739 else if (max >= REG_INFTY)
8615cb43 6740 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6741 RExC_parse = next;
6742 nextchar(pRExC_state);
a0d0e21e
LW
6743
6744 do_curly:
6745 if ((flags&SIMPLE)) {
830247a4 6746 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6747 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6748 Set_Node_Offset(ret, parse_start+1); /* MJD */
6749 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6750 }
6751 else {
3dab1dad 6752 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6753
6754 w->flags = 0;
3dab1dad 6755 REGTAIL(pRExC_state, ret, w);
830247a4 6756 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6757 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6758 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6759 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6760 }
6bda09f9 6761 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6762 /* MJD hk */
6763 Set_Node_Offset(ret, parse_start+1);
2af232bd 6764 Set_Node_Length(ret,
fac92740 6765 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6766
830247a4 6767 if (!SIZE_ONLY && RExC_extralen)
c277df42 6768 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6769 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6770 if (SIZE_ONLY)
830247a4
IZ
6771 RExC_whilem_seen++, RExC_extralen += 3;
6772 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6773 }
c277df42 6774 ret->flags = 0;
a0d0e21e
LW
6775
6776 if (min > 0)
821b33a5
IZ
6777 *flagp = WORST;
6778 if (max > 0)
6779 *flagp |= HASWIDTH;
8fa23287 6780 if (max < min)
8615cb43 6781 vFAIL("Can't do {n,m} with n > m");
c277df42 6782 if (!SIZE_ONLY) {
eb160463
GS
6783 ARG1_SET(ret, (U16)min);
6784 ARG2_SET(ret, (U16)max);
a687059c 6785 }
a687059c 6786
a0d0e21e 6787 goto nest_check;
a687059c 6788 }
a0d0e21e 6789 }
a687059c 6790
a0d0e21e
LW
6791 if (!ISMULT1(op)) {
6792 *flagp = flags;
a687059c 6793 return(ret);
a0d0e21e 6794 }
bb20fd44 6795
c277df42 6796#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6797
6798 /* if this is reinstated, don't forget to put this back into perldiag:
6799
6800 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6801
6802 (F) The part of the regexp subject to either the * or + quantifier
6803 could match an empty string. The {#} shows in the regular
6804 expression about where the problem was discovered.
6805
6806 */
6807
bb20fd44 6808 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6809 vFAIL("Regexp *+ operand could be empty");
b81d288d 6810#endif
bb20fd44 6811
fac92740 6812 parse_start = RExC_parse;
830247a4 6813 nextchar(pRExC_state);
a0d0e21e 6814
821b33a5 6815 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6816
6817 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6818 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6819 ret->flags = 0;
830247a4 6820 RExC_naughty += 4;
a0d0e21e
LW
6821 }
6822 else if (op == '*') {
6823 min = 0;
6824 goto do_curly;
a0ed51b3
LW
6825 }
6826 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6827 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6828 ret->flags = 0;
830247a4 6829 RExC_naughty += 3;
a0d0e21e
LW
6830 }
6831 else if (op == '+') {
6832 min = 1;
6833 goto do_curly;
a0ed51b3
LW
6834 }
6835 else if (op == '?') {
a0d0e21e
LW
6836 min = 0; max = 1;
6837 goto do_curly;
6838 }
6839 nest_check:
668c081a
NC
6840 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6841 ckWARN3reg(RExC_parse,
6842 "%.*s matches null string many times",
6843 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6844 origparse);
a0d0e21e
LW
6845 }
6846
b9b4dddf 6847 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6848 nextchar(pRExC_state);
6bda09f9 6849 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6850 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6851 }
b9b4dddf
YO
6852#ifndef REG_ALLOW_MINMOD_SUSPEND
6853 else
6854#endif
6855 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6856 regnode *ender;
6857 nextchar(pRExC_state);
6858 ender = reg_node(pRExC_state, SUCCEED);
6859 REGTAIL(pRExC_state, ret, ender);
6860 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6861 ret->flags = 0;
6862 ender = reg_node(pRExC_state, TAIL);
6863 REGTAIL(pRExC_state, ret, ender);
6864 /*ret= ender;*/
6865 }
6866
6867 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6868 RExC_parse++;
b45f050a
JF
6869 vFAIL("Nested quantifiers");
6870 }
a0d0e21e
LW
6871
6872 return(ret);
a687059c
LW
6873}
6874
fc8cd66c
YO
6875
6876/* reg_namedseq(pRExC_state,UVp)
6877
6878 This is expected to be called by a parser routine that has
afefe6bf 6879 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6880 expected to point at the first char following the N at the time
6881 of the call.
ff3f963a
KW
6882
6883 The \N may be inside (indicated by valuep not being NULL) or outside a
6884 character class.
6885
6886 \N may begin either a named sequence, or if outside a character class, mean
6887 to match a non-newline. For non single-quoted regexes, the tokenizer has
6888 attempted to decide which, and in the case of a named sequence converted it
6889 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6890 where c1... are the characters in the sequence. For single-quoted regexes,
6891 the tokenizer passes the \N sequence through unchanged; this code will not
6892 attempt to determine this nor expand those. The net effect is that if the
6893 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6894 signals that this \N occurrence means to match a non-newline.
6895
6896 Only the \N{U+...} form should occur in a character class, for the same
6897 reason that '.' inside a character class means to just match a period: it
6898 just doesn't make sense.
fc8cd66c
YO
6899
6900 If valuep is non-null then it is assumed that we are parsing inside
6901 of a charclass definition and the first codepoint in the resolved
6902 string is returned via *valuep and the routine will return NULL.
6903 In this mode if a multichar string is returned from the charnames
ff3f963a 6904 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6905 sequence will be examined. If the string returned is zero length
6906 then the value of *valuep is undefined and NON-NULL will
6907 be returned to indicate failure. (This will NOT be a valid pointer
6908 to a regnode.)
6909
ff3f963a
KW
6910 If valuep is null then it is assumed that we are parsing normal text and a
6911 new EXACT node is inserted into the program containing the resolved string,
6912 and a pointer to the new node is returned. But if the string is zero length
6913 a NOTHING node is emitted instead.
afefe6bf 6914
fc8cd66c 6915 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6916 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6917 */
6918STATIC regnode *
afefe6bf 6919S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6920{
c3c41406 6921 char * endbrace; /* '}' following the name */
fc8cd66c 6922 regnode *ret = NULL;
ff3f963a
KW
6923#ifdef DEBUGGING
6924 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6925#endif
c3c41406 6926 char* p;
ff3f963a
KW
6927
6928 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6929
6930 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6931
6932 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6933
6934 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6935 * modifier. The other meaning does not */
6936 p = (RExC_flags & RXf_PMf_EXTENDED)
6937 ? regwhite( pRExC_state, RExC_parse )
6938 : RExC_parse;
7918f24d 6939
ff3f963a 6940 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6941 * [^\n]. The former is assumed when it can't be the latter. */
6942 if (*p != '{' || regcurly(p)) {
6943 RExC_parse = p;
ff3f963a 6944 if (valuep) {
afefe6bf 6945 /* no bare \N in a charclass */
ff3f963a
KW
6946 vFAIL("\\N in a character class must be a named character: \\N{...}");
6947 }
afefe6bf
RGS
6948 nextchar(pRExC_state);
6949 ret = reg_node(pRExC_state, REG_ANY);
6950 *flagp |= HASWIDTH|SIMPLE;
6951 RExC_naughty++;
6952 RExC_parse--;
6953 Set_Node_Length(ret, 1); /* MJD */
6954 return ret;
fc8cd66c 6955 }
a4893424 6956
c3c41406
KW
6957 /* Here, we have decided it should be a named sequence */
6958
6959 /* The test above made sure that the next real character is a '{', but
6960 * under the /x modifier, it could be separated by space (or a comment and
6961 * \n) and this is not allowed (for consistency with \x{...} and the
6962 * tokenizer handling of \N{NAME}). */
6963 if (*RExC_parse != '{') {
6964 vFAIL("Missing braces on \\N{}");
6965 }
6966
ff3f963a 6967 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6968
6969 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6970 || ! (endbrace == RExC_parse /* nothing between the {} */
6971 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6972 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6973 {
6974 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6975 vFAIL("\\N{NAME} must be resolved by the lexer");
6976 }
6977
ff3f963a
KW
6978 if (endbrace == RExC_parse) { /* empty: \N{} */
6979 if (! valuep) {
6980 RExC_parse = endbrace + 1;
6981 return reg_node(pRExC_state,NOTHING);
a4893424 6982 }
fc8cd66c 6983
ff3f963a
KW
6984 if (SIZE_ONLY) {
6985 ckWARNreg(RExC_parse,
6986 "Ignoring zero length \\N{} in character class"
6987 );
6988 RExC_parse = endbrace + 1;
6989 }
6990 *valuep = 0;
6991 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6992 }
ff3f963a 6993
62fed28b 6994 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6995 RExC_parse += 2; /* Skip past the 'U+' */
6996
6997 if (valuep) { /* In a bracketed char class */
6998 /* We only pay attention to the first char of
6999 multichar strings being returned. I kinda wonder
7000 if this makes sense as it does change the behaviour
7001 from earlier versions, OTOH that behaviour was broken
7002 as well. XXX Solution is to recharacterize as
7003 [rest-of-class]|multi1|multi2... */
7004
7005 STRLEN length_of_hex;
7006 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7007 | PERL_SCAN_DISALLOW_PREFIX
7008 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7009
37820adc
KW
7010 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7011 if (endchar < endbrace) {
ff3f963a
KW
7012 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7013 }
ff3f963a
KW
7014
7015 length_of_hex = (STRLEN)(endchar - RExC_parse);
7016 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7017
7018 /* The tokenizer should have guaranteed validity, but it's possible to
7019 * bypass it by using single quoting, so check */
c3c41406
KW
7020 if (length_of_hex == 0
7021 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7022 {
7023 RExC_parse += length_of_hex; /* Includes all the valid */
7024 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7025 ? UTF8SKIP(RExC_parse)
7026 : 1;
7027 /* Guard against malformed utf8 */
7028 if (RExC_parse >= endchar) RExC_parse = endchar;
7029 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7030 }
7031
7032 RExC_parse = endbrace + 1;
7033 if (endchar == endbrace) return NULL;
7034
7035 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7036 }
ff3f963a
KW
7037 else { /* Not a char class */
7038 char *s; /* String to put in generated EXACT node */
fda99bee 7039 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7040 char *endchar; /* Points to '.' or '}' ending cur char in the input
7041 stream */
7042
2c2b7f86
KW
7043 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7044 : (LOC)
7045 ? EXACTFL
7046 : UNI_SEMANTICS
7047 ? EXACTFU
7048 : EXACTF));
ff3f963a
KW
7049 s= STRING(ret);
7050
7051 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7052 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7053 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7054 * limit are dropped. The limit could be relaxed should it become
7055 * desirable by reparsing this as (?:\N{NAME}), so could generate
7056 * multiple EXACT nodes, as is done for just regular input. But this
7057 * is primarily a named character, and not intended to be a huge long
7058 * string, so 255 bytes should be good enough */
7059 while (1) {
c3c41406 7060 STRLEN length_of_hex;
ff3f963a
KW
7061 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7062 | PERL_SCAN_DISALLOW_PREFIX
7063 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7064 UV cp; /* Ord of current character */
7065
7066 /* Code points are separated by dots. If none, there is only one
7067 * code point, and is terminated by the brace */
37820adc 7068 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7069
7070 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7071 length_of_hex = (STRLEN)(endchar - RExC_parse);
7072 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7073 if ( length_of_hex == 0
7074 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7075 {
c3c41406
KW
7076 RExC_parse += length_of_hex; /* Includes all the valid */
7077 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7078 ? UTF8SKIP(RExC_parse)
7079 : 1;
7080 /* Guard against malformed utf8 */
7081 if (RExC_parse >= endchar) RExC_parse = endchar;
7082 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7083 }
7084
7085 if (! FOLD) { /* Not folding, just append to the string */
7086 STRLEN unilen;
7087
7088 /* Quit before adding this character if would exceed limit */
7089 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7090
ff3f963a
KW
7091 unilen = reguni(pRExC_state, cp, s);
7092 if (unilen > 0) {
7093 s += unilen;
7094 len += unilen;
7095 }
7096 } else { /* Folding, output the folded equivalent */
7097 STRLEN foldlen,numlen;
7098 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7099 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7100
7101 /* Quit before exceeding size limit */
7102 if (len + foldlen > U8_MAX) break;
7103
7104 for (foldbuf = tmpbuf;
7105 foldlen;
7106 foldlen -= numlen)
7107 {
7108 cp = utf8_to_uvchr(foldbuf, &numlen);
7109 if (numlen > 0) {
7110 const STRLEN unilen = reguni(pRExC_state, cp, s);
7111 s += unilen;
7112 len += unilen;
7113 /* In EBCDIC the numlen and unilen can differ. */
7114 foldbuf += numlen;
7115 if (numlen >= foldlen)
7116 break;
7117 }
7118 else
7119 break; /* "Can't happen." */
7120 }
7121 }
7122
7123 /* Point to the beginning of the next character in the sequence. */
7124 RExC_parse = endchar + 1;
7125
7126 /* Quit if no more characters */
7127 if (RExC_parse >= endbrace) break;
7128 }
7129
7130
7131 if (SIZE_ONLY) {
7132 if (RExC_parse < endbrace) {
7133 ckWARNreg(RExC_parse - 1,
7134 "Using just the first characters returned by \\N{}");
7135 }
7136
7137 RExC_size += STR_SZ(len);
7138 } else {
7139 STR_LEN(ret) = len;
7140 RExC_emit += STR_SZ(len);
7141 }
7142
7143 RExC_parse = endbrace + 1;
7144
7145 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7146 with malformed in t/re/pat_advanced.t */
7147 RExC_parse --;
7148 Set_Node_Cur_Length(ret); /* MJD */
7149 nextchar(pRExC_state);
7150 }
7151
7152 return ret;
fc8cd66c
YO
7153}
7154
7155
9e08bc66
TS
7156/*
7157 * reg_recode
7158 *
7159 * It returns the code point in utf8 for the value in *encp.
7160 * value: a code value in the source encoding
7161 * encp: a pointer to an Encode object
7162 *
7163 * If the result from Encode is not a single character,
7164 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7165 */
7166STATIC UV
7167S_reg_recode(pTHX_ const char value, SV **encp)
7168{
7169 STRLEN numlen = 1;
59cd0e26 7170 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7171 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7172 const STRLEN newlen = SvCUR(sv);
7173 UV uv = UNICODE_REPLACEMENT;
7174
7918f24d
NC
7175 PERL_ARGS_ASSERT_REG_RECODE;
7176
9e08bc66
TS
7177 if (newlen)
7178 uv = SvUTF8(sv)
7179 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7180 : *(U8*)s;
7181
7182 if (!newlen || numlen != newlen) {
7183 uv = UNICODE_REPLACEMENT;
c86f7df5 7184 *encp = NULL;
9e08bc66
TS
7185 }
7186 return uv;
7187}
7188
fc8cd66c 7189
a687059c
LW
7190/*
7191 - regatom - the lowest level
ee9b8eae
YO
7192
7193 Try to identify anything special at the start of the pattern. If there
7194 is, then handle it as required. This may involve generating a single regop,
7195 such as for an assertion; or it may involve recursing, such as to
7196 handle a () structure.
7197
7198 If the string doesn't start with something special then we gobble up
7199 as much literal text as we can.
7200
7201 Once we have been able to handle whatever type of thing started the
7202 sequence, we return.
7203
7204 Note: we have to be careful with escapes, as they can be both literal
7205 and special, and in the case of \10 and friends can either, depending
7206 on context. Specifically there are two seperate switches for handling
7207 escape sequences, with the one for handling literal escapes requiring
7208 a dummy entry for all of the special escapes that are actually handled
7209 by the other.
7210*/
7211
76e3520e 7212STATIC regnode *
3dab1dad 7213S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7214{
97aff369 7215 dVAR;
cbbf8932 7216 register regnode *ret = NULL;
a0d0e21e 7217 I32 flags;
45948336 7218 char *parse_start = RExC_parse;
3dab1dad
YO
7219 GET_RE_DEBUG_FLAGS_DECL;
7220 DEBUG_PARSE("atom");
a0d0e21e
LW
7221 *flagp = WORST; /* Tentatively. */
7222
7918f24d 7223 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7224
a0d0e21e 7225tryagain:
f9a79580 7226 switch ((U8)*RExC_parse) {
a0d0e21e 7227 case '^':
830247a4
IZ
7228 RExC_seen_zerolen++;
7229 nextchar(pRExC_state);
bbe252da 7230 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7231 ret = reg_node(pRExC_state, MBOL);
bbe252da 7232 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7233 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7234 else
830247a4 7235 ret = reg_node(pRExC_state, BOL);
fac92740 7236 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7237 break;
7238 case '$':
830247a4 7239 nextchar(pRExC_state);
b81d288d 7240 if (*RExC_parse)
830247a4 7241 RExC_seen_zerolen++;
bbe252da 7242 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7243 ret = reg_node(pRExC_state, MEOL);
bbe252da 7244 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7245 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7246 else
830247a4 7247 ret = reg_node(pRExC_state, EOL);
fac92740 7248 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7249 break;
7250 case '.':
830247a4 7251 nextchar(pRExC_state);
bbe252da 7252 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7253 ret = reg_node(pRExC_state, SANY);
7254 else
7255 ret = reg_node(pRExC_state, REG_ANY);
7256 *flagp |= HASWIDTH|SIMPLE;
830247a4 7257 RExC_naughty++;
fac92740 7258 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7259 break;
7260 case '[':
b45f050a 7261 {
3dab1dad
YO
7262 char * const oregcomp_parse = ++RExC_parse;
7263 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7264 if (*RExC_parse != ']') {
7265 RExC_parse = oregcomp_parse;
b45f050a
JF
7266 vFAIL("Unmatched [");
7267 }
830247a4 7268 nextchar(pRExC_state);
a0d0e21e 7269 *flagp |= HASWIDTH|SIMPLE;
fac92740 7270 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7271 break;
b45f050a 7272 }
a0d0e21e 7273 case '(':
830247a4 7274 nextchar(pRExC_state);
3dab1dad 7275 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7276 if (ret == NULL) {
bf93d4cc 7277 if (flags & TRYAGAIN) {
830247a4 7278 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7279 /* Make parent create an empty node if needed. */
7280 *flagp |= TRYAGAIN;
7281 return(NULL);
7282 }
a0d0e21e 7283 goto tryagain;
bf93d4cc 7284 }
a0d0e21e
LW
7285 return(NULL);
7286 }
a3b492c3 7287 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7288 break;
7289 case '|':
7290 case ')':
7291 if (flags & TRYAGAIN) {
7292 *flagp |= TRYAGAIN;
7293 return NULL;
7294 }
b45f050a 7295 vFAIL("Internal urp");
a0d0e21e
LW
7296 /* Supposed to be caught earlier. */
7297 break;
85afd4ae 7298 case '{':
830247a4
IZ
7299 if (!regcurly(RExC_parse)) {
7300 RExC_parse++;
85afd4ae
CS
7301 goto defchar;
7302 }
7303 /* FALL THROUGH */
a0d0e21e
LW
7304 case '?':
7305 case '+':
7306 case '*':
830247a4 7307 RExC_parse++;
b45f050a 7308 vFAIL("Quantifier follows nothing");
a0d0e21e 7309 break;
ced7f090
KW
7310 case LATIN_SMALL_LETTER_SHARP_S:
7311 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7312 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7313#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7314#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7315 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7316#endif
a0a388a1 7317 do_foldchar:
56d400ed 7318 if (!LOC && FOLD) {
e64b1bd1 7319 U32 len,cp;
7cf3a6a3 7320 len=0; /* silence a spurious compiler warning */
56d400ed 7321 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7322 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7323 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7324 ret = reganode(pRExC_state, FOLDCHAR, cp);
7325 Set_Node_Length(ret, 1); /* MJD */
7326 nextchar(pRExC_state); /* kill whitespace under /x */
7327 return ret;
7328 }
7329 }
7330 goto outer_default;
a0d0e21e 7331 case '\\':
ee9b8eae
YO
7332 /* Special Escapes
7333
7334 This switch handles escape sequences that resolve to some kind
7335 of special regop and not to literal text. Escape sequnces that
7336 resolve to literal text are handled below in the switch marked
7337 "Literal Escapes".
7338
7339 Every entry in this switch *must* have a corresponding entry
7340 in the literal escape switch. However, the opposite is not
7341 required, as the default for this switch is to jump to the
7342 literal text handling code.
7343 */
a0a388a1 7344 switch ((U8)*++RExC_parse) {
ced7f090
KW
7345 case LATIN_SMALL_LETTER_SHARP_S:
7346 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7347 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 7348 goto do_foldchar;
ee9b8eae 7349 /* Special Escapes */
a0d0e21e 7350 case 'A':
830247a4
IZ
7351 RExC_seen_zerolen++;
7352 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7353 *flagp |= SIMPLE;
ee9b8eae 7354 goto finish_meta_pat;
a0d0e21e 7355 case 'G':
830247a4
IZ
7356 ret = reg_node(pRExC_state, GPOS);
7357 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7358 *flagp |= SIMPLE;
ee9b8eae
YO
7359 goto finish_meta_pat;
7360 case 'K':
7361 RExC_seen_zerolen++;
7362 ret = reg_node(pRExC_state, KEEPS);
7363 *flagp |= SIMPLE;
37923168
RGS
7364 /* XXX:dmq : disabling in-place substitution seems to
7365 * be necessary here to avoid cases of memory corruption, as
7366 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7367 */
7368 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7369 goto finish_meta_pat;
a0d0e21e 7370 case 'Z':
830247a4 7371 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7372 *flagp |= SIMPLE;
a1917ab9 7373 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7374 goto finish_meta_pat;
b85d18e9 7375 case 'z':
830247a4 7376 ret = reg_node(pRExC_state, EOS);
b85d18e9 7377 *flagp |= SIMPLE;
830247a4 7378 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7379 goto finish_meta_pat;
4a2d328f 7380 case 'C':
f33976b4
DB
7381 ret = reg_node(pRExC_state, CANY);
7382 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7383 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7384 goto finish_meta_pat;
a0ed51b3 7385 case 'X':
830247a4 7386 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7387 *flagp |= HASWIDTH;
ee9b8eae 7388 goto finish_meta_pat;
a0d0e21e 7389 case 'w':
a12cf05f
KW
7390 if (LOC) {
7391 ret = reg_node(pRExC_state, (U8)(ALNUML));
7392 } else {
7393 ret = reg_node(pRExC_state, (U8)(ALNUM));
7394 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7395 }
a0d0e21e 7396 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7397 goto finish_meta_pat;
a0d0e21e 7398 case 'W':
a12cf05f
KW
7399 if (LOC) {
7400 ret = reg_node(pRExC_state, (U8)(NALNUML));
7401 } else {
7402 ret = reg_node(pRExC_state, (U8)(NALNUM));
7403 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7404 }
a0d0e21e 7405 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7406 goto finish_meta_pat;
a0d0e21e 7407 case 'b':
830247a4
IZ
7408 RExC_seen_zerolen++;
7409 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7410 if (LOC) {
7411 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7412 } else {
7413 ret = reg_node(pRExC_state, (U8)(BOUND));
7414 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7415 }
a0d0e21e 7416 *flagp |= SIMPLE;
ee9b8eae 7417 goto finish_meta_pat;
a0d0e21e 7418 case 'B':
830247a4
IZ
7419 RExC_seen_zerolen++;
7420 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7421 if (LOC) {
7422 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7423 } else {
7424 ret = reg_node(pRExC_state, (U8)(NBOUND));
7425 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7426 }
a0d0e21e 7427 *flagp |= SIMPLE;
ee9b8eae 7428 goto finish_meta_pat;
a0d0e21e 7429 case 's':
a12cf05f
KW
7430 if (LOC) {
7431 ret = reg_node(pRExC_state, (U8)(SPACEL));
7432 } else {
7433 ret = reg_node(pRExC_state, (U8)(SPACE));
7434 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7435 }
a0d0e21e 7436 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7437 goto finish_meta_pat;
a0d0e21e 7438 case 'S':
a12cf05f
KW
7439 if (LOC) {
7440 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7441 } else {
7442 ret = reg_node(pRExC_state, (U8)(NSPACE));
7443 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7444 }
a0d0e21e 7445 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7446 goto finish_meta_pat;
a0d0e21e 7447 case 'd':
6ab9ea91
KW
7448 if (LOC) {
7449 ret = reg_node(pRExC_state, (U8)(DIGITL));
7450 } else {
7451 ret = reg_node(pRExC_state, (U8)(DIGIT));
7452 }
a0d0e21e 7453 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7454 goto finish_meta_pat;
a0d0e21e 7455 case 'D':
6ab9ea91
KW
7456 if (LOC) {
7457 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7458 } else {
7459 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7460 }
a0d0e21e 7461 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7462 goto finish_meta_pat;
e1d1eefb
YO
7463 case 'R':
7464 ret = reg_node(pRExC_state, LNBREAK);
7465 *flagp |= HASWIDTH|SIMPLE;
7466 goto finish_meta_pat;
7467 case 'h':
7468 ret = reg_node(pRExC_state, HORIZWS);
7469 *flagp |= HASWIDTH|SIMPLE;
7470 goto finish_meta_pat;
7471 case 'H':
7472 ret = reg_node(pRExC_state, NHORIZWS);
7473 *flagp |= HASWIDTH|SIMPLE;
7474 goto finish_meta_pat;
ee9b8eae 7475 case 'v':
e1d1eefb
YO
7476 ret = reg_node(pRExC_state, VERTWS);
7477 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7478 goto finish_meta_pat;
7479 case 'V':
e1d1eefb
YO
7480 ret = reg_node(pRExC_state, NVERTWS);
7481 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7482 finish_meta_pat:
830247a4 7483 nextchar(pRExC_state);
fac92740 7484 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7485 break;
a14b48bc
LW
7486 case 'p':
7487 case 'P':
3568d838 7488 {
3dab1dad 7489 char* const oldregxend = RExC_end;
d008bc60 7490#ifdef DEBUGGING
ccb2c380 7491 char* parse_start = RExC_parse - 2;
d008bc60 7492#endif
a14b48bc 7493
830247a4 7494 if (RExC_parse[1] == '{') {
3568d838 7495 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7496 RExC_end = strchr(RExC_parse, '}');
7497 if (!RExC_end) {
3dab1dad 7498 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7499 RExC_parse += 2;
7500 RExC_end = oldregxend;
0da60cf5 7501 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7502 }
830247a4 7503 RExC_end++;
a14b48bc 7504 }
af6f566e 7505 else {
830247a4 7506 RExC_end = RExC_parse + 2;
af6f566e
HS
7507 if (RExC_end > oldregxend)
7508 RExC_end = oldregxend;
7509 }
830247a4 7510 RExC_parse--;
a14b48bc 7511
3dab1dad 7512 ret = regclass(pRExC_state,depth+1);
a14b48bc 7513
830247a4
IZ
7514 RExC_end = oldregxend;
7515 RExC_parse--;
ccb2c380
MP
7516
7517 Set_Node_Offset(ret, parse_start + 2);
7518 Set_Node_Cur_Length(ret);
830247a4 7519 nextchar(pRExC_state);
a14b48bc
LW
7520 *flagp |= HASWIDTH|SIMPLE;
7521 }
7522 break;
fc8cd66c 7523 case 'N':
afefe6bf 7524 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7525 multicharacter. join_exact() will join them up later on.
7526 Also this makes sure that things like /\N{BLAH}+/ and
7527 \N{BLAH} being multi char Just Happen. dmq*/
7528 ++RExC_parse;
afefe6bf 7529 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7530 break;
0a4db386 7531 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7532 parse_named_seq:
81714fb9
YO
7533 {
7534 char ch= RExC_parse[1];
1f1031fe
YO
7535 if (ch != '<' && ch != '\'' && ch != '{') {
7536 RExC_parse++;
7537 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7538 } else {
1f1031fe
YO
7539 /* this pretty much dupes the code for (?P=...) in reg(), if
7540 you change this make sure you change that */
81714fb9 7541 char* name_start = (RExC_parse += 2);
2eccd3b2 7542 U32 num = 0;
0a4db386
YO
7543 SV *sv_dat = reg_scan_name(pRExC_state,
7544 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7545 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7546 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7547 vFAIL2("Sequence %.3s... not terminated",parse_start);
7548
7549 if (!SIZE_ONLY) {
7550 num = add_data( pRExC_state, 1, "S" );
7551 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7552 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7553 }
7554
81714fb9
YO
7555 RExC_sawback = 1;
7556 ret = reganode(pRExC_state,
4444fd9f
KW
7557 ((! FOLD)
7558 ? NREF
7559 : (UNI_SEMANTICS)
7560 ? NREFFU
7561 : (LOC)
7562 ? NREFFL
7563 : NREFF),
7564 num);
81714fb9 7565 *flagp |= HASWIDTH;
1f1031fe 7566
81714fb9
YO
7567 /* override incorrect value set in reganode MJD */
7568 Set_Node_Offset(ret, parse_start+1);
7569 Set_Node_Cur_Length(ret); /* MJD */
7570 nextchar(pRExC_state);
1f1031fe 7571
81714fb9
YO
7572 }
7573 break;
1f1031fe 7574 }
2bf803e2 7575 case 'g':
a0d0e21e
LW
7576 case '1': case '2': case '3': case '4':
7577 case '5': case '6': case '7': case '8': case '9':
7578 {
c74340f9 7579 I32 num;
2bf803e2
YO
7580 bool isg = *RExC_parse == 'g';
7581 bool isrel = 0;
7582 bool hasbrace = 0;
7583 if (isg) {
c74340f9 7584 RExC_parse++;
2bf803e2
YO
7585 if (*RExC_parse == '{') {
7586 RExC_parse++;
7587 hasbrace = 1;
7588 }
7589 if (*RExC_parse == '-') {
7590 RExC_parse++;
7591 isrel = 1;
7592 }
1f1031fe
YO
7593 if (hasbrace && !isDIGIT(*RExC_parse)) {
7594 if (isrel) RExC_parse--;
7595 RExC_parse -= 2;
7596 goto parse_named_seq;
7597 } }
c74340f9 7598 num = atoi(RExC_parse);
b72d83b2
RGS
7599 if (isg && num == 0)
7600 vFAIL("Reference to invalid group 0");
c74340f9 7601 if (isrel) {
5624f11d 7602 num = RExC_npar - num;
c74340f9
YO
7603 if (num < 1)
7604 vFAIL("Reference to nonexistent or unclosed group");
7605 }
2bf803e2 7606 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7607 goto defchar;
7608 else {
3dab1dad 7609 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7610 while (isDIGIT(*RExC_parse))
7611 RExC_parse++;
1f1031fe
YO
7612 if (parse_start == RExC_parse - 1)
7613 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7614 if (hasbrace) {
7615 if (*RExC_parse != '}')
7616 vFAIL("Unterminated \\g{...} pattern");
7617 RExC_parse++;
7618 }
c74340f9
YO
7619 if (!SIZE_ONLY) {
7620 if (num > (I32)RExC_rx->nparens)
7621 vFAIL("Reference to nonexistent group");
c74340f9 7622 }
830247a4 7623 RExC_sawback = 1;
eb160463 7624 ret = reganode(pRExC_state,
4444fd9f
KW
7625 ((! FOLD)
7626 ? REF
7627 : (UNI_SEMANTICS)
7628 ? REFFU
7629 : (LOC)
7630 ? REFFL
7631 : REFF),
7632 num);
a0d0e21e 7633 *flagp |= HASWIDTH;
2af232bd 7634
fac92740 7635 /* override incorrect value set in reganode MJD */
2af232bd 7636 Set_Node_Offset(ret, parse_start+1);
fac92740 7637 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7638 RExC_parse--;
7639 nextchar(pRExC_state);
a0d0e21e
LW
7640 }
7641 }
7642 break;
7643 case '\0':
830247a4 7644 if (RExC_parse >= RExC_end)
b45f050a 7645 FAIL("Trailing \\");
a0d0e21e
LW
7646 /* FALL THROUGH */
7647 default:
a0288114 7648 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7649 back into the quick-grab loop below */
45948336 7650 parse_start--;
a0d0e21e
LW
7651 goto defchar;
7652 }
7653 break;
4633a7c4
LW
7654
7655 case '#':
bbe252da 7656 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7657 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7658 goto tryagain;
7659 }
7660 /* FALL THROUGH */
7661
f9a79580
RGS
7662 default:
7663 outer_default:{
ba210ebe 7664 register STRLEN len;
58ae7d3f 7665 register UV ender;
a0d0e21e 7666 register char *p;
3dab1dad 7667 char *s;
80aecb99 7668 STRLEN foldlen;
89ebb4a3 7669 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7670
7671 parse_start = RExC_parse - 1;
a0d0e21e 7672
830247a4 7673 RExC_parse++;
a0d0e21e
LW
7674
7675 defchar:
58ae7d3f 7676 ender = 0;
eb160463 7677 ret = reg_node(pRExC_state,
2c2b7f86
KW
7678 (U8) ((! FOLD) ? EXACT
7679 : (LOC)
7680 ? EXACTFL
7681 : (UNI_SEMANTICS)
7682 ? EXACTFU
7683 : EXACTF)
7684 );
cd439c50 7685 s = STRING(ret);
830247a4
IZ
7686 for (len = 0, p = RExC_parse - 1;
7687 len < 127 && p < RExC_end;
a0d0e21e
LW
7688 len++)
7689 {
3dab1dad 7690 char * const oldp = p;
5b5a24f7 7691
bbe252da 7692 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7693 p = regwhite( pRExC_state, p );
f9a79580 7694 switch ((U8)*p) {
ced7f090
KW
7695 case LATIN_SMALL_LETTER_SHARP_S:
7696 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7697 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 7698 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7699 goto normal_default;
a0d0e21e
LW
7700 case '^':
7701 case '$':
7702 case '.':
7703 case '[':
7704 case '(':
7705 case ')':
7706 case '|':
7707 goto loopdone;
7708 case '\\':
ee9b8eae
YO
7709 /* Literal Escapes Switch
7710
7711 This switch is meant to handle escape sequences that
7712 resolve to a literal character.
7713
7714 Every escape sequence that represents something
7715 else, like an assertion or a char class, is handled
7716 in the switch marked 'Special Escapes' above in this
7717 routine, but also has an entry here as anything that
7718 isn't explicitly mentioned here will be treated as
7719 an unescaped equivalent literal.
7720 */
7721
a0a388a1 7722 switch ((U8)*++p) {
ee9b8eae 7723 /* These are all the special escapes. */
ced7f090
KW
7724 case LATIN_SMALL_LETTER_SHARP_S:
7725 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7726 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
7727 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7728 goto normal_default;
ee9b8eae
YO
7729 case 'A': /* Start assertion */
7730 case 'b': case 'B': /* Word-boundary assertion*/
7731 case 'C': /* Single char !DANGEROUS! */
7732 case 'd': case 'D': /* digit class */
7733 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7734 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7735 case 'k': case 'K': /* named backref, keep marker */
7736 case 'N': /* named char sequence */
38a44b82 7737 case 'p': case 'P': /* Unicode property */
e1d1eefb 7738 case 'R': /* LNBREAK */
ee9b8eae 7739 case 's': case 'S': /* space class */
e1d1eefb 7740 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7741 case 'w': case 'W': /* word class */
7742 case 'X': /* eXtended Unicode "combining character sequence" */
7743 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7744 --p;
7745 goto loopdone;
ee9b8eae
YO
7746
7747 /* Anything after here is an escape that resolves to a
7748 literal. (Except digits, which may or may not)
7749 */
a0d0e21e
LW
7750 case 'n':
7751 ender = '\n';
7752 p++;
a687059c 7753 break;
a0d0e21e
LW
7754 case 'r':
7755 ender = '\r';
7756 p++;
a687059c 7757 break;
a0d0e21e
LW
7758 case 't':
7759 ender = '\t';
7760 p++;
a687059c 7761 break;
a0d0e21e
LW
7762 case 'f':
7763 ender = '\f';
7764 p++;
a687059c 7765 break;
a0d0e21e 7766 case 'e':
c7f1f016 7767 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7768 p++;
a687059c 7769 break;
a0d0e21e 7770 case 'a':
c7f1f016 7771 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7772 p++;
a687059c 7773 break;
f0a2b745
KW
7774 case 'o':
7775 {
7776 STRLEN brace_len = len;
00c0cb6d 7777 UV result;
454155d9
KW
7778 const char* error_msg;
7779
7780 bool valid = grok_bslash_o(p,
7781 &result,
7782 &brace_len,
7783 &error_msg,
7784 1);
7785 p += brace_len;
7786 if (! valid) {
7787 RExC_parse = p; /* going to die anyway; point
7788 to exact spot of failure */
f0a2b745
KW
7789 vFAIL(error_msg);
7790 }
00c0cb6d
DG
7791 else
7792 {
7793 ender = result;
7794 }
f0a2b745
KW
7795 if (PL_encoding && ender < 0x100) {
7796 goto recode_encoding;
7797 }
7798 if (ender > 0xff) {
62fed28b 7799 REQUIRE_UTF8;
f0a2b745
KW
7800 }
7801 break;
7802 }
a0d0e21e 7803 case 'x':
a0ed51b3 7804 if (*++p == '{') {
1df70142 7805 char* const e = strchr(p, '}');
b81d288d 7806
b45f050a 7807 if (!e) {
830247a4 7808 RExC_parse = p + 1;
b45f050a
JF
7809 vFAIL("Missing right brace on \\x{}");
7810 }
de5f0749 7811 else {
a4c04bdc
NC
7812 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7813 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7814 STRLEN numlen = e - p - 1;
53305cf1 7815 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7816 if (ender > 0xff)
62fed28b 7817 REQUIRE_UTF8;
a0ed51b3
LW
7818 p = e + 1;
7819 }
a0ed51b3
LW
7820 }
7821 else {
a4c04bdc 7822 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7823 STRLEN numlen = 2;
53305cf1 7824 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7825 p += numlen;
7826 }
9e08bc66
TS
7827 if (PL_encoding && ender < 0x100)
7828 goto recode_encoding;
a687059c 7829 break;
a0d0e21e
LW
7830 case 'c':
7831 p++;
f9d13529 7832 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7833 break;
a0d0e21e
LW
7834 case '0': case '1': case '2': case '3':case '4':
7835 case '5': case '6': case '7': case '8':case '9':
7836 if (*p == '0' ||
ca67da41 7837 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7838 {
7839 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7840 STRLEN numlen = 3;
53305cf1 7841 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7842 if (ender > 0xff) {
62fed28b 7843 REQUIRE_UTF8;
609122bd 7844 }
a0d0e21e
LW
7845 p += numlen;
7846 }
7847 else {
7848 --p;
7849 goto loopdone;
a687059c 7850 }
9e08bc66
TS
7851 if (PL_encoding && ender < 0x100)
7852 goto recode_encoding;
7853 break;
7854 recode_encoding:
7855 {
7856 SV* enc = PL_encoding;
7857 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7858 if (!enc && SIZE_ONLY)
7859 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7860 REQUIRE_UTF8;
9e08bc66 7861 }
a687059c 7862 break;
a0d0e21e 7863 case '\0':
830247a4 7864 if (p >= RExC_end)
b45f050a 7865 FAIL("Trailing \\");
a687059c 7866 /* FALL THROUGH */
a0d0e21e 7867 default:
668c081a
NC
7868 if (!SIZE_ONLY&& isALPHA(*p))
7869 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7870 goto normal_default;
a0d0e21e
LW
7871 }
7872 break;
a687059c 7873 default:
a0ed51b3 7874 normal_default:
fd400ab9 7875 if (UTF8_IS_START(*p) && UTF) {
1df70142 7876 STRLEN numlen;
5e12f4fb 7877 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7878 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7879 p += numlen;
7880 }
7881 else
7882 ender = *p++;
a0d0e21e 7883 break;
a687059c 7884 }
bcdf7404
YO
7885 if ( RExC_flags & RXf_PMf_EXTENDED)
7886 p = regwhite( pRExC_state, p );
60a8b682
JH
7887 if (UTF && FOLD) {
7888 /* Prime the casefolded buffer. */
ac7e0132 7889 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7890 }
bcdf7404 7891 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7892 if (len)
7893 p = oldp;
16ea2a2e 7894 else if (UTF) {
80aecb99 7895 if (FOLD) {
60a8b682 7896 /* Emit all the Unicode characters. */
1df70142 7897 STRLEN numlen;
80aecb99
JH
7898 for (foldbuf = tmpbuf;
7899 foldlen;
7900 foldlen -= numlen) {
7901 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7902 if (numlen > 0) {
71207a34 7903 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7904 s += unilen;
7905 len += unilen;
7906 /* In EBCDIC the numlen
7907 * and unilen can differ. */
9dc45d57 7908 foldbuf += numlen;
47654450
JH
7909 if (numlen >= foldlen)
7910 break;
9dc45d57
JH
7911 }
7912 else
7913 break; /* "Can't happen." */
80aecb99
JH
7914 }
7915 }
7916 else {
71207a34 7917 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7918 if (unilen > 0) {
0ebc6274
JH
7919 s += unilen;
7920 len += unilen;
9dc45d57 7921 }
80aecb99 7922 }
a0ed51b3 7923 }
a0d0e21e
LW
7924 else {
7925 len++;
eb160463 7926 REGC((char)ender, s++);
a0d0e21e
LW
7927 }
7928 break;
a687059c 7929 }
16ea2a2e 7930 if (UTF) {
80aecb99 7931 if (FOLD) {
60a8b682 7932 /* Emit all the Unicode characters. */
1df70142 7933 STRLEN numlen;
80aecb99
JH
7934 for (foldbuf = tmpbuf;
7935 foldlen;
7936 foldlen -= numlen) {
7937 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7938 if (numlen > 0) {
71207a34 7939 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7940 len += unilen;
7941 s += unilen;
7942 /* In EBCDIC the numlen
7943 * and unilen can differ. */
9dc45d57 7944 foldbuf += numlen;
47654450
JH
7945 if (numlen >= foldlen)
7946 break;
9dc45d57
JH
7947 }
7948 else
7949 break;
80aecb99
JH
7950 }
7951 }
7952 else {
71207a34 7953 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7954 if (unilen > 0) {
0ebc6274
JH
7955 s += unilen;
7956 len += unilen;
9dc45d57 7957 }
80aecb99
JH
7958 }
7959 len--;
a0ed51b3
LW
7960 }
7961 else
eb160463 7962 REGC((char)ender, s++);
a0d0e21e
LW
7963 }
7964 loopdone:
830247a4 7965 RExC_parse = p - 1;
fac92740 7966 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7967 nextchar(pRExC_state);
793db0cb
JH
7968 {
7969 /* len is STRLEN which is unsigned, need to copy to signed */
7970 IV iv = len;
7971 if (iv < 0)
7972 vFAIL("Internal disaster");
7973 }
a0d0e21e
LW
7974 if (len > 0)
7975 *flagp |= HASWIDTH;
090f7165 7976 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7977 *flagp |= SIMPLE;
3dab1dad 7978
cd439c50 7979 if (SIZE_ONLY)
830247a4 7980 RExC_size += STR_SZ(len);
3dab1dad
YO
7981 else {
7982 STR_LEN(ret) = len;
830247a4 7983 RExC_emit += STR_SZ(len);
07be1b83 7984 }
3dab1dad 7985 }
a0d0e21e
LW
7986 break;
7987 }
a687059c 7988
a0d0e21e 7989 return(ret);
a687059c
LW
7990}
7991
873ef191 7992STATIC char *
bcdf7404 7993S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7994{
bcdf7404 7995 const char *e = RExC_end;
7918f24d
NC
7996
7997 PERL_ARGS_ASSERT_REGWHITE;
7998
5b5a24f7
CS
7999 while (p < e) {
8000 if (isSPACE(*p))
8001 ++p;
8002 else if (*p == '#') {
bcdf7404 8003 bool ended = 0;
5b5a24f7 8004 do {
bcdf7404
YO
8005 if (*p++ == '\n') {
8006 ended = 1;
8007 break;
8008 }
8009 } while (p < e);
8010 if (!ended)
8011 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8012 }
8013 else
8014 break;
8015 }
8016 return p;
8017}
8018
b8c5462f
JH
8019/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8020 Character classes ([:foo:]) can also be negated ([:^foo:]).
8021 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8022 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8023 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8024
8025#define POSIXCC_DONE(c) ((c) == ':')
8026#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8027#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8028
b8c5462f 8029STATIC I32
830247a4 8030S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8031{
97aff369 8032 dVAR;
936ed897 8033 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8034
7918f24d
NC
8035 PERL_ARGS_ASSERT_REGPPOSIXCC;
8036
830247a4 8037 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8038 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8039 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8040 const char c = UCHARAT(RExC_parse);
097eb12c 8041 char* const s = RExC_parse++;
b81d288d 8042
9a86a77b 8043 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8044 RExC_parse++;
8045 if (RExC_parse == RExC_end)
620e46c5 8046 /* Grandfather lone [:, [=, [. */
830247a4 8047 RExC_parse = s;
620e46c5 8048 else {
3dab1dad 8049 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
8050 assert(*t == c);
8051
9a86a77b 8052 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 8053 const char *posixcc = s + 1;
830247a4 8054 RExC_parse++; /* skip over the ending ] */
3dab1dad 8055
b8c5462f 8056 if (*s == ':') {
1df70142
AL
8057 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8058 const I32 skip = t - posixcc;
80916619
NC
8059
8060 /* Initially switch on the length of the name. */
8061 switch (skip) {
8062 case 4:
3dab1dad
YO
8063 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8064 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 8065 break;
80916619
NC
8066 case 5:
8067 /* Names all of length 5. */
8068 /* alnum alpha ascii blank cntrl digit graph lower
8069 print punct space upper */
8070 /* Offset 4 gives the best switch position. */
8071 switch (posixcc[4]) {
8072 case 'a':
3dab1dad
YO
8073 if (memEQ(posixcc, "alph", 4)) /* alpha */
8074 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
8075 break;
8076 case 'e':
3dab1dad
YO
8077 if (memEQ(posixcc, "spac", 4)) /* space */
8078 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
8079 break;
8080 case 'h':
3dab1dad
YO
8081 if (memEQ(posixcc, "grap", 4)) /* graph */
8082 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
8083 break;
8084 case 'i':
3dab1dad
YO
8085 if (memEQ(posixcc, "asci", 4)) /* ascii */
8086 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
8087 break;
8088 case 'k':
3dab1dad
YO
8089 if (memEQ(posixcc, "blan", 4)) /* blank */
8090 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
8091 break;
8092 case 'l':
3dab1dad
YO
8093 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8094 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
8095 break;
8096 case 'm':
3dab1dad
YO
8097 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8098 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8099 break;
8100 case 'r':
3dab1dad
YO
8101 if (memEQ(posixcc, "lowe", 4)) /* lower */
8102 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8103 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8104 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8105 break;
8106 case 't':
3dab1dad
YO
8107 if (memEQ(posixcc, "digi", 4)) /* digit */
8108 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8109 else if (memEQ(posixcc, "prin", 4)) /* print */
8110 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8111 else if (memEQ(posixcc, "punc", 4)) /* punct */
8112 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8113 break;
b8c5462f
JH
8114 }
8115 break;
80916619 8116 case 6:
3dab1dad
YO
8117 if (memEQ(posixcc, "xdigit", 6))
8118 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8119 break;
8120 }
80916619
NC
8121
8122 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8123 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8124 t - s - 1, s + 1);
80916619
NC
8125 assert (posixcc[skip] == ':');
8126 assert (posixcc[skip+1] == ']');
b45f050a 8127 } else if (!SIZE_ONLY) {
b8c5462f 8128 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8129
830247a4 8130 /* adjust RExC_parse so the warning shows after
b45f050a 8131 the class closes */
9a86a77b 8132 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8133 RExC_parse++;
b45f050a
JF
8134 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8135 }
b8c5462f
JH
8136 } else {
8137 /* Maternal grandfather:
8138 * "[:" ending in ":" but not in ":]" */
830247a4 8139 RExC_parse = s;
767d463e 8140 }
620e46c5
JH
8141 }
8142 }
8143
b8c5462f
JH
8144 return namedclass;
8145}
8146
8147STATIC void
830247a4 8148S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8149{
97aff369 8150 dVAR;
7918f24d
NC
8151
8152 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8153
3dab1dad 8154 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8155 const char *s = RExC_parse;
8156 const char c = *s++;
b8c5462f 8157
3dab1dad 8158 while (isALNUM(*s))
b8c5462f
JH
8159 s++;
8160 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8161 ckWARN3reg(s+2,
8162 "POSIX syntax [%c %c] belongs inside character classes",
8163 c, c);
b45f050a
JF
8164
8165 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8166 if (POSIXCC_NOTYET(c)) {
830247a4 8167 /* adjust RExC_parse so the error shows after
b45f050a 8168 the class closes */
9a86a77b 8169 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8170 NOOP;
b45f050a
JF
8171 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8172 }
b8c5462f
JH
8173 }
8174 }
620e46c5
JH
8175}
8176
0399b215 8177/* No locale test */
e1d1eefb
YO
8178#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8179ANYOF_##NAME: \
8180 for (value = 0; value < 256; value++) \
8181 if (TEST) \
2283d326 8182 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8183 yesno = '+'; \
8184 what = WORD; \
8185 break; \
8186case ANYOF_N##NAME: \
8187 for (value = 0; value < 256; value++) \
8188 if (!TEST) \
2283d326 8189 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8190 yesno = '!'; \
8191 what = WORD; \
8192 break
89836f1f 8193
a12cf05f
KW
8194/* Like the above, but there are differences if we are in uni-8-bit or not, so
8195 * there are two tests passed in, to use depending on that. There aren't any
8196 * cases where the label is different from the name, so no need for that
8197 * parameter */
0399b215 8198#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
a12cf05f
KW
8199ANYOF_##NAME: \
8200 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8201 else if (UNI_SEMANTICS) { \
8202 for (value = 0; value < 256; value++) { \
2283d326
KW
8203 if (TEST_8) stored += \
8204 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8205 } \
8206 } \
8207 else { \
3c8eea2b 8208 for (value = 0; value < 128; value++) { \
2283d326 8209 if (TEST_7) stored += \
3c8eea2b 8210 S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
a12cf05f
KW
8211 } \
8212 } \
8213 yesno = '+'; \
8214 what = WORD; \
8215 break; \
8216case ANYOF_N##NAME: \
8217 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8218 else if (UNI_SEMANTICS) { \
8219 for (value = 0; value < 256; value++) { \
2283d326
KW
8220 if (! TEST_8) stored += \
8221 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8222 } \
8223 } \
8224 else { \
3c8eea2b 8225 for (value = 0; value < 128; value++) { \
2283d326
KW
8226 if (! TEST_7) stored += \
8227 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f 8228 } \
3c8eea2b
KW
8229 for (value = 128; value < 256; value++) { \
8230 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8231 } \
a12cf05f
KW
8232 } \
8233 yesno = '!'; \
8234 what = WORD; \
8235 break
8236
da7fcca4
YO
8237/*
8238 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8239 so that it is possible to override the option here without having to
8240 rebuild the entire core. as we are required to do if we change regcomp.h
8241 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8242*/
8243#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8244#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8245#endif
8246
8247#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8248#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8249#else
8250#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8251#endif
8252
2283d326
KW
8253STATIC U8
8254S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8255{
8256
8257 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8258 * Locale folding is done at run-time, so this function should not be
8259 * called for nodes that are for locales.
8260 *
8261 * This function simply sets the bit corresponding to the fold of the input
8262 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8263 * 'F' is 'f'.
8264 *
8265 * It also sets any necessary flags, and returns the number of bits that
8266 * actually changed from 0 to 1 */
8267
8268 U8 stored = 0;
2283d326
KW
8269 U8 fold;
8270
8271 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8272 : PL_fold[value];
8273
8274 /* It assumes the bit for 'value' has already been set */
8275 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8276 ANYOF_BITMAP_SET(node, fold);
8277 stored++;
8278 }
8279
8280 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8281 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8282 * semantics are called for */
8283 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8284 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8285 }
e2e75538 8286 else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
2283d326
KW
8287 || (! UNI_SEMANTICS
8288 && ! isASCII(value)
8289 && PL_fold_latin1[value] != value))
8290 { /* A character that has a fold outside of Latin1 matches outside the
8291 bitmap, but only when the target string is utf8. Similarly when we
8292 don't have unicode semantics for the above ASCII Latin-1 characters,
8293 and they have a fold, they should match if the target is utf8, and
8294 not otherwise */
8295 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8296 }
8297
8298 return stored;
8299}
8300
8301
8302PERL_STATIC_INLINE U8
8303S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
8304{
8305 /* This inline function sets a bit in the bitmap if not already set, and if
8306 * appropriate, its fold, returning the number of bits that actually
8307 * changed from 0 to 1 */
8308
8309 U8 stored;
8310
8311 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8312 return 0;
8313 }
8314
8315 ANYOF_BITMAP_SET(node, value);
8316 stored = 1;
8317
8318 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8319 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8320 }
8321
8322 return stored;
8323}
8324
7f6f358c
YO
8325/*
8326 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8327 matches the pattern or if the pattern matches a single char only and
8328 that char is < 256 and we are case insensitive then we produce an
8329 EXACT node instead.
7f6f358c 8330*/
89836f1f 8331
76e3520e 8332STATIC regnode *
3dab1dad 8333S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8334{
97aff369 8335 dVAR;
9a86a77b 8336 register UV nextvalue;
3568d838 8337 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8338 register IV range = 0;
e1d1eefb 8339 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8340 register regnode *ret;
ba210ebe 8341 STRLEN numlen;
ffc61ed2 8342 IV namedclass;
cbbf8932 8343 char *rangebegin = NULL;
936ed897 8344 bool need_class = 0;
c445ea15 8345 SV *listsv = NULL;
ffc61ed2 8346 UV n;
9e55ce06 8347 bool optimize_invert = TRUE;
cbbf8932 8348 AV* unicode_alternate = NULL;
1b2d223b
JH
8349#ifdef EBCDIC
8350 UV literal_endpoint = 0;
8351#endif
2c63ecad 8352 UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
ffc61ed2 8353
3dab1dad 8354 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8355 case we need to change the emitted regop to an EXACT. */
07be1b83 8356 const char * orig_parse = RExC_parse;
72f13be8 8357 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8358
8359 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8360#ifndef DEBUGGING
8361 PERL_UNUSED_ARG(depth);
8362#endif
72f13be8 8363
3dab1dad 8364 DEBUG_PARSE("clas");
7f6f358c
YO
8365
8366 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8367 ret = reganode(pRExC_state, ANYOF, 0);
8368
8369 if (!SIZE_ONLY)
8370 ANYOF_FLAGS(ret) = 0;
8371
9a86a77b 8372 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8373 RExC_naughty++;
8374 RExC_parse++;
8375 if (!SIZE_ONLY)
8376 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8377 }
a0d0e21e 8378
73060fc4 8379 if (SIZE_ONLY) {
830247a4 8380 RExC_size += ANYOF_SKIP;
73060fc4
JH
8381 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8382 }
936ed897 8383 else {
830247a4 8384 RExC_emit += ANYOF_SKIP;
936ed897
IZ
8385 if (FOLD)
8386 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8387 if (LOC)
8388 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 8389 ANYOF_BITMAP_ZERO(ret);
396482e1 8390 listsv = newSVpvs("# comment\n");
a0d0e21e 8391 }
b8c5462f 8392
9a86a77b
JH
8393 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8394
b938889d 8395 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8396 checkposixcc(pRExC_state);
b8c5462f 8397
f064b6ad
HS
8398 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8399 if (UCHARAT(RExC_parse) == ']')
8400 goto charclassloop;
ffc61ed2 8401
fc8cd66c 8402parseit:
9a86a77b 8403 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8404
8405 charclassloop:
8406
8407 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8408
73b437c8 8409 if (!range)
830247a4 8410 rangebegin = RExC_parse;
ffc61ed2 8411 if (UTF) {
5e12f4fb 8412 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8413 RExC_end - RExC_parse,
9f7f3913 8414 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8415 RExC_parse += numlen;
8416 }
8417 else
8418 value = UCHARAT(RExC_parse++);
7f6f358c 8419
9a86a77b
JH
8420 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8421 if (value == '[' && POSIXCC(nextvalue))
830247a4 8422 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8423 else if (value == '\\') {
ffc61ed2 8424 if (UTF) {
5e12f4fb 8425 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8426 RExC_end - RExC_parse,
9f7f3913 8427 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8428 RExC_parse += numlen;
8429 }
8430 else
8431 value = UCHARAT(RExC_parse++);
470c3474 8432 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8433 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8434 * be a problem later if we want switch on Unicode.
8435 * A similar issue a little bit later when switching on
8436 * namedclass. --jhi */
ffc61ed2 8437 switch ((I32)value) {
b8c5462f
JH
8438 case 'w': namedclass = ANYOF_ALNUM; break;
8439 case 'W': namedclass = ANYOF_NALNUM; break;
8440 case 's': namedclass = ANYOF_SPACE; break;
8441 case 'S': namedclass = ANYOF_NSPACE; break;
8442 case 'd': namedclass = ANYOF_DIGIT; break;
8443 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8444 case 'v': namedclass = ANYOF_VERTWS; break;
8445 case 'V': namedclass = ANYOF_NVERTWS; break;
8446 case 'h': namedclass = ANYOF_HORIZWS; break;
8447 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8448 case 'N': /* Handle \N{NAME} in class */
8449 {
8450 /* We only pay attention to the first char of
8451 multichar strings being returned. I kinda wonder
8452 if this makes sense as it does change the behaviour
8453 from earlier versions, OTOH that behaviour was broken
8454 as well. */
8455 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8456 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8457 goto parseit;
8458 }
8459 value= v;
8460 }
8461 break;
ffc61ed2
JH
8462 case 'p':
8463 case 'P':
3dab1dad
YO
8464 {
8465 char *e;
af6f566e 8466 if (RExC_parse >= RExC_end)
2a4859cd 8467 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8468 if (*RExC_parse == '{') {
1df70142 8469 const U8 c = (U8)value;
ffc61ed2
JH
8470 e = strchr(RExC_parse++, '}');
8471 if (!e)
0da60cf5 8472 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8473 while (isSPACE(UCHARAT(RExC_parse)))
8474 RExC_parse++;
8475 if (e == RExC_parse)
0da60cf5 8476 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8477 n = e - RExC_parse;
ab13f0c7
JH
8478 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8479 n--;
ffc61ed2
JH
8480 }
8481 else {
8482 e = RExC_parse;
8483 n = 1;
8484 }
8485 if (!SIZE_ONLY) {
ab13f0c7
JH
8486 if (UCHARAT(RExC_parse) == '^') {
8487 RExC_parse++;
8488 n--;
8489 value = value == 'p' ? 'P' : 'p'; /* toggle */
8490 while (isSPACE(UCHARAT(RExC_parse))) {
8491 RExC_parse++;
8492 n--;
8493 }
8494 }
097eb12c
AL
8495 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8496 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8497 }
8498 RExC_parse = e + 1;
08fc12dd
KW
8499
8500 /* The \p could match something in the Latin1 range, hence
8501 * something that isn't utf8 */
3ff7ceb3 8502 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f81125e2 8503 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8504 }
f81125e2 8505 break;
b8c5462f
JH
8506 case 'n': value = '\n'; break;
8507 case 'r': value = '\r'; break;
8508 case 't': value = '\t'; break;
8509 case 'f': value = '\f'; break;
8510 case 'b': value = '\b'; break;
c7f1f016
NIS
8511 case 'e': value = ASCII_TO_NATIVE('\033');break;
8512 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8513 case 'o':
8514 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8515 {
8516 const char* error_msg;
8517 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8518 &value,
8519 &numlen,
454155d9
KW
8520 &error_msg,
8521 SIZE_ONLY);
8522 RExC_parse += numlen;
8523 if (! valid) {
8524 vFAIL(error_msg);
8525 }
f0a2b745 8526 }
f0a2b745
KW
8527 if (PL_encoding && value < 0x100) {
8528 goto recode_encoding;
8529 }
8530 break;
b8c5462f 8531 case 'x':
ffc61ed2 8532 if (*RExC_parse == '{') {
a4c04bdc
NC
8533 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8534 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8535 char * const e = strchr(RExC_parse++, '}');
b81d288d 8536 if (!e)
ffc61ed2 8537 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8538
8539 numlen = e - RExC_parse;
8540 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8541 RExC_parse = e + 1;
8542 }
8543 else {
a4c04bdc 8544 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8545 numlen = 2;
8546 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8547 RExC_parse += numlen;
8548 }
9e08bc66
TS
8549 if (PL_encoding && value < 0x100)
8550 goto recode_encoding;
b8c5462f
JH
8551 break;
8552 case 'c':
f9d13529 8553 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8554 break;
8555 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8556 case '5': case '6': case '7':
9e08bc66 8557 {
c99e91e9
KW
8558 /* Take 1-3 octal digits */
8559 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8560 numlen = 3;
8561 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8562 RExC_parse += numlen;
8563 if (PL_encoding && value < 0x100)
8564 goto recode_encoding;
8565 break;
8566 }
8567 recode_encoding:
8568 {
8569 SV* enc = PL_encoding;
8570 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8571 if (!enc && SIZE_ONLY)
8572 ckWARNreg(RExC_parse,
8573 "Invalid escape in the specified encoding");
9e08bc66
TS
8574 break;
8575 }
1028017a 8576 default:
c99e91e9
KW
8577 /* Allow \_ to not give an error */
8578 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8579 ckWARN2reg(RExC_parse,
8580 "Unrecognized escape \\%c in character class passed through",
8581 (int)value);
c99e91e9 8582 }
1028017a 8583 break;
b8c5462f 8584 }
ffc61ed2 8585 } /* end of \blah */
1b2d223b
JH
8586#ifdef EBCDIC
8587 else
8588 literal_endpoint++;
8589#endif
ffc61ed2
JH
8590
8591 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8592
2c63ecad
KW
8593 /* What matches in a locale is not known until runtime, so need to
8594 * (one time per class) allocate extra space to pass to regexec.
8595 * The space will contain a bit for each named class that is to be
8596 * matched against. This isn't needed for \p{} and pseudo-classes,
8597 * as they are not affected by locale, and hence are dealt with
8598 * separately */
8599 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8600 need_class = 1;
8601 if (SIZE_ONLY) {
8602 RExC_size += ANYOF_CLASS_ADD_SKIP;
8603 }
8604 else {
8605 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8606 ANYOF_CLASS_ZERO(ret);
8607 }
9051cfd9 8608 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 8609 }
ffc61ed2
JH
8610
8611 /* a bad range like a-\d, a-[:digit:] ? */
8612 if (range) {
73b437c8 8613 if (!SIZE_ONLY) {
668c081a
NC
8614 const int w =
8615 RExC_parse >= rangebegin ?
8616 RExC_parse - rangebegin : 0;
8617 ckWARN4reg(RExC_parse,
b45f050a 8618 "False [] range \"%*.*s\"",
097eb12c 8619 w, w, rangebegin);
668c081a 8620
3568d838 8621 if (prevvalue < 256) {
2283d326
KW
8622 stored +=
8623 S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8624 stored +=
8625 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
ffc61ed2
JH
8626 }
8627 else {
ef87b810 8628 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 8629 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8630 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8631 }
b8c5462f 8632 }
ffc61ed2
JH
8633
8634 range = 0; /* this was not a true range */
73b437c8 8635 }
ffc61ed2 8636
89836f1f
YO
8637
8638
73b437c8 8639 if (!SIZE_ONLY) {
c49a72a9
NC
8640 const char *what = NULL;
8641 char yesno = 0;
8642
3568d838
JH
8643 if (namedclass > OOB_NAMEDCLASS)
8644 optimize_invert = FALSE;
e2962f66
JH
8645 /* Possible truncation here but in some 64-bit environments
8646 * the compiler gets heartburn about switch on 64-bit values.
8647 * A similar issue a little earlier when switching on value.
98f323fa 8648 * --jhi */
e2962f66 8649 switch ((I32)namedclass) {
da7fcca4 8650
0399b215
KW
8651 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8652 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8653 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8654 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8655 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8656 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8657 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8658 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8659 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8660 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8661#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8662 /* \s, \w match all unicode if utf8. */
0399b215
KW
8663 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8664 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8665#else
a12cf05f 8666 /* \s, \w match ascii and locale only */
0399b215
KW
8667 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8668 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8669#endif
0399b215 8670 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8671 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8672 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8673 case ANYOF_ASCII:
8674 if (LOC)
936ed897 8675 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8676 else {
1ba5c669 8677 for (value = 0; value < 128; value++)
2283d326 8678 stored +=
3c8eea2b 8679 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
73b437c8 8680 }
c49a72a9
NC
8681 yesno = '+';
8682 what = "ASCII";
73b437c8
JH
8683 break;
8684 case ANYOF_NASCII:
8685 if (LOC)
936ed897 8686 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8687 else {
1ba5c669 8688 for (value = 128; value < 256; value++)
2283d326 8689 stored +=
3c8eea2b 8690 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
73b437c8 8691 }
c49a72a9
NC
8692 yesno = '!';
8693 what = "ASCII";
89836f1f 8694 break;
ffc61ed2
JH
8695 case ANYOF_DIGIT:
8696 if (LOC)
8697 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8698 else {
8699 /* consecutive digits assumed */
8700 for (value = '0'; value <= '9'; value++)
2283d326
KW
8701 stored +=
8702 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8703 }
c49a72a9 8704 yesno = '+';
da7fcca4 8705 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8706 break;
8707 case ANYOF_NDIGIT:
8708 if (LOC)
8709 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8710 else {
8711 /* consecutive digits assumed */
8712 for (value = 0; value < '0'; value++)
2283d326
KW
8713 stored +=
8714 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8715 for (value = '9' + 1; value < 256; value++)
2283d326
KW
8716 stored +=
8717 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8718 }
c49a72a9 8719 yesno = '!';
da7fcca4 8720 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8721 break;
f81125e2
JP
8722 case ANYOF_MAX:
8723 /* this is to handle \p and \P */
8724 break;
73b437c8 8725 default:
b45f050a 8726 vFAIL("Invalid [::] class");
73b437c8 8727 break;
b8c5462f 8728 }
c49a72a9
NC
8729 if (what) {
8730 /* Strings such as "+utf8::isWord\n" */
8731 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8732 }
2c63ecad 8733 stored+=2; /* can't optimize this class */
ef87b810
KW
8734
8735 /* All but ASCII can match Unicode characters, but all the ones
8736 * that aren't in utf8 are in the bitmap */
8737 if (namedclass != ANYOF_ASCII) {
8738 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8739 }
73b437c8 8740 continue;
a0d0e21e 8741 }
ffc61ed2
JH
8742 } /* end of namedclass \blah */
8743
a0d0e21e 8744 if (range) {
eb160463 8745 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8746 const int w = RExC_parse - rangebegin;
8747 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8748 range = 0; /* not a valid range */
73b437c8 8749 }
a0d0e21e
LW
8750 }
8751 else {
3568d838 8752 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8753 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8754 RExC_parse[1] != ']') {
8755 RExC_parse++;
ffc61ed2
JH
8756
8757 /* a bad range like \w-, [:word:]- ? */
8758 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8759 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8760 const int w =
afd78fd5
JH
8761 RExC_parse >= rangebegin ?
8762 RExC_parse - rangebegin : 0;
830247a4 8763 vWARN4(RExC_parse,
b45f050a 8764 "False [] range \"%*.*s\"",
097eb12c 8765 w, w, rangebegin);
afd78fd5 8766 }
73b437c8 8767 if (!SIZE_ONLY)
2283d326
KW
8768 stored +=
8769 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
73b437c8 8770 } else
ffc61ed2
JH
8771 range = 1; /* yeah, it's a range! */
8772 continue; /* but do it the next time */
a0d0e21e 8773 }
a687059c 8774 }
ffc61ed2 8775
93733859 8776 /* now is the next time */
07be1b83 8777 /*stored += (value - prevvalue + 1);*/
ae5c130c 8778 if (!SIZE_ONLY) {
3568d838 8779 if (prevvalue < 256) {
1df70142 8780 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8781 IV i;
3568d838 8782#ifdef EBCDIC
1b2d223b
JH
8783 /* In EBCDIC [\x89-\x91] should include
8784 * the \x8e but [i-j] should not. */
8785 if (literal_endpoint == 2 &&
8786 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8787 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8788 {
3568d838
JH
8789 if (isLOWER(prevvalue)) {
8790 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8791 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8792 stored +=
8793 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8794 }
ffc61ed2 8795 } else {
3568d838 8796 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8797 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8798 stored +=
8799 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8800 }
ffc61ed2 8801 }
8ada0baa 8802 }
ffc61ed2 8803 else
8ada0baa 8804#endif
07be1b83 8805 for (i = prevvalue; i <= ceilvalue; i++) {
79e7df45 8806 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
07be1b83 8807 }
3568d838 8808 }
a5961de5 8809 if (value > 255 || UTF) {
1df70142
AL
8810 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8811 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8812 stored+=2; /* can't optimize this class */
ef87b810
KW
8813
8814 /* If the code point requires utf8 to represent, and we are not
8815 * folding, it can't match unless the target is in utf8. Only
8816 * a few code points above 255 fold to below it, so XXX an
8817 * optimization would be to know which ones and set the flag
8818 * appropriately. */
8819 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8820 ? ANYOF_NONBITMAP
8821 : ANYOF_UTF8;
b08decb7 8822 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8823 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8824 prevnatvalue, natvalue);
8825 }
8826 else if (prevnatvalue == natvalue) {
8827 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8828 if (FOLD) {
89ebb4a3 8829 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8830 STRLEN foldlen;
1df70142 8831 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8832
e294cc5d
JH
8833#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8834 if (RExC_precomp[0] == ':' &&
8835 RExC_precomp[1] == '[' &&
8836 (f == 0xDF || f == 0x92)) {
8837 f = NATIVE_TO_UNI(f);
8838 }
8839#endif
c840d2a2
JH
8840 /* If folding and foldable and a single
8841 * character, insert also the folded version
8842 * to the charclass. */
9e55ce06 8843 if (f != value) {
e294cc5d
JH
8844#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8845 if ((RExC_precomp[0] == ':' &&
8846 RExC_precomp[1] == '[' &&
8847 (f == 0xA2 &&
8848 (value == 0xFB05 || value == 0xFB06))) ?
8849 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8850 foldlen == (STRLEN)UNISKIP(f) )
8851#else
eb160463 8852 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8853#endif
9e55ce06
JH
8854 Perl_sv_catpvf(aTHX_ listsv,
8855 "%04"UVxf"\n", f);
8856 else {
8857 /* Any multicharacter foldings
8858 * require the following transform:
8859 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8860 * where E folds into "pq" and F folds
8861 * into "rst", all other characters
8862 * fold to single characters. We save
8863 * away these multicharacter foldings,
8864 * to be later saved as part of the
8865 * additional "s" data. */
8866 SV *sv;
8867
8868 if (!unicode_alternate)
8869 unicode_alternate = newAV();
740cce10
NC
8870 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8871 TRUE);
9e55ce06
JH
8872 av_push(unicode_alternate, sv);
8873 }
8874 }
254ba52a 8875
60a8b682
JH
8876 /* If folding and the value is one of the Greek
8877 * sigmas insert a few more sigmas to make the
8878 * folding rules of the sigmas to work right.
8879 * Note that not all the possible combinations
8880 * are handled here: some of them are handled
9e55ce06
JH
8881 * by the standard folding rules, and some of
8882 * them (literal or EXACTF cases) are handled
8883 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8884 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8885 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8886 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8887 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8888 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8889 }
8890 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8891 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8892 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8893 }
8894 }
ffc61ed2 8895 }
1b2d223b
JH
8896#ifdef EBCDIC
8897 literal_endpoint = 0;
8898#endif
8ada0baa 8899 }
ffc61ed2
JH
8900
8901 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8902 }
ffc61ed2 8903
ffc61ed2 8904
7f6f358c
YO
8905
8906 if (SIZE_ONLY)
8907 return ret;
8908 /****** !SIZE_ONLY AFTER HERE *********/
8909
3b57cd43 8910 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8911 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8912 ) {
1d5eda9d
KW
8913 /* optimize single char class to an EXACT node but *only* when its not
8914 * a UTF/high char. Note that the information needed to decide to do
8915 * this optimization is not currently available until the 2nd pass, and
8916 * that the actually used EXACT node takes less space than the
8917 * calculated ANYOF node, and hence the amount of space calculated in
8918 * the first pass is larger than actually used. Currently we don't
8919 * keep track of enough information to do this for nodes which contain
8920 * matches outside the bitmap */
07be1b83
YO
8921 const char * cur_parse= RExC_parse;
8922 RExC_emit = (regnode *)orig_emit;
8923 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8924 ret = reg_node(pRExC_state,
8925 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8926 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8927 *STRING(ret)= (char)value;
8928 STR_LEN(ret)= 1;
8929 RExC_emit += STR_SZ(1);
ef8d46e8 8930 SvREFCNT_dec(listsv);
7f6f358c
YO
8931 return ret;
8932 }
ae5c130c 8933 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8934 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8935 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8936 ) {
a0ed51b3 8937 for (value = 0; value < 256; ++value) {
936ed897 8938 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8939 UV fold = PL_fold[value];
ffc61ed2
JH
8940
8941 if (fold != value)
8942 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8943 }
8944 }
936ed897 8945 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8946 }
ffc61ed2 8947
ae5c130c 8948 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8949 if (optimize_invert &&
ffc61ed2
JH
8950 /* If the only flag is inversion. */
8951 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8952 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8953 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8954 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8955 }
7f6f358c 8956 {
097eb12c 8957 AV * const av = newAV();
ffc61ed2 8958 SV *rv;
9e55ce06 8959 /* The 0th element stores the character class description
6a0407ee 8960 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8961 * to initialize the appropriate swash (which gets stored in
8962 * the 1st element), and also useful for dumping the regnode.
8963 * The 2nd element stores the multicharacter foldings,
6a0407ee 8964 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8965 av_store(av, 0, listsv);
8966 av_store(av, 1, NULL);
ad64d0ec
NC
8967 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8968 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8969 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8970 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8971 ARG_SET(ret, n);
a0ed51b3 8972 }
a0ed51b3
LW
8973 return ret;
8974}
89836f1f
YO
8975#undef _C_C_T_
8976
a0ed51b3 8977
bcdf7404
YO
8978/* reg_skipcomment()
8979
8980 Absorbs an /x style # comments from the input stream.
8981 Returns true if there is more text remaining in the stream.
8982 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8983 terminates the pattern without including a newline.
8984
8985 Note its the callers responsibility to ensure that we are
8986 actually in /x mode
8987
8988*/
8989
8990STATIC bool
8991S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8992{
8993 bool ended = 0;
7918f24d
NC
8994
8995 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8996
bcdf7404
YO
8997 while (RExC_parse < RExC_end)
8998 if (*RExC_parse++ == '\n') {
8999 ended = 1;
9000 break;
9001 }
9002 if (!ended) {
9003 /* we ran off the end of the pattern without ending
9004 the comment, so we have to add an \n when wrapping */
9005 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9006 return 0;
9007 } else
9008 return 1;
9009}
9010
9011/* nextchar()
9012
3b753521 9013 Advances the parse position, and optionally absorbs
bcdf7404
YO
9014 "whitespace" from the inputstream.
9015
9016 Without /x "whitespace" means (?#...) style comments only,
9017 with /x this means (?#...) and # comments and whitespace proper.
9018
9019 Returns the RExC_parse point from BEFORE the scan occurs.
9020
9021 This is the /x friendly way of saying RExC_parse++.
9022*/
9023
76e3520e 9024STATIC char*
830247a4 9025S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 9026{
097eb12c 9027 char* const retval = RExC_parse++;
a0d0e21e 9028
7918f24d
NC
9029 PERL_ARGS_ASSERT_NEXTCHAR;
9030
4633a7c4 9031 for (;;) {
830247a4
IZ
9032 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9033 RExC_parse[2] == '#') {
e994fd66
AE
9034 while (*RExC_parse != ')') {
9035 if (RExC_parse == RExC_end)
9036 FAIL("Sequence (?#... not terminated");
830247a4 9037 RExC_parse++;
e994fd66 9038 }
830247a4 9039 RExC_parse++;
4633a7c4
LW
9040 continue;
9041 }
bbe252da 9042 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
9043 if (isSPACE(*RExC_parse)) {
9044 RExC_parse++;
748a9306
LW
9045 continue;
9046 }
830247a4 9047 else if (*RExC_parse == '#') {
bcdf7404
YO
9048 if ( reg_skipcomment( pRExC_state ) )
9049 continue;
748a9306 9050 }
748a9306 9051 }
4633a7c4 9052 return retval;
a0d0e21e 9053 }
a687059c
LW
9054}
9055
9056/*
c277df42 9057- reg_node - emit a node
a0d0e21e 9058*/
76e3520e 9059STATIC regnode * /* Location. */
830247a4 9060S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 9061{
97aff369 9062 dVAR;
c277df42 9063 register regnode *ptr;
504618e9 9064 regnode * const ret = RExC_emit;
07be1b83 9065 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9066
7918f24d
NC
9067 PERL_ARGS_ASSERT_REG_NODE;
9068
c277df42 9069 if (SIZE_ONLY) {
830247a4
IZ
9070 SIZE_ALIGN(RExC_size);
9071 RExC_size += 1;
a0d0e21e
LW
9072 return(ret);
9073 }
3b57cd43
YO
9074 if (RExC_emit >= RExC_emit_bound)
9075 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9076
c277df42 9077 NODE_ALIGN_FILL(ret);
a0d0e21e 9078 ptr = ret;
c277df42 9079 FILL_ADVANCE_NODE(ptr, op);
7122b237 9080#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9081 if (RExC_offsets) { /* MJD */
07be1b83 9082 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 9083 "reg_node", __LINE__,
13d6edb4 9084 PL_reg_name[op],
07be1b83
YO
9085 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9086 ? "Overwriting end of array!\n" : "OK",
9087 (UV)(RExC_emit - RExC_emit_start),
9088 (UV)(RExC_parse - RExC_start),
9089 (UV)RExC_offsets[0]));
ccb2c380 9090 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 9091 }
7122b237 9092#endif
830247a4 9093 RExC_emit = ptr;
a0d0e21e 9094 return(ret);
a687059c
LW
9095}
9096
9097/*
a0d0e21e
LW
9098- reganode - emit a node with an argument
9099*/
76e3520e 9100STATIC regnode * /* Location. */
830247a4 9101S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 9102{
97aff369 9103 dVAR;
c277df42 9104 register regnode *ptr;
504618e9 9105 regnode * const ret = RExC_emit;
07be1b83 9106 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 9107
7918f24d
NC
9108 PERL_ARGS_ASSERT_REGANODE;
9109
c277df42 9110 if (SIZE_ONLY) {
830247a4
IZ
9111 SIZE_ALIGN(RExC_size);
9112 RExC_size += 2;
6bda09f9
YO
9113 /*
9114 We can't do this:
9115
9116 assert(2==regarglen[op]+1);
9117
9118 Anything larger than this has to allocate the extra amount.
9119 If we changed this to be:
9120
9121 RExC_size += (1 + regarglen[op]);
9122
9123 then it wouldn't matter. Its not clear what side effect
9124 might come from that so its not done so far.
9125 -- dmq
9126 */
a0d0e21e
LW
9127 return(ret);
9128 }
3b57cd43
YO
9129 if (RExC_emit >= RExC_emit_bound)
9130 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9131
c277df42 9132 NODE_ALIGN_FILL(ret);
a0d0e21e 9133 ptr = ret;
c277df42 9134 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 9135#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9136 if (RExC_offsets) { /* MJD */
07be1b83 9137 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9138 "reganode",
ccb2c380 9139 __LINE__,
13d6edb4 9140 PL_reg_name[op],
07be1b83 9141 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 9142 "Overwriting end of array!\n" : "OK",
07be1b83
YO
9143 (UV)(RExC_emit - RExC_emit_start),
9144 (UV)(RExC_parse - RExC_start),
9145 (UV)RExC_offsets[0]));
ccb2c380 9146 Set_Cur_Node_Offset;
fac92740 9147 }
7122b237 9148#endif
830247a4 9149 RExC_emit = ptr;
a0d0e21e 9150 return(ret);
fe14fcc3
LW
9151}
9152
9153/*
cd439c50 9154- reguni - emit (if appropriate) a Unicode character
a0ed51b3 9155*/
71207a34
AL
9156STATIC STRLEN
9157S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 9158{
97aff369 9159 dVAR;
7918f24d
NC
9160
9161 PERL_ARGS_ASSERT_REGUNI;
9162
71207a34 9163 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
9164}
9165
9166/*
a0d0e21e
LW
9167- reginsert - insert an operator in front of already-emitted operand
9168*
9169* Means relocating the operand.
9170*/
76e3520e 9171STATIC void
6bda09f9 9172S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 9173{
97aff369 9174 dVAR;
c277df42
IZ
9175 register regnode *src;
9176 register regnode *dst;
9177 register regnode *place;
504618e9 9178 const int offset = regarglen[(U8)op];
6bda09f9 9179 const int size = NODE_STEP_REGNODE + offset;
07be1b83 9180 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9181
9182 PERL_ARGS_ASSERT_REGINSERT;
def51078 9183 PERL_UNUSED_ARG(depth);
22c35a8c 9184/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 9185 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9186 if (SIZE_ONLY) {
6bda09f9 9187 RExC_size += size;
a0d0e21e
LW
9188 return;
9189 }
a687059c 9190
830247a4 9191 src = RExC_emit;
6bda09f9 9192 RExC_emit += size;
830247a4 9193 dst = RExC_emit;
40d049e4 9194 if (RExC_open_parens) {
6bda09f9 9195 int paren;
3b57cd43 9196 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9197 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9198 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9199 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9200 RExC_open_parens[paren] += size;
9201 } else {
3b57cd43 9202 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9203 }
9204 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9205 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9206 RExC_close_parens[paren] += size;
9207 } else {
3b57cd43 9208 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9209 }
9210 }
6bda09f9 9211 }
40d049e4 9212
fac92740 9213 while (src > opnd) {
c277df42 9214 StructCopy(--src, --dst, regnode);
7122b237 9215#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9216 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9217 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9218 "reg_insert",
ccb2c380 9219 __LINE__,
13d6edb4 9220 PL_reg_name[op],
07be1b83
YO
9221 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9222 ? "Overwriting end of array!\n" : "OK",
9223 (UV)(src - RExC_emit_start),
9224 (UV)(dst - RExC_emit_start),
9225 (UV)RExC_offsets[0]));
ccb2c380
MP
9226 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9227 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9228 }
7122b237 9229#endif
fac92740
MJD
9230 }
9231
a0d0e21e
LW
9232
9233 place = opnd; /* Op node, where operand used to be. */
7122b237 9234#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9235 if (RExC_offsets) { /* MJD */
07be1b83 9236 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9237 "reginsert",
ccb2c380 9238 __LINE__,
13d6edb4 9239 PL_reg_name[op],
07be1b83 9240 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9241 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9242 (UV)(place - RExC_emit_start),
9243 (UV)(RExC_parse - RExC_start),
786e8c11 9244 (UV)RExC_offsets[0]));
ccb2c380 9245 Set_Node_Offset(place, RExC_parse);
45948336 9246 Set_Node_Length(place, 1);
fac92740 9247 }
7122b237 9248#endif
c277df42
IZ
9249 src = NEXTOPER(place);
9250 FILL_ADVANCE_NODE(place, op);
9251 Zero(src, offset, regnode);
a687059c
LW
9252}
9253
9254/*
c277df42 9255- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9256- SEE ALSO: regtail_study
a0d0e21e 9257*/
097eb12c 9258/* TODO: All three parms should be const */
76e3520e 9259STATIC void
3dab1dad 9260S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9261{
97aff369 9262 dVAR;
c277df42 9263 register regnode *scan;
72f13be8 9264 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9265
9266 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9267#ifndef DEBUGGING
9268 PERL_UNUSED_ARG(depth);
9269#endif
a0d0e21e 9270
c277df42 9271 if (SIZE_ONLY)
a0d0e21e
LW
9272 return;
9273
9274 /* Find last node. */
9275 scan = p;
9276 for (;;) {
504618e9 9277 regnode * const temp = regnext(scan);
3dab1dad
YO
9278 DEBUG_PARSE_r({
9279 SV * const mysv=sv_newmortal();
9280 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9281 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9282 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9283 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9284 (temp == NULL ? "->" : ""),
13d6edb4 9285 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9286 );
3dab1dad
YO
9287 });
9288 if (temp == NULL)
9289 break;
9290 scan = temp;
9291 }
9292
9293 if (reg_off_by_arg[OP(scan)]) {
9294 ARG_SET(scan, val - scan);
9295 }
9296 else {
9297 NEXT_OFF(scan) = val - scan;
9298 }
9299}
9300
07be1b83 9301#ifdef DEBUGGING
3dab1dad
YO
9302/*
9303- regtail_study - set the next-pointer at the end of a node chain of p to val.
9304- Look for optimizable sequences at the same time.
9305- currently only looks for EXACT chains.
07be1b83
YO
9306
9307This is expermental code. The idea is to use this routine to perform
9308in place optimizations on branches and groups as they are constructed,
9309with the long term intention of removing optimization from study_chunk so
9310that it is purely analytical.
9311
9312Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9313to control which is which.
9314
3dab1dad
YO
9315*/
9316/* TODO: All four parms should be const */
07be1b83 9317
3dab1dad
YO
9318STATIC U8
9319S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9320{
9321 dVAR;
9322 register regnode *scan;
07be1b83
YO
9323 U8 exact = PSEUDO;
9324#ifdef EXPERIMENTAL_INPLACESCAN
9325 I32 min = 0;
9326#endif
3dab1dad
YO
9327 GET_RE_DEBUG_FLAGS_DECL;
9328
7918f24d
NC
9329 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9330
07be1b83 9331
3dab1dad
YO
9332 if (SIZE_ONLY)
9333 return exact;
9334
9335 /* Find last node. */
9336
9337 scan = p;
9338 for (;;) {
9339 regnode * const temp = regnext(scan);
07be1b83
YO
9340#ifdef EXPERIMENTAL_INPLACESCAN
9341 if (PL_regkind[OP(scan)] == EXACT)
9342 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9343 return EXACT;
9344#endif
3dab1dad
YO
9345 if ( exact ) {
9346 switch (OP(scan)) {
9347 case EXACT:
9348 case EXACTF:
2c2b7f86 9349 case EXACTFU:
3dab1dad
YO
9350 case EXACTFL:
9351 if( exact == PSEUDO )
9352 exact= OP(scan);
07be1b83
YO
9353 else if ( exact != OP(scan) )
9354 exact= 0;
3dab1dad
YO
9355 case NOTHING:
9356 break;
9357 default:
9358 exact= 0;
9359 }
9360 }
9361 DEBUG_PARSE_r({
9362 SV * const mysv=sv_newmortal();
9363 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9364 regprop(RExC_rx, mysv, scan);
eaf3ca90 9365 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9366 SvPV_nolen_const(mysv),
eaf3ca90 9367 REG_NODE_NUM(scan),
13d6edb4 9368 PL_reg_name[exact]);
3dab1dad 9369 });
a0d0e21e
LW
9370 if (temp == NULL)
9371 break;
9372 scan = temp;
9373 }
07be1b83
YO
9374 DEBUG_PARSE_r({
9375 SV * const mysv_val=sv_newmortal();
9376 DEBUG_PARSE_MSG("");
9377 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9378 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9379 SvPV_nolen_const(mysv_val),
9380 (IV)REG_NODE_NUM(val),
9381 (IV)(val - scan)
07be1b83
YO
9382 );
9383 });
c277df42
IZ
9384 if (reg_off_by_arg[OP(scan)]) {
9385 ARG_SET(scan, val - scan);
a0ed51b3
LW
9386 }
9387 else {
c277df42
IZ
9388 NEXT_OFF(scan) = val - scan;
9389 }
3dab1dad
YO
9390
9391 return exact;
a687059c 9392}
07be1b83 9393#endif
a687059c
LW
9394
9395/*
fd181c75 9396 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9397 */
f7819f85 9398#ifdef DEBUGGING
c33269f7 9399static void
7918f24d
NC
9400S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9401{
f7819f85
A
9402 int bit;
9403 int set=0;
7918f24d 9404
f7819f85
A
9405 for (bit=0; bit<32; bit++) {
9406 if (flags & (1<<bit)) {
9407 if (!set++ && lead)
9408 PerlIO_printf(Perl_debug_log, "%s",lead);
9409 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9410 }
9411 }
9412 if (lead) {
9413 if (set)
9414 PerlIO_printf(Perl_debug_log, "\n");
9415 else
9416 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9417 }
9418}
9419#endif
9420
a687059c 9421void
097eb12c 9422Perl_regdump(pTHX_ const regexp *r)
a687059c 9423{
35ff7856 9424#ifdef DEBUGGING
97aff369 9425 dVAR;
c445ea15 9426 SV * const sv = sv_newmortal();
ab3bbdeb 9427 SV *dsv= sv_newmortal();
f8fc2ecf 9428 RXi_GET_DECL(r,ri);
f7819f85 9429 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9430
7918f24d
NC
9431 PERL_ARGS_ASSERT_REGDUMP;
9432
f8fc2ecf 9433 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9434
9435 /* Header fields of interest. */
ab3bbdeb
YO
9436 if (r->anchored_substr) {
9437 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9438 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9439 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9440 "anchored %s%s at %"IVdf" ",
9441 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9442 (IV)r->anchored_offset);
ab3bbdeb
YO
9443 } else if (r->anchored_utf8) {
9444 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9445 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9446 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9447 "anchored utf8 %s%s at %"IVdf" ",
9448 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9449 (IV)r->anchored_offset);
ab3bbdeb
YO
9450 }
9451 if (r->float_substr) {
9452 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9453 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9454 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9455 "floating %s%s at %"IVdf"..%"UVuf" ",
9456 s, RE_SV_TAIL(r->float_substr),
7b0972df 9457 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9458 } else if (r->float_utf8) {
9459 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9460 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9461 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9462 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9463 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9464 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9465 }
33b8afdf 9466 if (r->check_substr || r->check_utf8)
b81d288d 9467 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9468 (const char *)
9469 (r->check_substr == r->float_substr
9470 && r->check_utf8 == r->float_utf8
9471 ? "(checking floating" : "(checking anchored"));
bbe252da 9472 if (r->extflags & RXf_NOSCAN)
c277df42 9473 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9474 if (r->extflags & RXf_CHECK_ALL)
c277df42 9475 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9476 if (r->check_substr || r->check_utf8)
c277df42
IZ
9477 PerlIO_printf(Perl_debug_log, ") ");
9478
f8fc2ecf
YO
9479 if (ri->regstclass) {
9480 regprop(r, sv, ri->regstclass);
1de06328 9481 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9482 }
bbe252da 9483 if (r->extflags & RXf_ANCH) {
774d564b 9484 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9485 if (r->extflags & RXf_ANCH_BOL)
774d564b 9486 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9487 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9488 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9489 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9490 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9491 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9492 PerlIO_printf(Perl_debug_log, "(GPOS)");
9493 PerlIO_putc(Perl_debug_log, ' ');
9494 }
bbe252da 9495 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9496 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9497 if (r->intflags & PREGf_SKIP)
760ac839 9498 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9499 if (r->intflags & PREGf_IMPLICIT)
760ac839 9500 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9501 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9502 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9503 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9504 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9505 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9506#else
7918f24d 9507 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9508 PERL_UNUSED_CONTEXT;
65e66c80 9509 PERL_UNUSED_ARG(r);
17c3b450 9510#endif /* DEBUGGING */
a687059c
LW
9511}
9512
9513/*
a0d0e21e
LW
9514- regprop - printable representation of opcode
9515*/
3339dfd8
YO
9516#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9517STMT_START { \
9518 if (do_sep) { \
9519 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9520 if (flags & ANYOF_INVERT) \
9521 /*make sure the invert info is in each */ \
9522 sv_catpvs(sv, "^"); \
9523 do_sep = 0; \
9524 } \
9525} STMT_END
9526
46fc3d4c 9527void
32fc9b6a 9528Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9529{
35ff7856 9530#ifdef DEBUGGING
97aff369 9531 dVAR;
9b155405 9532 register int k;
f8fc2ecf 9533 RXi_GET_DECL(prog,progi);
1de06328 9534 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9535
7918f24d 9536 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9537
76f68e9b 9538 sv_setpvs(sv, "");
8aa23a47 9539
03363afd 9540 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9541 /* It would be nice to FAIL() here, but this may be called from
9542 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9543 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9544 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9545
3dab1dad 9546 k = PL_regkind[OP(o)];
9b155405 9547
2a782b5b 9548 if (k == EXACT) {
f92a2122 9549 sv_catpvs(sv, " ");
ab3bbdeb
YO
9550 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9551 * is a crude hack but it may be the best for now since
9552 * we have no flag "this EXACTish node was UTF-8"
9553 * --jhi */
f92a2122
NC
9554 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9555 PERL_PV_ESCAPE_UNI_DETECT |
9556 PERL_PV_PRETTY_ELLIPSES |
9557 PERL_PV_PRETTY_LTGT |
9558 PERL_PV_PRETTY_NOCLEAR
9559 );
bb263b4e 9560 } else if (k == TRIE) {
3dab1dad 9561 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9562 * progi->data isn't available here */
1de06328 9563 const char op = OP(o);
647f639f 9564 const U32 n = ARG(o);
1de06328 9565 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9566 (reg_ac_data *)progi->data->data[n] :
1de06328 9567 NULL;
3251b653
NC
9568 const reg_trie_data * const trie
9569 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9570
13d6edb4 9571 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9572 DEBUG_TRIE_COMPILE_r(
9573 Perl_sv_catpvf(aTHX_ sv,
9574 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9575 (UV)trie->startstate,
1e2e3d02 9576 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9577 (UV)trie->wordcount,
9578 (UV)trie->minlen,
9579 (UV)trie->maxlen,
9580 (UV)TRIE_CHARCOUNT(trie),
9581 (UV)trie->uniquecharcount
9582 )
9583 );
9584 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9585 int i;
9586 int rangestart = -1;
f46cb337 9587 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9588 sv_catpvs(sv, "[");
1de06328
YO
9589 for (i = 0; i <= 256; i++) {
9590 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9591 if (rangestart == -1)
9592 rangestart = i;
9593 } else if (rangestart != -1) {
9594 if (i <= rangestart + 3)
9595 for (; rangestart < i; rangestart++)
9596 put_byte(sv, rangestart);
9597 else {
9598 put_byte(sv, rangestart);
9599 sv_catpvs(sv, "-");
9600 put_byte(sv, i - 1);
9601 }
9602 rangestart = -1;
9603 }
9604 }
f3a2811a 9605 sv_catpvs(sv, "]");
1de06328
YO
9606 }
9607
a3621e74 9608 } else if (k == CURLY) {
cb434fcc 9609 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9610 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9611 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9612 }
2c2d71f5
JH
9613 else if (k == WHILEM && o->flags) /* Ordinal/of */
9614 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9615 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9616 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9617 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 9618 if ( k != REF || (OP(o) < NREF)) {
502c6561 9619 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9620 SV **name= av_fetch(list, ARG(o), 0 );
9621 if (name)
9622 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9623 }
9624 else {
502c6561 9625 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9626 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9627 I32 *nums=(I32*)SvPVX(sv_dat);
9628 SV **name= av_fetch(list, nums[0], 0 );
9629 I32 n;
9630 if (name) {
9631 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9632 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9633 (n ? "," : ""), (IV)nums[n]);
9634 }
9635 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9636 }
1f1031fe 9637 }
ee9b8eae 9638 }
1f1031fe 9639 } else if (k == GOSUB)
6bda09f9 9640 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9641 else if (k == VERB) {
9642 if (!o->flags)
9643 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9644 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9645 } else if (k == LOGICAL)
04ebc1ab 9646 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9647 else if (k == FOLDCHAR)
df44d732 9648 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9649 else if (k == ANYOF) {
9650 int i, rangestart = -1;
2d03de9c 9651 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9652 int do_sep = 0;
0bd48802
AL
9653
9654 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9655 static const char * const anyofs[] = {
653099ff
GS
9656 "\\w",
9657 "\\W",
9658 "\\s",
9659 "\\S",
9660 "\\d",
9661 "\\D",
9662 "[:alnum:]",
9663 "[:^alnum:]",
9664 "[:alpha:]",
9665 "[:^alpha:]",
9666 "[:ascii:]",
9667 "[:^ascii:]",
24d786f4
YO
9668 "[:cntrl:]",
9669 "[:^cntrl:]",
653099ff
GS
9670 "[:graph:]",
9671 "[:^graph:]",
9672 "[:lower:]",
9673 "[:^lower:]",
9674 "[:print:]",
9675 "[:^print:]",
9676 "[:punct:]",
9677 "[:^punct:]",
9678 "[:upper:]",
aaa51d5e 9679 "[:^upper:]",
653099ff 9680 "[:xdigit:]",
aaa51d5e
JF
9681 "[:^xdigit:]",
9682 "[:space:]",
9683 "[:^space:]",
9684 "[:blank:]",
9685 "[:^blank:]"
653099ff
GS
9686 };
9687
19860706 9688 if (flags & ANYOF_LOCALE)
396482e1 9689 sv_catpvs(sv, "{loc}");
19860706 9690 if (flags & ANYOF_FOLD)
396482e1 9691 sv_catpvs(sv, "{i}");
653099ff 9692 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9693 if (flags & ANYOF_INVERT)
396482e1 9694 sv_catpvs(sv, "^");
3339dfd8
YO
9695
9696 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9697 for (i = 0; i <= 256; i++) {
9698 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9699 if (rangestart == -1)
9700 rangestart = i;
9701 } else if (rangestart != -1) {
9702 if (i <= rangestart + 3)
9703 for (; rangestart < i; rangestart++)
653099ff 9704 put_byte(sv, rangestart);
ffc61ed2
JH
9705 else {
9706 put_byte(sv, rangestart);
396482e1 9707 sv_catpvs(sv, "-");
ffc61ed2 9708 put_byte(sv, i - 1);
653099ff 9709 }
24d786f4 9710 do_sep = 1;
ffc61ed2 9711 rangestart = -1;
653099ff 9712 }
847a199f 9713 }
3339dfd8
YO
9714
9715 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9716 /* output any special charclass tests (used mostly under use locale) */
cefafd73 9717 if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 9718 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9719 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9720 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9721 do_sep = 1;
9722 }
9723
3339dfd8
YO
9724 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9725
9726 /* output information about the unicode matching */
ef87b810 9727 if (flags & ANYOF_UNICODE_ALL)
396482e1 9728 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
9729 else if (flags & ANYOF_UTF8)
9730 sv_catpvs(sv, "{unicode}");
f5ecd18d 9731 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 9732 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
9733
9734 {
9735 SV *lv;
32fc9b6a 9736 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9737
ffc61ed2
JH
9738 if (lv) {
9739 if (sw) {
89ebb4a3 9740 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9741
ffc61ed2 9742 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9743 uvchr_to_utf8(s, i);
ffc61ed2 9744
3568d838 9745 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9746 if (rangestart == -1)
9747 rangestart = i;
9748 } else if (rangestart != -1) {
ffc61ed2
JH
9749 if (i <= rangestart + 3)
9750 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9751 const U8 * const e = uvchr_to_utf8(s,rangestart);
9752 U8 *p;
9753 for(p = s; p < e; p++)
ffc61ed2
JH
9754 put_byte(sv, *p);
9755 }
9756 else {
2d03de9c
AL
9757 const U8 *e = uvchr_to_utf8(s,rangestart);
9758 U8 *p;
9759 for (p = s; p < e; p++)
ffc61ed2 9760 put_byte(sv, *p);
396482e1 9761 sv_catpvs(sv, "-");
2d03de9c
AL
9762 e = uvchr_to_utf8(s, i-1);
9763 for (p = s; p < e; p++)
1df70142 9764 put_byte(sv, *p);
ffc61ed2
JH
9765 }
9766 rangestart = -1;
9767 }
19860706 9768 }
ffc61ed2 9769
396482e1 9770 sv_catpvs(sv, "..."); /* et cetera */
19860706 9771 }
fde631ed 9772
ffc61ed2 9773 {
2e0de35c 9774 char *s = savesvpv(lv);
c445ea15 9775 char * const origs = s;
b81d288d 9776
3dab1dad
YO
9777 while (*s && *s != '\n')
9778 s++;
b81d288d 9779
ffc61ed2 9780 if (*s == '\n') {
2d03de9c 9781 const char * const t = ++s;
ffc61ed2
JH
9782
9783 while (*s) {
9784 if (*s == '\n')
9785 *s = ' ';
9786 s++;
9787 }
9788 if (s[-1] == ' ')
9789 s[-1] = 0;
9790
9791 sv_catpv(sv, t);
fde631ed 9792 }
b81d288d 9793
ffc61ed2 9794 Safefree(origs);
fde631ed
JH
9795 }
9796 }
653099ff 9797 }
ffc61ed2 9798
653099ff
GS
9799 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9800 }
9b155405 9801 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9802 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9803#else
96a5add6 9804 PERL_UNUSED_CONTEXT;
65e66c80
SP
9805 PERL_UNUSED_ARG(sv);
9806 PERL_UNUSED_ARG(o);
f9049ba1 9807 PERL_UNUSED_ARG(prog);
17c3b450 9808#endif /* DEBUGGING */
35ff7856 9809}
a687059c 9810
cad2e5aa 9811SV *
288b8c02 9812Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9813{ /* Assume that RE_INTUIT is set */
97aff369 9814 dVAR;
288b8c02 9815 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9816 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9817
9818 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9819 PERL_UNUSED_CONTEXT;
9820
a3621e74 9821 DEBUG_COMPILE_r(
cfd0369c 9822 {
2d03de9c 9823 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9824 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9825
9826 if (!PL_colorset) reginitcolors();
9827 PerlIO_printf(Perl_debug_log,
a0288114 9828 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9829 PL_colors[4],
9830 prog->check_substr ? "" : "utf8 ",
9831 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9832 s,
9833 PL_colors[1],
9834 (strlen(s) > 60 ? "..." : ""));
9835 } );
9836
33b8afdf 9837 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9838}
9839
84da74a7 9840/*
f8149455 9841 pregfree()
84da74a7 9842
f8149455
YO
9843 handles refcounting and freeing the perl core regexp structure. When
9844 it is necessary to actually free the structure the first thing it
3b753521 9845 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
9846 the regexp, allowing the handling of the void *pprivate; member
9847 first. (This routine is not overridable by extensions, which is why
9848 the extensions free is called first.)
9849
9850 See regdupe and regdupe_internal if you change anything here.
84da74a7 9851*/
f8149455 9852#ifndef PERL_IN_XSUB_RE
2b69d0c2 9853void
84679df5 9854Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9855{
288b8c02
NC
9856 SvREFCNT_dec(r);
9857}
9858
9859void
9860Perl_pregfree2(pTHX_ REGEXP *rx)
9861{
27da23d5 9862 dVAR;
288b8c02 9863 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9864 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9865
7918f24d
NC
9866 PERL_ARGS_ASSERT_PREGFREE2;
9867
28d8d7f4
YO
9868 if (r->mother_re) {
9869 ReREFCNT_dec(r->mother_re);
9870 } else {
288b8c02 9871 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9872 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9873 }
9874 if (r->substrs) {
ef8d46e8
VP
9875 SvREFCNT_dec(r->anchored_substr);
9876 SvREFCNT_dec(r->anchored_utf8);
9877 SvREFCNT_dec(r->float_substr);
9878 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9879 Safefree(r->substrs);
9880 }
288b8c02 9881 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9882#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9883 SvREFCNT_dec(r->saved_copy);
ed252734 9884#endif
f0ab9afb 9885 Safefree(r->offs);
f8149455 9886}
28d8d7f4
YO
9887
9888/* reg_temp_copy()
9889
9890 This is a hacky workaround to the structural issue of match results
9891 being stored in the regexp structure which is in turn stored in
9892 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9893 could be PL_curpm in multiple contexts, and could require multiple
9894 result sets being associated with the pattern simultaneously, such
9895 as when doing a recursive match with (??{$qr})
9896
9897 The solution is to make a lightweight copy of the regexp structure
9898 when a qr// is returned from the code executed by (??{$qr}) this
9899 lightweight copy doesnt actually own any of its data except for
9900 the starp/end and the actual regexp structure itself.
9901
9902*/
9903
9904
84679df5 9905REGEXP *
f0826785 9906Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9907{
f0826785 9908 struct regexp *ret;
288b8c02 9909 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9910 register const I32 npar = r->nparens+1;
7918f24d
NC
9911
9912 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9913
f0826785
BM
9914 if (!ret_x)
9915 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9916 ret = (struct regexp *)SvANY(ret_x);
9917
288b8c02 9918 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9919 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9920 by pointing directly at the buffer, but flagging that the allocated
9921 space in the copy is zero. As we've just done a struct copy, it's now
9922 a case of zero-ing that, rather than copying the current length. */
9923 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9924 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9925 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9926 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9927 SvLEN_set(ret_x, 0);
b9ad13ac 9928 SvSTASH_set(ret_x, NULL);
703c388d 9929 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9930 Newx(ret->offs, npar, regexp_paren_pair);
9931 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9932 if (r->substrs) {
28d8d7f4 9933 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9934 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9935
9936 SvREFCNT_inc_void(ret->anchored_substr);
9937 SvREFCNT_inc_void(ret->anchored_utf8);
9938 SvREFCNT_inc_void(ret->float_substr);
9939 SvREFCNT_inc_void(ret->float_utf8);
9940
9941 /* check_substr and check_utf8, if non-NULL, point to either their
9942 anchored or float namesakes, and don't hold a second reference. */
486913e4 9943 }
288b8c02 9944 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9945#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9946 ret->saved_copy = NULL;
28d8d7f4 9947#endif
288b8c02 9948 ret->mother_re = rx;
28d8d7f4 9949
288b8c02 9950 return ret_x;
28d8d7f4 9951}
f8149455
YO
9952#endif
9953
9954/* regfree_internal()
9955
9956 Free the private data in a regexp. This is overloadable by
9957 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 9958 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
9959 know about, however of course we have to handle the
9960 regexp_internal structure when no extension is in use.
9961
9962 Note this is called before freeing anything in the regexp
9963 structure.
9964 */
9965
9966void
288b8c02 9967Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9968{
9969 dVAR;
288b8c02 9970 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9971 RXi_GET_DECL(r,ri);
9972 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9973
9974 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9975
f8149455
YO
9976 DEBUG_COMPILE_r({
9977 if (!PL_colorset)
9978 reginitcolors();
9979 {
9980 SV *dsv= sv_newmortal();
3c8556c3 9981 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9982 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9983 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9984 PL_colors[4],PL_colors[5],s);
9985 }
9986 });
7122b237
YO
9987#ifdef RE_TRACK_PATTERN_OFFSETS
9988 if (ri->u.offsets)
9989 Safefree(ri->u.offsets); /* 20010421 MJD */
9990#endif
f8fc2ecf
YO
9991 if (ri->data) {
9992 int n = ri->data->count;
f3548bdc
DM
9993 PAD* new_comppad = NULL;
9994 PAD* old_comppad;
4026c95a 9995 PADOFFSET refcnt;
dfad63ad 9996
c277df42 9997 while (--n >= 0) {
261faec3 9998 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9999 switch (ri->data->what[n]) {
af534a04 10000 case 'a':
c277df42 10001 case 's':
81714fb9 10002 case 'S':
55eed653 10003 case 'u':
ad64d0ec 10004 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 10005 break;
653099ff 10006 case 'f':
f8fc2ecf 10007 Safefree(ri->data->data[n]);
653099ff 10008 break;
dfad63ad 10009 case 'p':
502c6561 10010 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 10011 break;
c277df42 10012 case 'o':
dfad63ad 10013 if (new_comppad == NULL)
cea2e8a9 10014 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
10015 PAD_SAVE_LOCAL(old_comppad,
10016 /* Watch out for global destruction's random ordering. */
c445ea15 10017 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 10018 );
b34c0dd4 10019 OP_REFCNT_LOCK;
f8fc2ecf 10020 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
10021 OP_REFCNT_UNLOCK;
10022 if (!refcnt)
f8fc2ecf 10023 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 10024
f3548bdc 10025 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 10026 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 10027 new_comppad = NULL;
c277df42
IZ
10028 break;
10029 case 'n':
9e55ce06 10030 break;
07be1b83 10031 case 'T':
be8e71aa
YO
10032 { /* Aho Corasick add-on structure for a trie node.
10033 Used in stclass optimization only */
07be1b83 10034 U32 refcount;
f8fc2ecf 10035 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
10036 OP_REFCNT_LOCK;
10037 refcount = --aho->refcount;
10038 OP_REFCNT_UNLOCK;
10039 if ( !refcount ) {
446bd890
NC
10040 PerlMemShared_free(aho->states);
10041 PerlMemShared_free(aho->fail);
446bd890
NC
10042 /* do this last!!!! */
10043 PerlMemShared_free(ri->data->data[n]);
10044 PerlMemShared_free(ri->regstclass);
07be1b83
YO
10045 }
10046 }
10047 break;
a3621e74 10048 case 't':
07be1b83 10049 {
be8e71aa 10050 /* trie structure. */
07be1b83 10051 U32 refcount;
f8fc2ecf 10052 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
10053 OP_REFCNT_LOCK;
10054 refcount = --trie->refcount;
10055 OP_REFCNT_UNLOCK;
10056 if ( !refcount ) {
446bd890 10057 PerlMemShared_free(trie->charmap);
446bd890
NC
10058 PerlMemShared_free(trie->states);
10059 PerlMemShared_free(trie->trans);
07be1b83 10060 if (trie->bitmap)
446bd890 10061 PerlMemShared_free(trie->bitmap);
786e8c11 10062 if (trie->jump)
446bd890 10063 PerlMemShared_free(trie->jump);
2e64971a 10064 PerlMemShared_free(trie->wordinfo);
446bd890
NC
10065 /* do this last!!!! */
10066 PerlMemShared_free(ri->data->data[n]);
a3621e74 10067 }
07be1b83
YO
10068 }
10069 break;
c277df42 10070 default:
f8fc2ecf 10071 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
10072 }
10073 }
f8fc2ecf
YO
10074 Safefree(ri->data->what);
10075 Safefree(ri->data);
a0d0e21e 10076 }
28d8d7f4 10077
f8fc2ecf 10078 Safefree(ri);
a687059c 10079}
c277df42 10080
a09252eb
NC
10081#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10082#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
10083#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10084
10085/*
32cd70f6 10086 re_dup - duplicate a regexp.
84da74a7 10087
8233f606
DM
10088 This routine is expected to clone a given regexp structure. It is only
10089 compiled under USE_ITHREADS.
32cd70f6 10090
f8149455
YO
10091 After all of the core data stored in struct regexp is duplicated
10092 the regexp_engine.dupe method is used to copy any private data
10093 stored in the *pprivate pointer. This allows extensions to handle
10094 any duplication it needs to do.
10095
10096 See pregfree() and regfree_internal() if you change anything here.
84da74a7 10097*/
a3c0e9ca 10098#if defined(USE_ITHREADS)
f8149455 10099#ifndef PERL_IN_XSUB_RE
288b8c02
NC
10100void
10101Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 10102{
84da74a7 10103 dVAR;
a86a1ca7 10104 I32 npar;
288b8c02
NC
10105 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10106 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 10107
7918f24d
NC
10108 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10109
84da74a7 10110 npar = r->nparens+1;
f0ab9afb
NC
10111 Newx(ret->offs, npar, regexp_paren_pair);
10112 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 10113 if(ret->swap) {
28d8d7f4 10114 /* no need to copy these */
f0ab9afb 10115 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 10116 }
84da74a7 10117
6057429f 10118 if (ret->substrs) {
32cd70f6
NC
10119 /* Do it this way to avoid reading from *r after the StructCopy().
10120 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10121 cache, it doesn't matter. */
66b1de87
NC
10122 const bool anchored = r->check_substr
10123 ? r->check_substr == r->anchored_substr
10124 : r->check_utf8 == r->anchored_utf8;
785a26d5 10125 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
10126 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10127
32cd70f6
NC
10128 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10129 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10130 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10131 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 10132
32cd70f6
NC
10133 /* check_substr and check_utf8, if non-NULL, point to either their
10134 anchored or float namesakes, and don't hold a second reference. */
10135
10136 if (ret->check_substr) {
10137 if (anchored) {
10138 assert(r->check_utf8 == r->anchored_utf8);
10139 ret->check_substr = ret->anchored_substr;
10140 ret->check_utf8 = ret->anchored_utf8;
10141 } else {
10142 assert(r->check_substr == r->float_substr);
10143 assert(r->check_utf8 == r->float_utf8);
10144 ret->check_substr = ret->float_substr;
10145 ret->check_utf8 = ret->float_utf8;
10146 }
66b1de87
NC
10147 } else if (ret->check_utf8) {
10148 if (anchored) {
10149 ret->check_utf8 = ret->anchored_utf8;
10150 } else {
10151 ret->check_utf8 = ret->float_utf8;
10152 }
32cd70f6 10153 }
6057429f 10154 }
f8149455 10155
5daac39c 10156 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 10157
6057429f 10158 if (ret->pprivate)
288b8c02 10159 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 10160
288b8c02 10161 if (RX_MATCH_COPIED(dstr))
6057429f 10162 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
10163 else
10164 ret->subbeg = NULL;
10165#ifdef PERL_OLD_COPY_ON_WRITE
10166 ret->saved_copy = NULL;
10167#endif
6057429f 10168
c2123ae3
NC
10169 if (ret->mother_re) {
10170 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10171 /* Our storage points directly to our mother regexp, but that's
10172 1: a buffer in a different thread
10173 2: something we no longer hold a reference on
10174 so we need to copy it locally. */
10175 /* Note we need to sue SvCUR() on our mother_re, because it, in
10176 turn, may well be pointing to its own mother_re. */
10177 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10178 SvCUR(ret->mother_re)+1));
10179 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10180 }
10181 ret->mother_re = NULL;
10182 }
6057429f 10183 ret->gofs = 0;
f8149455
YO
10184}
10185#endif /* PERL_IN_XSUB_RE */
10186
10187/*
10188 regdupe_internal()
10189
10190 This is the internal complement to regdupe() which is used to copy
10191 the structure pointed to by the *pprivate pointer in the regexp.
10192 This is the core version of the extension overridable cloning hook.
10193 The regexp structure being duplicated will be copied by perl prior
10194 to this and will be provided as the regexp *r argument, however
10195 with the /old/ structures pprivate pointer value. Thus this routine
10196 may override any copying normally done by perl.
10197
10198 It returns a pointer to the new regexp_internal structure.
10199*/
10200
10201void *
288b8c02 10202Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10203{
10204 dVAR;
288b8c02 10205 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10206 regexp_internal *reti;
10207 int len, npar;
10208 RXi_GET_DECL(r,ri);
7918f24d
NC
10209
10210 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10211
10212 npar = r->nparens+1;
7122b237 10213 len = ProgLen(ri);
f8149455 10214
45cf4570 10215 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10216 Copy(ri->program, reti->program, len+1, regnode);
10217
f8149455 10218
f8fc2ecf 10219 reti->regstclass = NULL;
bcdf7404 10220
f8fc2ecf 10221 if (ri->data) {
84da74a7 10222 struct reg_data *d;
f8fc2ecf 10223 const int count = ri->data->count;
84da74a7
YO
10224 int i;
10225
10226 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10227 char, struct reg_data);
10228 Newx(d->what, count, U8);
10229
10230 d->count = count;
10231 for (i = 0; i < count; i++) {
f8fc2ecf 10232 d->what[i] = ri->data->what[i];
84da74a7 10233 switch (d->what[i]) {
af534a04 10234 /* legal options are one of: sSfpontTua
84da74a7 10235 see also regcomp.h and pregfree() */
af534a04 10236 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10237 case 's':
81714fb9 10238 case 'S':
0536c0a7 10239 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10240 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10241 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10242 break;
84da74a7
YO
10243 case 'f':
10244 /* This is cheating. */
10245 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10246 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10247 struct regnode_charclass_class);
f8fc2ecf 10248 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10249 break;
10250 case 'o':
bbe252da
YO
10251 /* Compiled op trees are readonly and in shared memory,
10252 and can thus be shared without duplication. */
84da74a7 10253 OP_REFCNT_LOCK;
f8fc2ecf 10254 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10255 OP_REFCNT_UNLOCK;
10256 break;
23eab42c
NC
10257 case 'T':
10258 /* Trie stclasses are readonly and can thus be shared
10259 * without duplication. We free the stclass in pregfree
10260 * when the corresponding reg_ac_data struct is freed.
10261 */
10262 reti->regstclass= ri->regstclass;
10263 /* Fall through */
84da74a7 10264 case 't':
84da74a7 10265 OP_REFCNT_LOCK;
0536c0a7 10266 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10267 OP_REFCNT_UNLOCK;
0536c0a7
NC
10268 /* Fall through */
10269 case 'n':
10270 d->data[i] = ri->data->data[i];
84da74a7 10271 break;
84da74a7 10272 default:
f8fc2ecf 10273 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10274 }
10275 }
10276
f8fc2ecf 10277 reti->data = d;
84da74a7
YO
10278 }
10279 else
f8fc2ecf 10280 reti->data = NULL;
84da74a7 10281
cde0cee5
YO
10282 reti->name_list_idx = ri->name_list_idx;
10283
7122b237
YO
10284#ifdef RE_TRACK_PATTERN_OFFSETS
10285 if (ri->u.offsets) {
10286 Newx(reti->u.offsets, 2*len+1, U32);
10287 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10288 }
10289#else
10290 SetProgLen(reti,len);
10291#endif
10292
f8149455 10293 return (void*)reti;
84da74a7 10294}
f8149455
YO
10295
10296#endif /* USE_ITHREADS */
84da74a7 10297
f8149455 10298#ifndef PERL_IN_XSUB_RE
bcdf7404 10299
c277df42
IZ
10300/*
10301 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10302 */
10303regnode *
864dbfa3 10304Perl_regnext(pTHX_ register regnode *p)
c277df42 10305{
97aff369 10306 dVAR;
c277df42
IZ
10307 register I32 offset;
10308
f8fc2ecf 10309 if (!p)
c277df42
IZ
10310 return(NULL);
10311
35db910f
KW
10312 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10313 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10314 }
10315
c277df42
IZ
10316 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10317 if (offset == 0)
10318 return(NULL);
10319
c277df42 10320 return(p+offset);
c277df42 10321}
76234dfb 10322#endif
c277df42 10323
01f988be 10324STATIC void
cea2e8a9 10325S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10326{
10327 va_list args;
10328 STRLEN l1 = strlen(pat1);
10329 STRLEN l2 = strlen(pat2);
10330 char buf[512];
06bf62c7 10331 SV *msv;
73d840c0 10332 const char *message;
c277df42 10333
7918f24d
NC
10334 PERL_ARGS_ASSERT_RE_CROAK2;
10335
c277df42
IZ
10336 if (l1 > 510)
10337 l1 = 510;
10338 if (l1 + l2 > 510)
10339 l2 = 510 - l1;
10340 Copy(pat1, buf, l1 , char);
10341 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10342 buf[l1 + l2] = '\n';
10343 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10344#ifdef I_STDARG
10345 /* ANSI variant takes additional second argument */
c277df42 10346 va_start(args, pat2);
8736538c
AS
10347#else
10348 va_start(args);
10349#endif
5a844595 10350 msv = vmess(buf, &args);
c277df42 10351 va_end(args);
cfd0369c 10352 message = SvPV_const(msv,l1);
c277df42
IZ
10353 if (l1 > 512)
10354 l1 = 512;
10355 Copy(message, buf, l1 , char);
197cf9b9 10356 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10357 Perl_croak(aTHX_ "%s", buf);
c277df42 10358}
a0ed51b3
LW
10359
10360/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10361
76234dfb 10362#ifndef PERL_IN_XSUB_RE
a0ed51b3 10363void
864dbfa3 10364Perl_save_re_context(pTHX)
b81d288d 10365{
97aff369 10366 dVAR;
1ade1aa1
NC
10367
10368 struct re_save_state *state;
10369
10370 SAVEVPTR(PL_curcop);
10371 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10372
10373 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10374 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10375 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10376
46ab3289 10377 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10378
a0ed51b3 10379 PL_reg_start_tmp = 0;
a0ed51b3 10380 PL_reg_start_tmpl = 0;
c445ea15 10381 PL_reg_oldsaved = NULL;
a5db57d6 10382 PL_reg_oldsavedlen = 0;
a5db57d6 10383 PL_reg_maxiter = 0;
a5db57d6 10384 PL_reg_leftiter = 0;
c445ea15 10385 PL_reg_poscache = NULL;
a5db57d6 10386 PL_reg_poscache_size = 0;
1ade1aa1
NC
10387#ifdef PERL_OLD_COPY_ON_WRITE
10388 PL_nrs = NULL;
10389#endif
ada6e8a9 10390
c445ea15
AL
10391 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10392 if (PL_curpm) {
10393 const REGEXP * const rx = PM_GETRE(PL_curpm);
10394 if (rx) {
1df70142 10395 U32 i;
07bc277f 10396 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10397 char digits[TYPE_CHARS(long)];
d9fad198 10398 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10399 GV *const *const gvp
10400 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10401
b37c2d43
AL
10402 if (gvp) {
10403 GV * const gv = *gvp;
10404 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10405 save_scalar(gv);
49f27e4b 10406 }
ada6e8a9
AMS
10407 }
10408 }
10409 }
a0ed51b3 10410}
76234dfb 10411#endif
51371543 10412
51371543 10413static void
acfe0abc 10414clear_re(pTHX_ void *r)
51371543 10415{
97aff369 10416 dVAR;
84679df5 10417 ReREFCNT_dec((REGEXP *)r);
51371543 10418}
ffbc6a93 10419
a28509cc
AL
10420#ifdef DEBUGGING
10421
10422STATIC void
10423S_put_byte(pTHX_ SV *sv, int c)
10424{
7918f24d
NC
10425 PERL_ARGS_ASSERT_PUT_BYTE;
10426
7fddd944
NC
10427 /* Our definition of isPRINT() ignores locales, so only bytes that are
10428 not part of UTF-8 are considered printable. I assume that the same
10429 holds for UTF-EBCDIC.
10430 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10431 which Wikipedia says:
10432
10433 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10434 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10435 identical, to the ASCII delete (DEL) or rubout control character.
10436 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
10437 if (!isPRINT(c)) {
10438 if (c < 256) {
10439 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10440 }
10441 else {
10442 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10443 }
10444 }
5e7aa789 10445 else {
88c9ea1e 10446 const char string = c;
5e7aa789
NC
10447 if (c == '-' || c == ']' || c == '\\' || c == '^')
10448 sv_catpvs(sv, "\\");
10449 sv_catpvn(sv, &string, 1);
10450 }
a28509cc
AL
10451}
10452
786e8c11 10453
3dab1dad
YO
10454#define CLEAR_OPTSTART \
10455 if (optstart) STMT_START { \
70685ca0 10456 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10457 optstart=NULL; \
10458 } STMT_END
10459
786e8c11 10460#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10461
b5a2f8d8
NC
10462STATIC const regnode *
10463S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10464 const regnode *last, const regnode *plast,
10465 SV* sv, I32 indent, U32 depth)
a28509cc 10466{
97aff369 10467 dVAR;
786e8c11 10468 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10469 register const regnode *next;
3dab1dad 10470 const regnode *optstart= NULL;
1f1031fe 10471
f8fc2ecf 10472 RXi_GET_DECL(r,ri);
3dab1dad 10473 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10474
10475 PERL_ARGS_ASSERT_DUMPUNTIL;
10476
786e8c11
YO
10477#ifdef DEBUG_DUMPUNTIL
10478 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10479 last ? last-start : 0,plast ? plast-start : 0);
10480#endif
10481
10482 if (plast && plast < last)
10483 last= plast;
10484
10485 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10486 /* While that wasn't END last time... */
a28509cc
AL
10487 NODE_ALIGN(node);
10488 op = OP(node);
de734bd5 10489 if (op == CLOSE || op == WHILEM)
786e8c11 10490 indent--;
b5a2f8d8 10491 next = regnext((regnode *)node);
1f1031fe 10492
a28509cc 10493 /* Where, what. */
8e11feef 10494 if (OP(node) == OPTIMIZED) {
e68ec53f 10495 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10496 optstart = node;
3dab1dad 10497 else
8e11feef 10498 goto after_print;
3dab1dad
YO
10499 } else
10500 CLEAR_OPTSTART;
1f1031fe 10501
32fc9b6a 10502 regprop(r, sv, node);
a28509cc 10503 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10504 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10505
10506 if (OP(node) != OPTIMIZED) {
10507 if (next == NULL) /* Next ptr. */
10508 PerlIO_printf(Perl_debug_log, " (0)");
10509 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10510 PerlIO_printf(Perl_debug_log, " (FAIL)");
10511 else
10512 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10513 (void)PerlIO_putc(Perl_debug_log, '\n');
10514 }
10515
a28509cc
AL
10516 after_print:
10517 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10518 assert(next);
10519 {
10520 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10521 ? regnext((regnode *)next)
10522 : next);
be8e71aa
YO
10523 if (last && nnode > last)
10524 nnode = last;
786e8c11 10525 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10526 }
a28509cc
AL
10527 }
10528 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10529 assert(next);
786e8c11 10530 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10531 }
10532 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10533 const regnode *this_trie = node;
1de06328 10534 const char op = OP(node);
647f639f 10535 const U32 n = ARG(node);
1de06328 10536 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10537 (reg_ac_data *)ri->data->data[n] :
1de06328 10538 NULL;
3251b653
NC
10539 const reg_trie_data * const trie =
10540 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10541#ifdef DEBUGGING
502c6561 10542 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10543#endif
786e8c11 10544 const regnode *nextbranch= NULL;
a28509cc 10545 I32 word_idx;
76f68e9b 10546 sv_setpvs(sv, "");
786e8c11 10547 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10548 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10549
10550 PerlIO_printf(Perl_debug_log, "%*s%s ",
10551 (int)(2*(indent+3)), "",
10552 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10553 PL_colors[0], PL_colors[1],
10554 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10555 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10556 PERL_PV_PRETTY_LTGT
786e8c11
YO
10557 )
10558 : "???"
10559 );
10560 if (trie->jump) {
40d049e4 10561 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10562 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10563 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10564 if (dist) {
10565 if (!nextbranch)
24b23f37 10566 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10567 DUMPUNTIL(this_trie + dist, nextbranch);
10568 }
786e8c11
YO
10569 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10570 nextbranch= regnext((regnode *)nextbranch);
10571 } else {
10572 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10573 }
786e8c11
YO
10574 }
10575 if (last && next > last)
10576 node= last;
10577 else
10578 node= next;
a28509cc 10579 }
786e8c11
YO
10580 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10581 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10582 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10583 }
10584 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10585 assert(next);
786e8c11 10586 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10587 }
10588 else if ( op == PLUS || op == STAR) {
786e8c11 10589 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10590 }
10591 else if (op == ANYOF) {
10592 /* arglen 1 + class block */
4a3ee7a8 10593 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
10594 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10595 node = NEXTOPER(node);
10596 }
10597 else if (PL_regkind[(U8)op] == EXACT) {
10598 /* Literal string, where present. */
10599 node += NODE_SZ_STR(node) - 1;
10600 node = NEXTOPER(node);
10601 }
10602 else {
10603 node = NEXTOPER(node);
10604 node += regarglen[(U8)op];
10605 }
10606 if (op == CURLYX || op == OPEN)
786e8c11 10607 indent++;
a28509cc 10608 }
3dab1dad 10609 CLEAR_OPTSTART;
786e8c11 10610#ifdef DEBUG_DUMPUNTIL
70685ca0 10611 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10612#endif
1de06328 10613 return node;
a28509cc
AL
10614}
10615
10616#endif /* DEBUGGING */
10617
241d1a3b
NC
10618/*
10619 * Local variables:
10620 * c-indentation-style: bsd
10621 * c-basic-offset: 4
10622 * indent-tabs-mode: t
10623 * End:
10624 *
37442d52
RGS
10625 * ex: set ts=8 sts=4 sw=4 noet:
10626 */