This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make cpan/CGI/t/http.t (hopefully) compatible with VMS
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
7122b237 609#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* Can match anything (initialization) */
716STATIC int
5f66b61c 717S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
718{
719 int value;
720
7918f24d
NC
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
722
aaa51d5e 723 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
725 return 1;
1aa99e6b
IH
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
727 return 0;
10edeb5d 728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 729 return 0;
653099ff
GS
730 return 1;
731}
732
733/* Can match anything (initialization) */
734STATIC void
097eb12c 735S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 736{
7918f24d
NC
737 PERL_ARGS_ASSERT_CL_INIT;
738
8ecf7187 739 Zero(cl, 1, struct regnode_charclass_class);
653099ff 740 cl->type = ANYOF;
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742}
743
744STATIC void
097eb12c 745S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752 if (LOC)
753 cl->flags |= ANYOF_LOCALE;
754}
755
756/* 'And' a given class with another one. Can create false positives */
757/* We assume that cl is not inverted */
758STATIC void
5f66b61c 759S_cl_and(struct regnode_charclass_class *cl,
a28509cc 760 const struct regnode_charclass_class *and_with)
653099ff 761{
7918f24d 762 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
763
764 assert(and_with->type == ANYOF);
653099ff
GS
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
770 int i;
771
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
775 else
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
1aa99e6b 781
14ebb1a2
JH
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
787 }
14ebb1a2
JH
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 790 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 793 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
794}
795
796/* 'OR' a given class with another one. Can create false positives */
797/* We assume that cl is not inverted */
798STATIC void
097eb12c 799S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 800{
7918f24d
NC
801 PERL_ARGS_ASSERT_CL_OR;
802
653099ff
GS
803 if (or_with->flags & ANYOF_INVERT) {
804 /* We do not use
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
812 */
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
816 int i;
817
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
821 else {
830247a4 822 cl_anything(pRExC_state, cl);
653099ff
GS
823 }
824 } else {
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 827 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
828 || (cl->flags & ANYOF_FOLD)) ) {
829 int i;
830
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
838 }
839 }
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 841 cl_anything(pRExC_state, cl);
653099ff
GS
842 }
843 }
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
846
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
851 }
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
855 }
653099ff
GS
856}
857
a3621e74
YO
858#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
862
3dab1dad
YO
863
864#ifdef DEBUGGING
07be1b83 865/*
2b8b4781
NC
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
869
870 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
874
3dab1dad
YO
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
07be1b83
YO
877
878*/
3dab1dad
YO
879
880/*
3dab1dad
YO
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
883*/
884
885STATIC void
2b8b4781
NC
886S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
3dab1dad
YO
888{
889 U32 state;
ab3bbdeb 890 SV *sv=sv_newmortal();
55eed653 891 int colwidth= widecharmap ? 6 : 4;
2e64971a 892 U16 word;
3dab1dad
YO
893 GET_RE_DEBUG_FLAGS_DECL;
894
7918f24d 895 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 896
3dab1dad
YO
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
900
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 902 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 903 if ( tmp ) {
ab3bbdeb
YO
904 PerlIO_printf( Perl_debug_log, "%*s",
905 colwidth,
ddc5bc0f 906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
910 )
911 );
3dab1dad
YO
912 }
913 }
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
916
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
919 PerlIO_printf( Perl_debug_log, "\n");
920
1e2e3d02 921 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 922 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
923
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
925
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
928 } else {
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
930 }
931
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
933
934 if ( base ) {
935 U32 ofs = 0;
936
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
940 ofs++;
941
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
943
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
948 {
ab3bbdeb
YO
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
950 colwidth,
3dab1dad
YO
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
952 } else {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
954 }
955 }
956
957 PerlIO_printf( Perl_debug_log, "]");
958
959 }
960 PerlIO_printf( Perl_debug_log, "\n" );
961 }
2e64971a
DM
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
967 }
968 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
969}
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
ab3bbdeb 982 SV *sv=sv_newmortal();
55eed653 983 int colwidth= widecharmap ? 6 : 4;
3dab1dad 984 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
985
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
987
3dab1dad 988 /* print out the table precompression. */
ab3bbdeb
YO
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
3dab1dad
YO
992
993 for( state=1 ; state < next_alloc ; state ++ ) {
994 U16 charid;
995
ab3bbdeb 996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1000 } else {
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1003 );
1004 }
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1007 if ( tmp ) {
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1009 colwidth,
ddc5bc0f 1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1014 ) ,
1e2e3d02
YO
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1017 );
1018 if (!(charid % 10))
664e119d
RGS
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1e2e3d02 1021 }
ab3bbdeb
YO
1022 }
1023 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1024 }
1025}
1026
1027/*
3dab1dad
YO
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1032*/
1033STATIC void
55eed653 1034S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1036 U32 depth)
3dab1dad
YO
1037{
1038 U32 state;
1039 U16 charid;
ab3bbdeb 1040 SV *sv=sv_newmortal();
55eed653 1041 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1042 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1043
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1045
1046 /*
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1049 */
1050
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1052
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1055 if ( tmp ) {
ab3bbdeb
YO
1056 PerlIO_printf( Perl_debug_log, "%*s",
1057 colwidth,
ddc5bc0f 1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1062 )
1063 );
3dab1dad
YO
1064 }
1065 }
1066
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1068
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1071 }
1072
1073 PerlIO_printf( Perl_debug_log, "\n" );
1074
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1076
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1080
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1083 if (v)
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1085 else
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1087 }
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1090 } else {
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1093 }
1094 }
07be1b83 1095}
3dab1dad
YO
1096
1097#endif
1098
2e64971a 1099
786e8c11
YO
1100/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
3dab1dad 1110
786e8c11 1111Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1112
786e8c11
YO
1113A trie is an N'ary tree where the branches are determined by digital
1114decomposition of the key. IE, at the root node you look up the 1st character and
1115follow that branch repeat until you find the end of the branches. Nodes can be
1116marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1117
786e8c11 1118 /he|she|his|hers/
72f13be8 1119
786e8c11
YO
1120would convert into the following structure. Numbers represent states, letters
1121following numbers represent valid transitions on the letter from that state, if
1122the number is in square brackets it represents an accepting state, otherwise it
1123will be in parenthesis.
07be1b83 1124
786e8c11
YO
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1126 | |
1127 | (2)
1128 | |
1129 (1) +-i->(6)-+-s->[7]
1130 |
1131 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1132
786e8c11
YO
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1134
1135This shows that when matching against the string 'hers' we will begin at state 1
1136read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139single traverse. We store a mapping from accepting to state to which word was
1140matched, and then when we have multiple possibilities we try to complete the
1141rest of the regex in the order in which they occured in the alternation.
1142
1143The only prior NFA like behaviour that would be changed by the TRIE support is
1144the silent ignoring of duplicate alternations which are of the form:
1145
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1147
1148Thus EVAL blocks follwing a trie may be called a different number of times with
1149and without the optimisation. With the optimisations dupes will be silently
1150ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151the following demonstrates:
1152
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1154
1155which prints out 'word' three times, but
1156
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1158
1159which doesnt print it out at all. This is due to other optimisations kicking in.
1160
1161Example of what happens on a structural level:
1162
1163The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1164
1165 1: CURLYM[1] {1,32767}(18)
1166 5: BRANCH(8)
1167 6: EXACT <ac>(16)
1168 8: BRANCH(11)
1169 9: EXACT <ad>(16)
1170 11: BRANCH(14)
1171 12: EXACT <ab>(16)
1172 16: SUCCEED(0)
1173 17: NOTHING(18)
1174 18: END(0)
1175
1176This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177and should turn into:
1178
1179 1: CURLYM[1] {1,32767}(18)
1180 5: TRIE(16)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1182 <ac>
1183 <ad>
1184 <ab>
1185 16: SUCCEED(0)
1186 17: NOTHING(18)
1187 18: END(0)
1188
1189Cases where tail != last would be like /(?foo|bar)baz/:
1190
1191 1: BRANCH(4)
1192 2: EXACT <foo>(8)
1193 4: BRANCH(7)
1194 5: EXACT <bar>(8)
1195 7: TAIL(8)
1196 8: EXACT <baz>(10)
1197 10: END(0)
1198
1199which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200and would end up looking like:
1201
1202 1: TRIE(8)
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1204 <foo>
1205 <bar>
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1211
1212is the recommended Unicode-aware way of saying
1213
1214 *(d++) = uv;
1215*/
1216
1e2e3d02 1217#define TRIE_STORE_REVCHAR \
786e8c11 1218 STMT_START { \
73031816
NC
1219 if (UTF) { \
1220 SV *zlopp = newSV(2); \
88c9ea1e
CB
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1224 SvPOK_on(zlopp); \
1225 SvUTF8_on(zlopp); \
1226 av_push(revcharmap, zlopp); \
1227 } else { \
6bdeddd2 1228 char ooooff = (char)uvc; \
73031816
NC
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1230 } \
1231 } STMT_END
786e8c11
YO
1232
1233#define TRIE_READ_CHAR STMT_START { \
1234 wordlen++; \
1235 if ( UTF ) { \
1236 if ( folder ) { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1239 foldlen -= len; \
1240 scan += len; \
1241 len = 0; \
1242 } else { \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1247 } \
1248 } else { \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1250 } \
1251 } else { \
1252 uvc = (U32)*uc; \
1253 len = 1; \
1254 } \
1255} STMT_END
1256
1257
1258
1259#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1263 } \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1267} STMT_END
07be1b83 1268
786e8c11
YO
1269#define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1274} STMT_END
07be1b83 1275
786e8c11
YO
1276#define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1279 \
786e8c11
YO
1280 DEBUG_r({ \
1281 /* store the word for dumping */ \
1282 SV* tmp; \
1283 if (OP(noper) != NOTHING) \
740cce10 1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1285 else \
740cce10 1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1287 av_push( trie_words, tmp ); \
786e8c11
YO
1288 }); \
1289 \
1290 curword++; \
2e64971a
DM
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1294 \
1295 if ( noper_next < tail ) { \
1296 if (!trie->jump) \
c944940b 1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1298 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1299 if (!jumper) \
1300 jumper = noper_next; \
1301 if (!nextbranch) \
1302 nextbranch= regnext(cur); \
1303 } \
1304 \
1305 if ( dupe ) { \
2e64971a
DM
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1311 } else { \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1314 } \
1315} STMT_END
07be1b83 1316
3dab1dad 1317
786e8c11
YO
1318#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1325 )
3dab1dad 1326
786e8c11
YO
1327#define MADE_TRIE 1
1328#define MADE_JUMP_TRIE 2
1329#define MADE_EXACT_TRIE 4
3dab1dad 1330
a3621e74 1331STATIC I32
786e8c11 1332S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1333{
27da23d5 1334 dVAR;
a3621e74
YO
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
55eed653 1337 HV *widecharmap = NULL;
2b8b4781 1338 AV *revcharmap = newAV();
a3621e74 1339 regnode *cur;
9f7f3913 1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1341 STRLEN len = 0;
1342 UV uvc = 0;
1343 U16 curword = 0;
1344 U32 next_alloc = 0;
786e8c11
YO
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
7f69552c 1347 regnode *convert = NULL;
2e64971a 1348 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1349 /* we just use folder as a flag in utf8 */
e1ec3a88 1350 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1351 ? PL_fold
1352 : ( flags == EXACTFL
1353 ? PL_fold_locale
1354 : NULL
1355 )
1356 );
1357
2b8b4781
NC
1358#ifdef DEBUGGING
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
8e11feef 1363 */
2b8b4781
NC
1364#else
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1366 STRLEN trie_charcount=0;
3dab1dad 1367#endif
2b8b4781 1368 SV *re_trie_maxbuff;
a3621e74 1369 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1370
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1372#ifndef DEBUGGING
1373 PERL_UNUSED_ARG(depth);
1374#endif
a3621e74 1375
c944940b 1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1377 trie->refcount = 1;
3dab1dad 1378 trie->startstate = 1;
786e8c11 1379 trie->wordcount = word_count;
f8fc2ecf 1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1382 if (!(UTF && folder))
c944940b 1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1386
a3621e74 1387 DEBUG_r({
2b8b4781 1388 trie_words = newAV();
a3621e74 1389 });
a3621e74 1390
0111c4fd 1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1392 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1394 }
3dab1dad
YO
1395 DEBUG_OPTIMISE_r({
1396 PerlIO_printf( Perl_debug_log,
786e8c11 1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1401 (int)depth);
3dab1dad 1402 });
7f69552c
YO
1403
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1407 convert = first;
1408 } else {
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1411 }
1412
a3621e74
YO
1413 /* -- First loop and Setup --
1414
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1418 have unique chars.
1419
1420 We use an array of integers to represent the character codes 0..255
38a44b82 1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1422 native representation of the character value as the key and IV's for the
1423 coded index.
1424
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1433
1434 */
1435
a3621e74 1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1437 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1438 const U8 *uc = (U8*)STRING( noper );
a28509cc 1439 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1440 STRLEN foldlen = 0;
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1442 const U8 *scan = (U8*)NULL;
07be1b83 1443 U32 wordlen = 0; /* required init */
02daf0ab
YO
1444 STRLEN chars = 0;
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1446
3dab1dad
YO
1447 if (OP(noper) == NOTHING) {
1448 trie->minlen= 0;
1449 continue;
1450 }
02daf0ab
YO
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1454
a3621e74 1455 for ( ; uc < e ; uc += len ) {
3dab1dad 1456 TRIE_CHARCOUNT(trie)++;
a3621e74 1457 TRIE_READ_CHAR;
3dab1dad 1458 chars++;
a3621e74
YO
1459 if ( uvc < 256 ) {
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1462 if ( folder )
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1464 TRIE_STORE_REVCHAR;
a3621e74 1465 }
02daf0ab
YO
1466 if ( set_bit ) {
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1470
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1473
1474 if ( !UTF ) {
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1482 }
1483 }
02daf0ab
YO
1484 set_bit = 0; /* We've done our bit :-) */
1485 }
a3621e74
YO
1486 } else {
1487 SV** svpp;
55eed653
NC
1488 if ( !widecharmap )
1489 widecharmap = newHV();
a3621e74 1490
55eed653 1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1492
1493 if ( !svpp )
e4584336 1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1495
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1498 TRIE_STORE_REVCHAR;
a3621e74
YO
1499 }
1500 }
1501 }
3dab1dad
YO
1502 if( cur == first ) {
1503 trie->minlen=chars;
1504 trie->maxlen=chars;
1505 } else if (chars < trie->minlen) {
1506 trie->minlen=chars;
1507 } else if (chars > trie->maxlen) {
1508 trie->maxlen=chars;
1509 }
1510
a3621e74
YO
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
55eed653 1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1518 );
a3621e74
YO
1519
1520 /*
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1525 conservative but potentially much slower representation using an array
1526 of lists.
1527
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1534
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1538
1539 */
1540
1541
2e64971a
DM
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1543 prev_states[1] = 0;
1544
3dab1dad 1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1546 /*
1547 Second Pass -- Array Of Lists Representation
1548
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1552
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
a3621e74
YO
1556 */
1557
a3621e74
YO
1558 STRLEN transcount = 1;
1559
1e2e3d02
YO
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
446bd890 1563
c944940b
JH
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
a3621e74
YO
1567 TRIE_LIST_NEW(1);
1568 next_alloc = 2;
1569
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1571
c445ea15
AL
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
07be1b83 1579 U32 wordlen = 0; /* required init */
c445ea15
AL
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1581
3dab1dad 1582 if (OP(noper) != NOTHING) {
786e8c11 1583 for ( ; uc < e ; uc += len ) {
c445ea15 1584
786e8c11 1585 TRIE_READ_CHAR;
c445ea15 1586
786e8c11
YO
1587 if ( uvc < 256 ) {
1588 charid = trie->charmap[ uvc ];
c445ea15 1589 } else {
55eed653 1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1591 if ( !svpp ) {
1592 charid = 0;
1593 } else {
1594 charid=(U16)SvIV( *svpp );
1595 }
c445ea15 1596 }
786e8c11
YO
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1598 if ( charid ) {
a3621e74 1599
786e8c11
YO
1600 U16 check;
1601 U32 newstate = 0;
a3621e74 1602
786e8c11
YO
1603 charid--;
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
c445ea15 1606 }
786e8c11
YO
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1610 break;
1611 }
1612 }
1613 if ( ! newstate ) {
1614 newstate = next_alloc++;
2e64971a 1615 prev_states[newstate] = state;
786e8c11
YO
1616 TRIE_LIST_PUSH( state, charid, newstate );
1617 transcount++;
1618 }
1619 state = newstate;
1620 } else {
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1622 }
a28509cc 1623 }
c445ea15 1624 }
3dab1dad 1625 TRIE_HANDLE_WORD(state);
a3621e74
YO
1626
1627 } /* end second pass */
1628
1e2e3d02
YO
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
c944940b
JH
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1633 next_alloc
1634 * sizeof(reg_trie_state) );
a3621e74 1635
3dab1dad 1636 /* and now dump it out before we compress it */
2b8b4781
NC
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1639 depth+1)
1e2e3d02 1640 );
a3621e74 1641
c944940b
JH
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1644 {
1645 U32 state;
a3621e74
YO
1646 U32 tp = 0;
1647 U32 zp = 0;
1648
1649
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1651 U32 base=0;
1652
1653 /*
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1656 );
1657 */
1658
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1661 U16 maxid=minid;
a28509cc 1662 U16 idx;
a3621e74
YO
1663
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1667 minid=forid;
1668 } else if ( forid > maxid ) {
1669 maxid=forid;
1670 }
a3621e74
YO
1671 }
1672 if ( transcount < tp + maxid - minid + 1) {
1673 transcount *= 2;
c944940b
JH
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
446bd890
NC
1676 transcount
1677 * sizeof(reg_trie_trans) );
a3621e74
YO
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1679 }
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1682 U32 set = 0;
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1688 set = 1;
1689 break;
1690 }
1691 }
1692 if ( !set ) {
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1695 tp++;
1696 zp = tp;
1697 }
1698 } else {
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1703 }
1704 tp += ( maxid - minid + 1 );
1705 }
1706 Safefree(trie->states[ state ].trans.list);
1707 }
1708 /*
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1711 );
1712 */
1713 trie->states[ state ].trans.base=base;
1714 }
cc601c31 1715 trie->lasttrans = tp + 1;
a3621e74
YO
1716 }
1717 } else {
1718 /*
1719 Second Pass -- Flat Table Representation.
1720
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1725
1726 We then construct the trie using only the .next slots of the entry
1727 structs.
1728
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1732
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1734 transition.
1735
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1741
1742 A B A B
1743 1. 2 4 1. 3 7
1744 2. 0 3 3. 0 5
1745 3. 0 0 5. 0 0
1746 4. 0 0 7. 0 0
1747
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1751
1752 */
1e2e3d02
YO
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
3dab1dad 1756
c944940b
JH
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
a3621e74
YO
1764 next_alloc = trie->uniquecharcount + 1;
1765
3dab1dad 1766
a3621e74
YO
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768
c445ea15 1769 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1772
1773 U32 state = 1; /* required init */
1774
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778
1779 STRLEN foldlen = 0; /* required init */
07be1b83 1780 U32 wordlen = 0; /* required init */
a3621e74
YO
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1782
3dab1dad 1783 if ( OP(noper) != NOTHING ) {
786e8c11 1784 for ( ; uc < e ; uc += len ) {
a3621e74 1785
786e8c11 1786 TRIE_READ_CHAR;
a3621e74 1787
786e8c11
YO
1788 if ( uvc < 256 ) {
1789 charid = trie->charmap[ uvc ];
1790 } else {
55eed653 1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1793 }
786e8c11
YO
1794 if ( charid ) {
1795 charid--;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
2e64971a
DM
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
786e8c11
YO
1801 next_alloc += trie->uniquecharcount;
1802 }
1803 state = trie->trans[ state + charid ].next;
1804 } else {
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1806 }
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1808 }
a3621e74 1809 }
3dab1dad
YO
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1812
1813 } /* end second pass */
1814
3dab1dad 1815 /* and now dump it out before we compress it */
2b8b4781
NC
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1817 revcharmap,
1818 next_alloc, depth+1));
a3621e74 1819
a3621e74
YO
1820 {
1821 /*
1822 * Inplace compress the table.*
1823
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1827
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1830
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1833
1834 - If .base is 0 there are no valid transitions from that node.
1835
1836 - If .base is nonzero then charid is added to it to find an entry in
1837 the trans array.
1838
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1844 valid.
1845
786e8c11 1846 XXX - wrong maybe?
a3621e74
YO
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1852 0.
1853
1854 - We set 'pos' to indicate the first entry of the second node.
1855
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1862 compression.
1863
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1866
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
786e8c11 1872 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1873
1874 - Once compressed we can Renew/realloc the structures to release the
1875 excess space.
1876
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1879
1880 demq
1881 */
a3b680e6 1882 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1883 U32 state, charid;
a3621e74 1884 U32 pos = 0, zp=0;
1e2e3d02 1885 trie->statecount = laststate;
a3621e74
YO
1886
1887 for ( state = 1 ; state < laststate ; state++ ) {
1888 U8 flag = 0;
a28509cc
AL
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1892 trie->trans[ stateidx ].check = 0;
1893
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1897 if (o_used == 1) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1900 break;
1901 }
1902 }
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1907 break;
1908 }
1909 used--;
1910 }
1911 if ( !flag ) {
1912 flag = 1;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1914 }
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1917 pos++;
1918 }
1919 }
1920 }
cc601c31 1921 trie->lasttrans = pos + 1;
c944940b
JH
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
a3621e74 1925 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1926 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1930 (IV)next_alloc,
1931 (IV)pos,
a3621e74
YO
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1933 );
1934
1935 } /* end table compress */
1936 }
1e2e3d02
YO
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1942 );
cc601c31 1943 /* resize the trans array to remove unused space */
c944940b
JH
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
a3621e74 1947
3dab1dad 1948 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1949 U8 nodetype =(U8)(flags & 0xFF);
1950 char *str=NULL;
786e8c11 1951
07be1b83 1952#ifdef DEBUGGING
e62cc96a 1953 regnode *optimize = NULL;
7122b237
YO
1954#ifdef RE_TRACK_PATTERN_OFFSETS
1955
b57a0404
JH
1956 U32 mjd_offset = 0;
1957 U32 mjd_nodelen = 0;
7122b237
YO
1958#endif /* RE_TRACK_PATTERN_OFFSETS */
1959#endif /* DEBUGGING */
a3621e74 1960 /*
3dab1dad
YO
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
a3621e74 1967 */
3dab1dad 1968 /* Find the node we are going to overwrite */
7f69552c 1969 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1970 /* branch sub-chain */
3dab1dad 1971 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1972#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1973 DEBUG_r({
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1976 });
7122b237 1977#endif
7f69552c 1978 /* whole branch chain */
7122b237
YO
1979 }
1980#ifdef RE_TRACK_PATTERN_OFFSETS
1981 else {
7f69552c
YO
1982 DEBUG_r({
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1986 });
07be1b83
YO
1987 }
1988 DEBUG_OPTIMISE_r(
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
786e8c11 1991 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1992 );
7122b237 1993#endif
3dab1dad
YO
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
55eed653 1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1998 U32 state;
1e2e3d02 1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2000 U32 ofs = 0;
8e11feef
RGS
2001 I32 idx = -1;
2002 U32 count = 0;
2003 const U32 base = trie->states[ state ].trans.base;
a3621e74 2004
3dab1dad 2005 if ( trie->states[state].wordnum )
8e11feef 2006 count = 1;
a3621e74 2007
8e11feef 2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2012 {
3dab1dad 2013 if ( ++count > 1 ) {
2b8b4781 2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2016 if ( state == 1 ) break;
3dab1dad
YO
2017 if ( count == 2 ) {
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2019 DEBUG_OPTIMISE_r(
8e11feef
RGS
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
786e8c11 2023 (UV)state));
be8e71aa 2024 if (idx >= 0) {
2b8b4781 2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2027
3dab1dad 2028 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2029 if ( folder )
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2031 DEBUG_OPTIMISE_r(
f1f66076 2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2033 );
8e11feef
RGS
2034 }
2035 }
2036 TRIE_BITMAP_SET(trie,*ch);
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2040 }
2041 idx = ofs;
2042 }
3dab1dad
YO
2043 }
2044 if ( count == 1 ) {
2b8b4781 2045 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2046 STRLEN len;
2047 char *ch = SvPV( *tmp, len );
de734bd5
A
2048 DEBUG_OPTIMISE_r({
2049 SV *sv=sv_newmortal();
8e11feef
RGS
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
de734bd5
A
2053 (UV)state, (UV)idx,
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2058 )
2059 );
2060 });
3dab1dad
YO
2061 if ( state==1 ) {
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2064 STR_LEN(convert)=0;
2065 }
c490c714
YO
2066 STR_LEN(convert) += len;
2067 while (len--)
de734bd5 2068 *str++ = *ch++;
8e11feef 2069 } else {
f9049ba1 2070#ifdef DEBUGGING
8e11feef
RGS
2071 if (state>1)
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2073#endif
8e11feef
RGS
2074 break;
2075 }
2076 }
2e64971a 2077 trie->prefixlen = (state-1);
3dab1dad 2078 if (str) {
8e11feef 2079 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2081 trie->startstate = state;
07be1b83
YO
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
33809eae
JH
2084#ifdef DEBUGGING
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2087 * it right here. */
2088 if (
2089#ifdef PERL_EXT_RE_BUILD
2090 1
2091#else
2092 DEBUG_r_TEST
2093#endif
2094 ) {
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2097 mjd_nodelen++;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2101 }
2102 while (word--) {
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2104 if (tmp) {
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2107 else
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2109 }
2110 }
2111 }
2112#endif
8e11feef
RGS
2113 if (trie->maxlen) {
2114 convert = n;
2115 } else {
3dab1dad 2116 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2117 DEBUG_r(optimize= n);
3dab1dad
YO
2118 }
2119 }
2120 }
a5ca303d
YO
2121 if (!jumper)
2122 jumper = last;
3dab1dad 2123 if ( trie->maxlen ) {
8e11feef
RGS
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
786e8c11
YO
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2129 if (trie->jump)
7f69552c 2130 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2131
786e8c11
YO
2132 /* XXXX */
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2135 {
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2138 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2139 trie->bitmap= NULL;
2140 } else
2141 OP( convert ) = TRIE;
a3621e74 2142
3dab1dad
YO
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
a5ca303d
YO
2145 DEBUG_r({
2146 optimize = convert
2147 + NODE_STEP_REGNODE
2148 + regarglen[ OP( convert ) ];
2149 });
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
3dab1dad 2152 }
a3621e74 2153 /* needed for dumping*/
e62cc96a 2154 DEBUG_r(if (optimize) {
07be1b83 2155 regnode *opt = convert;
bcdf7404 2156
e62cc96a 2157 while ( ++opt < optimize) {
07be1b83
YO
2158 Set_Node_Offset_Length(opt,0,0);
2159 }
786e8c11
YO
2160 /*
2161 Try to clean up some of the debris left after the
2162 optimisation.
a3621e74 2163 */
786e8c11 2164 while( optimize < jumper ) {
07be1b83 2165 mjd_nodelen += Node_Length((optimize));
a3621e74 2166 OP( optimize ) = OPTIMIZED;
07be1b83 2167 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2168 optimize++;
2169 }
07be1b83 2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2171 });
2172 } /* end node insert */
2e64971a
DM
2173
2174 /* Finish populating the prev field of the wordinfo array. Walk back
2175 * from each accept state until we find another accept state, and if
2176 * so, point the first word's .prev field at the second word. If the
2177 * second already has a .prev field set, stop now. This will be the
2178 * case either if we've already processed that word's accept state,
2179 * or that that state had multiple words, and the overspill words
2180 * were already linked up earlier.
2181 */
2182 {
2183 U16 word;
2184 U32 state;
2185 U16 prev;
2186
2187 for (word=1; word <= trie->wordcount; word++) {
2188 prev = 0;
2189 if (trie->wordinfo[word].prev)
2190 continue;
2191 state = trie->wordinfo[word].accept;
2192 while (state) {
2193 state = prev_states[state];
2194 if (!state)
2195 break;
2196 prev = trie->states[state].wordnum;
2197 if (prev)
2198 break;
2199 }
2200 trie->wordinfo[word].prev = prev;
2201 }
2202 Safefree(prev_states);
2203 }
2204
2205
2206 /* and now dump out the compressed format */
2207 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2208
55eed653 2209 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2210#ifdef DEBUGGING
2211 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2212 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2213#else
2214 SvREFCNT_dec(revcharmap);
07be1b83 2215#endif
786e8c11
YO
2216 return trie->jump
2217 ? MADE_JUMP_TRIE
2218 : trie->startstate>1
2219 ? MADE_EXACT_TRIE
2220 : MADE_TRIE;
2221}
2222
2223STATIC void
2224S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2225{
2226/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2227
2228 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2229 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2230 ISBN 0-201-10088-6
2231
2232 We find the fail state for each state in the trie, this state is the longest proper
2233 suffix of the current states 'word' that is also a proper prefix of another word in our
2234 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2235 the DFA not to have to restart after its tried and failed a word at a given point, it
2236 simply continues as though it had been matching the other word in the first place.
2237 Consider
2238 'abcdgu'=~/abcdefg|cdgu/
2239 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2240 fail, which would bring use to the state representing 'd' in the second word where we would
2241 try 'g' and succeed, prodceding to match 'cdgu'.
2242 */
2243 /* add a fail transition */
3251b653
NC
2244 const U32 trie_offset = ARG(source);
2245 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2246 U32 *q;
2247 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2248 const U32 numstates = trie->statecount;
786e8c11
YO
2249 const U32 ubound = trie->lasttrans + ucharcount;
2250 U32 q_read = 0;
2251 U32 q_write = 0;
2252 U32 charid;
2253 U32 base = trie->states[ 1 ].trans.base;
2254 U32 *fail;
2255 reg_ac_data *aho;
2256 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2257 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2258
2259 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2260#ifndef DEBUGGING
2261 PERL_UNUSED_ARG(depth);
2262#endif
2263
2264
2265 ARG_SET( stclass, data_slot );
c944940b 2266 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2267 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2268 aho->trie=trie_offset;
446bd890
NC
2269 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2270 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2271 Newxz( q, numstates, U32);
c944940b 2272 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2273 aho->refcount = 1;
2274 fail = aho->fail;
2275 /* initialize fail[0..1] to be 1 so that we always have
2276 a valid final fail state */
2277 fail[ 0 ] = fail[ 1 ] = 1;
2278
2279 for ( charid = 0; charid < ucharcount ; charid++ ) {
2280 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2281 if ( newstate ) {
2282 q[ q_write ] = newstate;
2283 /* set to point at the root */
2284 fail[ q[ q_write++ ] ]=1;
2285 }
2286 }
2287 while ( q_read < q_write) {
2288 const U32 cur = q[ q_read++ % numstates ];
2289 base = trie->states[ cur ].trans.base;
2290
2291 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2292 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2293 if (ch_state) {
2294 U32 fail_state = cur;
2295 U32 fail_base;
2296 do {
2297 fail_state = fail[ fail_state ];
2298 fail_base = aho->states[ fail_state ].trans.base;
2299 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2300
2301 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2302 fail[ ch_state ] = fail_state;
2303 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2304 {
2305 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2306 }
2307 q[ q_write++ % numstates] = ch_state;
2308 }
2309 }
2310 }
2311 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2312 when we fail in state 1, this allows us to use the
2313 charclass scan to find a valid start char. This is based on the principle
2314 that theres a good chance the string being searched contains lots of stuff
2315 that cant be a start char.
2316 */
2317 fail[ 0 ] = fail[ 1 ] = 0;
2318 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2319 PerlIO_printf(Perl_debug_log,
2320 "%*sStclass Failtable (%"UVuf" states): 0",
2321 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2322 );
786e8c11
YO
2323 for( q_read=1; q_read<numstates; q_read++ ) {
2324 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2325 }
2326 PerlIO_printf(Perl_debug_log, "\n");
2327 });
2328 Safefree(q);
2329 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2330}
2331
786e8c11 2332
a3621e74 2333/*
5d1c421c
JH
2334 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2335 * These need to be revisited when a newer toolchain becomes available.
2336 */
2337#if defined(__sparc64__) && defined(__GNUC__)
2338# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2339# undef SPARC64_GCC_WORKAROUND
2340# define SPARC64_GCC_WORKAROUND 1
2341# endif
2342#endif
2343
07be1b83 2344#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2345 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2346 SV * const mysv=sv_newmortal(); \
2347 regnode *Next = regnext(scan); \
2348 regprop(RExC_rx, mysv, scan); \
7f69552c 2349 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2350 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2351 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2352 }});
07be1b83 2353
1de06328
YO
2354
2355
2356
2357
07be1b83
YO
2358#define JOIN_EXACT(scan,min,flags) \
2359 if (PL_regkind[OP(scan)] == EXACT) \
2360 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2361
be8e71aa 2362STATIC U32
07be1b83
YO
2363S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2364 /* Merge several consecutive EXACTish nodes into one. */
2365 regnode *n = regnext(scan);
2366 U32 stringok = 1;
2367 regnode *next = scan + NODE_SZ_STR(scan);
2368 U32 merged = 0;
2369 U32 stopnow = 0;
2370#ifdef DEBUGGING
2371 regnode *stop = scan;
72f13be8 2372 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2373#else
d47053eb
RGS
2374 PERL_UNUSED_ARG(depth);
2375#endif
7918f24d
NC
2376
2377 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2378#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2379 PERL_UNUSED_ARG(flags);
2380 PERL_UNUSED_ARG(val);
07be1b83 2381#endif
07be1b83
YO
2382 DEBUG_PEEP("join",scan,depth);
2383
2384 /* Skip NOTHING, merge EXACT*. */
2385 while (n &&
2386 ( PL_regkind[OP(n)] == NOTHING ||
2387 (stringok && (OP(n) == OP(scan))))
2388 && NEXT_OFF(n)
2389 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2390
2391 if (OP(n) == TAIL || n > next)
2392 stringok = 0;
2393 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2394 DEBUG_PEEP("skip:",n,depth);
2395 NEXT_OFF(scan) += NEXT_OFF(n);
2396 next = n + NODE_STEP_REGNODE;
2397#ifdef DEBUGGING
2398 if (stringok)
2399 stop = n;
2400#endif
2401 n = regnext(n);
2402 }
2403 else if (stringok) {
786e8c11 2404 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2405 regnode * const nnext = regnext(n);
2406
2407 DEBUG_PEEP("merg",n,depth);
2408
2409 merged++;
2410 if (oldl + STR_LEN(n) > U8_MAX)
2411 break;
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 STR_LEN(scan) += STR_LEN(n);
2414 next = n + NODE_SZ_STR(n);
2415 /* Now we can overwrite *n : */
2416 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2417#ifdef DEBUGGING
2418 stop = next - 1;
2419#endif
2420 n = nnext;
2421 if (stopnow) break;
2422 }
2423
d47053eb
RGS
2424#ifdef EXPERIMENTAL_INPLACESCAN
2425 if (flags && !NEXT_OFF(n)) {
2426 DEBUG_PEEP("atch", val, depth);
2427 if (reg_off_by_arg[OP(n)]) {
2428 ARG_SET(n, val - n);
2429 }
2430 else {
2431 NEXT_OFF(n) = val - n;
2432 }
2433 stopnow = 1;
2434 }
07be1b83
YO
2435#endif
2436 }
2437
2438 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2439 /*
2440 Two problematic code points in Unicode casefolding of EXACT nodes:
2441
2442 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2443 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2444
2445 which casefold to
2446
2447 Unicode UTF-8
2448
2449 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2450 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2451
2452 This means that in case-insensitive matching (or "loose matching",
2453 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2454 length of the above casefolded versions) can match a target string
2455 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2456 This would rather mess up the minimum length computation.
2457
2458 What we'll do is to look for the tail four bytes, and then peek
2459 at the preceding two bytes to see whether we need to decrease
2460 the minimum length by four (six minus two).
2461
2462 Thanks to the design of UTF-8, there cannot be false matches:
2463 A sequence of valid UTF-8 bytes cannot be a subsequence of
2464 another valid sequence of UTF-8 bytes.
2465
2466 */
2467 char * const s0 = STRING(scan), *s, *t;
2468 char * const s1 = s0 + STR_LEN(scan) - 1;
2469 char * const s2 = s1 - 4;
e294cc5d
JH
2470#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2471 const char t0[] = "\xaf\x49\xaf\x42";
2472#else
07be1b83 2473 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2474#endif
07be1b83
YO
2475 const char * const t1 = t0 + 3;
2476
2477 for (s = s0 + 2;
2478 s < s2 && (t = ninstr(s, s1, t0, t1));
2479 s = t + 4) {
e294cc5d
JH
2480#ifdef EBCDIC
2481 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2482 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2483#else
07be1b83
YO
2484 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2485 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2486#endif
07be1b83
YO
2487 *min -= 4;
2488 }
2489 }
2490
2491#ifdef DEBUGGING
2492 /* Allow dumping */
2493 n = scan + NODE_SZ_STR(scan);
2494 while (n <= stop) {
2495 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2496 OP(n) = OPTIMIZED;
2497 NEXT_OFF(n) = 0;
2498 }
2499 n++;
2500 }
2501#endif
2502 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2503 return stopnow;
2504}
2505
653099ff
GS
2506/* REx optimizer. Converts nodes into quickier variants "in place".
2507 Finds fixed substrings. */
2508
a0288114 2509/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2510 to the position after last scanned or to NULL. */
2511
40d049e4
YO
2512#define INIT_AND_WITHP \
2513 assert(!and_withp); \
2514 Newx(and_withp,1,struct regnode_charclass_class); \
2515 SAVEFREEPV(and_withp)
07be1b83 2516
b515a41d
YO
2517/* this is a chain of data about sub patterns we are processing that
2518 need to be handled seperately/specially in study_chunk. Its so
2519 we can simulate recursion without losing state. */
2520struct scan_frame;
2521typedef struct scan_frame {
2522 regnode *last; /* last node to process in this frame */
2523 regnode *next; /* next node to process when last is reached */
2524 struct scan_frame *prev; /*previous frame*/
2525 I32 stop; /* what stopparen do we use */
2526} scan_frame;
2527
304ee84b
YO
2528
2529#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2530
e1d1eefb
YO
2531#define CASE_SYNST_FNC(nAmE) \
2532case nAmE: \
2533 if (flags & SCF_DO_STCLASS_AND) { \
2534 for (value = 0; value < 256; value++) \
2535 if (!is_ ## nAmE ## _cp(value)) \
2536 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2537 } \
2538 else { \
2539 for (value = 0; value < 256; value++) \
2540 if (is_ ## nAmE ## _cp(value)) \
2541 ANYOF_BITMAP_SET(data->start_class, value); \
2542 } \
2543 break; \
2544case N ## nAmE: \
2545 if (flags & SCF_DO_STCLASS_AND) { \
2546 for (value = 0; value < 256; value++) \
2547 if (is_ ## nAmE ## _cp(value)) \
2548 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2549 } \
2550 else { \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_SET(data->start_class, value); \
2554 } \
2555 break
2556
2557
2558
76e3520e 2559STATIC I32
40d049e4 2560S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2561 I32 *minlenp, I32 *deltap,
40d049e4
YO
2562 regnode *last,
2563 scan_data_t *data,
2564 I32 stopparen,
2565 U8* recursed,
2566 struct regnode_charclass_class *and_withp,
2567 U32 flags, U32 depth)
c277df42
IZ
2568 /* scanp: Start here (read-write). */
2569 /* deltap: Write maxlen-minlen here. */
2570 /* last: Stop before this one. */
40d049e4
YO
2571 /* data: string data about the pattern */
2572 /* stopparen: treat close N as END */
2573 /* recursed: which subroutines have we recursed into */
2574 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2575{
97aff369 2576 dVAR;
c277df42
IZ
2577 I32 min = 0, pars = 0, code;
2578 regnode *scan = *scanp, *next;
2579 I32 delta = 0;
2580 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2581 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2582 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2583 scan_data_t data_fake;
a3621e74 2584 SV *re_trie_maxbuff = NULL;
786e8c11 2585 regnode *first_non_open = scan;
e2e6a0f1 2586 I32 stopmin = I32_MAX;
8aa23a47 2587 scan_frame *frame = NULL;
a3621e74 2588 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2589
7918f24d
NC
2590 PERL_ARGS_ASSERT_STUDY_CHUNK;
2591
13a24bad 2592#ifdef DEBUGGING
40d049e4 2593 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2594#endif
40d049e4 2595
786e8c11 2596 if ( depth == 0 ) {
40d049e4 2597 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2598 first_non_open=regnext(first_non_open);
2599 }
2600
b81d288d 2601
8aa23a47
YO
2602 fake_study_recurse:
2603 while ( scan && OP(scan) != END && scan < last ){
2604 /* Peephole optimizer: */
304ee84b 2605 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2606 DEBUG_PEEP("Peep",scan,depth);
2607 JOIN_EXACT(scan,&min,0);
2608
2609 /* Follow the next-chain of the current node and optimize
2610 away all the NOTHINGs from it. */
2611 if (OP(scan) != CURLYX) {
2612 const int max = (reg_off_by_arg[OP(scan)]
2613 ? I32_MAX
2614 /* I32 may be smaller than U16 on CRAYs! */
2615 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2616 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2617 int noff;
2618 regnode *n = scan;
2619
2620 /* Skip NOTHING and LONGJMP. */
2621 while ((n = regnext(n))
2622 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2623 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2624 && off + noff < max)
2625 off += noff;
2626 if (reg_off_by_arg[OP(scan)])
2627 ARG(scan) = off;
2628 else
2629 NEXT_OFF(scan) = off;
2630 }
a3621e74 2631
c277df42 2632
8aa23a47
YO
2633
2634 /* The principal pseudo-switch. Cannot be a switch, since we
2635 look into several different things. */
2636 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2637 || OP(scan) == IFTHEN) {
2638 next = regnext(scan);
2639 code = OP(scan);
2640 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2641
2642 if (OP(next) == code || code == IFTHEN) {
2643 /* NOTE - There is similar code to this block below for handling
2644 TRIE nodes on a re-study. If you change stuff here check there
2645 too. */
2646 I32 max1 = 0, min1 = I32_MAX, num = 0;
2647 struct regnode_charclass_class accum;
2648 regnode * const startbranch=scan;
2649
2650 if (flags & SCF_DO_SUBSTR)
304ee84b 2651 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2652 if (flags & SCF_DO_STCLASS)
2653 cl_init_zero(pRExC_state, &accum);
2654
2655 while (OP(scan) == code) {
2656 I32 deltanext, minnext, f = 0, fake;
2657 struct regnode_charclass_class this_class;
2658
2659 num++;
2660 data_fake.flags = 0;
2661 if (data) {
2662 data_fake.whilem_c = data->whilem_c;
2663 data_fake.last_closep = data->last_closep;
2664 }
2665 else
2666 data_fake.last_closep = &fake;
58e23c8d
YO
2667
2668 data_fake.pos_delta = delta;
8aa23a47
YO
2669 next = regnext(scan);
2670 scan = NEXTOPER(scan);
2671 if (code != BRANCH)
c277df42 2672 scan = NEXTOPER(scan);
8aa23a47
YO
2673 if (flags & SCF_DO_STCLASS) {
2674 cl_init(pRExC_state, &this_class);
2675 data_fake.start_class = &this_class;
2676 f = SCF_DO_STCLASS_AND;
58e23c8d 2677 }
8aa23a47
YO
2678 if (flags & SCF_WHILEM_VISITED_POS)
2679 f |= SCF_WHILEM_VISITED_POS;
2680
2681 /* we suppose the run is continuous, last=next...*/
2682 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2683 next, &data_fake,
2684 stopparen, recursed, NULL, f,depth+1);
2685 if (min1 > minnext)
2686 min1 = minnext;
2687 if (max1 < minnext + deltanext)
2688 max1 = minnext + deltanext;
2689 if (deltanext == I32_MAX)
2690 is_inf = is_inf_internal = 1;
2691 scan = next;
2692 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2693 pars++;
2694 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2695 if ( stopmin > minnext)
2696 stopmin = min + min1;
2697 flags &= ~SCF_DO_SUBSTR;
2698 if (data)
2699 data->flags |= SCF_SEEN_ACCEPT;
2700 }
2701 if (data) {
2702 if (data_fake.flags & SF_HAS_EVAL)
2703 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
3dab1dad 2705 }
8aa23a47
YO
2706 if (flags & SCF_DO_STCLASS)
2707 cl_or(pRExC_state, &accum, &this_class);
2708 }
2709 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2710 min1 = 0;
2711 if (flags & SCF_DO_SUBSTR) {
2712 data->pos_min += min1;
2713 data->pos_delta += max1 - min1;
2714 if (max1 != min1 || is_inf)
2715 data->longest = &(data->longest_float);
2716 }
2717 min += min1;
2718 delta += max1 - min1;
2719 if (flags & SCF_DO_STCLASS_OR) {
2720 cl_or(pRExC_state, data->start_class, &accum);
2721 if (min1) {
2722 cl_and(data->start_class, and_withp);
2723 flags &= ~SCF_DO_STCLASS;
653099ff 2724 }
8aa23a47
YO
2725 }
2726 else if (flags & SCF_DO_STCLASS_AND) {
2727 if (min1) {
2728 cl_and(data->start_class, &accum);
2729 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2730 }
8aa23a47
YO
2731 else {
2732 /* Switch to OR mode: cache the old value of
2733 * data->start_class */
2734 INIT_AND_WITHP;
2735 StructCopy(data->start_class, and_withp,
2736 struct regnode_charclass_class);
2737 flags &= ~SCF_DO_STCLASS_AND;
2738 StructCopy(&accum, data->start_class,
2739 struct regnode_charclass_class);
2740 flags |= SCF_DO_STCLASS_OR;
2741 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2742 }
8aa23a47 2743 }
a3621e74 2744
8aa23a47
YO
2745 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2746 /* demq.
a3621e74 2747
8aa23a47
YO
2748 Assuming this was/is a branch we are dealing with: 'scan' now
2749 points at the item that follows the branch sequence, whatever
2750 it is. We now start at the beginning of the sequence and look
2751 for subsequences of
a3621e74 2752
8aa23a47
YO
2753 BRANCH->EXACT=>x1
2754 BRANCH->EXACT=>x2
2755 tail
a3621e74 2756
8aa23a47 2757 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2758
8aa23a47
YO
2759 If we can find such a subseqence we need to turn the first
2760 element into a trie and then add the subsequent branch exact
2761 strings to the trie.
a3621e74 2762
8aa23a47 2763 We have two cases
a3621e74 2764
8aa23a47 2765 1. patterns where the whole set of branch can be converted.
a3621e74 2766
8aa23a47 2767 2. patterns where only a subset can be converted.
a3621e74 2768
8aa23a47
YO
2769 In case 1 we can replace the whole set with a single regop
2770 for the trie. In case 2 we need to keep the start and end
2771 branchs so
a3621e74 2772
8aa23a47
YO
2773 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2774 becomes BRANCH TRIE; BRANCH X;
786e8c11 2775
8aa23a47
YO
2776 There is an additional case, that being where there is a
2777 common prefix, which gets split out into an EXACT like node
2778 preceding the TRIE node.
a3621e74 2779
8aa23a47
YO
2780 If x(1..n)==tail then we can do a simple trie, if not we make
2781 a "jump" trie, such that when we match the appropriate word
2782 we "jump" to the appopriate tail node. Essentailly we turn
2783 a nested if into a case structure of sorts.
b515a41d 2784
8aa23a47
YO
2785 */
2786
2787 int made=0;
2788 if (!re_trie_maxbuff) {
2789 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2790 if (!SvIOK(re_trie_maxbuff))
2791 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2792 }
2793 if ( SvIV(re_trie_maxbuff)>=0 ) {
2794 regnode *cur;
2795 regnode *first = (regnode *)NULL;
2796 regnode *last = (regnode *)NULL;
2797 regnode *tail = scan;
2798 U8 optype = 0;
2799 U32 count=0;
a3621e74
YO
2800
2801#ifdef DEBUGGING
8aa23a47 2802 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2803#endif
8aa23a47
YO
2804 /* var tail is used because there may be a TAIL
2805 regop in the way. Ie, the exacts will point to the
2806 thing following the TAIL, but the last branch will
2807 point at the TAIL. So we advance tail. If we
2808 have nested (?:) we may have to move through several
2809 tails.
2810 */
2811
2812 while ( OP( tail ) == TAIL ) {
2813 /* this is the TAIL generated by (?:) */
2814 tail = regnext( tail );
2815 }
a3621e74 2816
8aa23a47
YO
2817
2818 DEBUG_OPTIMISE_r({
2819 regprop(RExC_rx, mysv, tail );
2820 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2821 (int)depth * 2 + 2, "",
2822 "Looking for TRIE'able sequences. Tail node is: ",
2823 SvPV_nolen_const( mysv )
2824 );
2825 });
2826
2827 /*
2828
2829 step through the branches, cur represents each
2830 branch, noper is the first thing to be matched
2831 as part of that branch and noper_next is the
2832 regnext() of that node. if noper is an EXACT
2833 and noper_next is the same as scan (our current
2834 position in the regex) then the EXACT branch is
2835 a possible optimization target. Once we have
2836 two or more consequetive such branches we can
2837 create a trie of the EXACT's contents and stich
2838 it in place. If the sequence represents all of
2839 the branches we eliminate the whole thing and
2840 replace it with a single TRIE. If it is a
2841 subsequence then we need to stitch it in. This
2842 means the first branch has to remain, and needs
2843 to be repointed at the item on the branch chain
2844 following the last branch optimized. This could
2845 be either a BRANCH, in which case the
2846 subsequence is internal, or it could be the
2847 item following the branch sequence in which
2848 case the subsequence is at the end.
2849
2850 */
2851
2852 /* dont use tail as the end marker for this traverse */
2853 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2854 regnode * const noper = NEXTOPER( cur );
b515a41d 2855#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2856 regnode * const noper_next = regnext( noper );
b515a41d
YO
2857#endif
2858
8aa23a47
YO
2859 DEBUG_OPTIMISE_r({
2860 regprop(RExC_rx, mysv, cur);
2861 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2862 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2863
2864 regprop(RExC_rx, mysv, noper);
2865 PerlIO_printf( Perl_debug_log, " -> %s",
2866 SvPV_nolen_const(mysv));
2867
2868 if ( noper_next ) {
2869 regprop(RExC_rx, mysv, noper_next );
2870 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2871 SvPV_nolen_const(mysv));
2872 }
2873 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2874 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2875 });
2876 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2877 : PL_regkind[ OP( noper ) ] == EXACT )
2878 || OP(noper) == NOTHING )
786e8c11 2879#ifdef NOJUMPTRIE
8aa23a47 2880 && noper_next == tail
786e8c11 2881#endif
8aa23a47
YO
2882 && count < U16_MAX)
2883 {
2884 count++;
2885 if ( !first || optype == NOTHING ) {
2886 if (!first) first = cur;
2887 optype = OP( noper );
2888 } else {
2889 last = cur;
2890 }
2891 } else {
a0a388a1 2892/*
0abd0d78
YO
2893 Currently we do not believe that the trie logic can
2894 handle case insensitive matching properly when the
2895 pattern is not unicode (thus forcing unicode semantics).
2896
2897 If/when this is fixed the following define can be swapped
2898 in below to fully enable trie logic.
2899
a0a388a1 2900#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2901
2902*/
2903#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2904
a0a388a1 2905 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2906 make_trie( pRExC_state,
2907 startbranch, first, cur, tail, count,
2908 optype, depth+1 );
2909 }
2910 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2911#ifdef NOJUMPTRIE
8aa23a47 2912 && noper_next == tail
786e8c11 2913#endif
8aa23a47
YO
2914 ){
2915 count = 1;
2916 first = cur;
2917 optype = OP( noper );
2918 } else {
2919 count = 0;
2920 first = NULL;
2921 optype = 0;
2922 }
2923 last = NULL;
2924 }
2925 }
2926 DEBUG_OPTIMISE_r({
2927 regprop(RExC_rx, mysv, cur);
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2930 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2931
2932 });
a0a388a1
YO
2933
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2935 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2936#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2937 if ( ((made == MADE_EXACT_TRIE &&
2938 startbranch == first)
2939 || ( first_non_open == first )) &&
2940 depth==0 ) {
2941 flags |= SCF_TRIE_RESTUDY;
2942 if ( startbranch == first
2943 && scan == tail )
2944 {
2945 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2946 }
2947 }
3dab1dad 2948#endif
8aa23a47
YO
2949 }
2950 }
2951
2952 } /* do trie */
2953
653099ff 2954 }
8aa23a47
YO
2955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2956 scan = NEXTOPER(NEXTOPER(scan));
2957 } else /* single branch is optimized. */
2958 scan = NEXTOPER(scan);
2959 continue;
2960 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2961 scan_frame *newframe = NULL;
2962 I32 paren;
2963 regnode *start;
2964 regnode *end;
2965
2966 if (OP(scan) != SUSPEND) {
2967 /* set the pointer */
2968 if (OP(scan) == GOSUB) {
2969 paren = ARG(scan);
2970 RExC_recurse[ARG2L(scan)] = scan;
2971 start = RExC_open_parens[paren-1];
2972 end = RExC_close_parens[paren-1];
2973 } else {
2974 paren = 0;
f8fc2ecf 2975 start = RExC_rxi->program + 1;
8aa23a47
YO
2976 end = RExC_opend;
2977 }
2978 if (!recursed) {
2979 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2980 SAVEFREEPV(recursed);
2981 }
2982 if (!PAREN_TEST(recursed,paren+1)) {
2983 PAREN_SET(recursed,paren+1);
2984 Newx(newframe,1,scan_frame);
2985 } else {
2986 if (flags & SCF_DO_SUBSTR) {
304ee84b 2987 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2988 data->longest = &(data->longest_float);
2989 }
2990 is_inf = is_inf_internal = 1;
2991 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2992 cl_anything(pRExC_state, data->start_class);
2993 flags &= ~SCF_DO_STCLASS;
2994 }
2995 } else {
2996 Newx(newframe,1,scan_frame);
2997 paren = stopparen;
2998 start = scan+2;
2999 end = regnext(scan);
3000 }
3001 if (newframe) {
3002 assert(start);
3003 assert(end);
3004 SAVEFREEPV(newframe);
3005 newframe->next = regnext(scan);
3006 newframe->last = last;
3007 newframe->stop = stopparen;
3008 newframe->prev = frame;
3009
3010 frame = newframe;
3011 scan = start;
3012 stopparen = paren;
3013 last = end;
3014
3015 continue;
3016 }
3017 }
3018 else if (OP(scan) == EXACT) {
3019 I32 l = STR_LEN(scan);
3020 UV uc;
3021 if (UTF) {
3022 const U8 * const s = (U8*)STRING(scan);
3023 l = utf8_length(s, s + l);
3024 uc = utf8_to_uvchr(s, NULL);
3025 } else {
3026 uc = *((U8*)STRING(scan));
3027 }
3028 min += l;
3029 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3030 /* The code below prefers earlier match for fixed
3031 offset, later match for variable offset. */
3032 if (data->last_end == -1) { /* Update the start info. */
3033 data->last_start_min = data->pos_min;
3034 data->last_start_max = is_inf
3035 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3036 }
8aa23a47
YO
3037 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3038 if (UTF)
3039 SvUTF8_on(data->last_found);
3040 {
3041 SV * const sv = data->last_found;
3042 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3044 if (mg && mg->mg_len >= 0)
3045 mg->mg_len += utf8_length((U8*)STRING(scan),
3046 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3047 }
8aa23a47
YO
3048 data->last_end = data->pos_min + l;
3049 data->pos_min += l; /* As in the first entry. */
3050 data->flags &= ~SF_BEFORE_EOL;
3051 }
3052 if (flags & SCF_DO_STCLASS_AND) {
3053 /* Check whether it is compatible with what we know already! */
3054 int compat = 1;
3055
3056 if (uc >= 0x100 ||
3057 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3058 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3059 && (!(data->start_class->flags & ANYOF_FOLD)
3060 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3061 )
3062 compat = 0;
3063 ANYOF_CLASS_ZERO(data->start_class);
3064 ANYOF_BITMAP_ZERO(data->start_class);
3065 if (compat)
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3068 if (uc < 0x100)
3069 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3070 }
3071 else if (flags & SCF_DO_STCLASS_OR) {
3072 /* false positive possible if the class is case-folded */
3073 if (uc < 0x100)
3074 ANYOF_BITMAP_SET(data->start_class, uc);
3075 else
3076 data->start_class->flags |= ANYOF_UNICODE_ALL;
3077 data->start_class->flags &= ~ANYOF_EOS;
3078 cl_and(data->start_class, and_withp);
3079 }
3080 flags &= ~SCF_DO_STCLASS;
3081 }
3082 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3083 I32 l = STR_LEN(scan);
3084 UV uc = *((U8*)STRING(scan));
3085
3086 /* Search for fixed substrings supports EXACT only. */
3087 if (flags & SCF_DO_SUBSTR) {
3088 assert(data);
304ee84b 3089 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3090 }
3091 if (UTF) {
3092 const U8 * const s = (U8 *)STRING(scan);
3093 l = utf8_length(s, s + l);
3094 uc = utf8_to_uvchr(s, NULL);
3095 }
3096 min += l;
3097 if (flags & SCF_DO_SUBSTR)
3098 data->pos_min += l;
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 /* Check whether it is compatible with what we know already! */
3101 int compat = 1;
3102
3103 if (uc >= 0x100 ||
3104 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3105 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3106 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3107 compat = 0;
3108 ANYOF_CLASS_ZERO(data->start_class);
3109 ANYOF_BITMAP_ZERO(data->start_class);
3110 if (compat) {
3111 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3112 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3113 data->start_class->flags |= ANYOF_FOLD;
3114 if (OP(scan) == EXACTFL)
3115 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3116 }
8aa23a47
YO
3117 }
3118 else if (flags & SCF_DO_STCLASS_OR) {
3119 if (data->start_class->flags & ANYOF_FOLD) {
3120 /* false positive possible if the class is case-folded.
3121 Assume that the locale settings are the same... */
1aa99e6b
IH
3122 if (uc < 0x100)
3123 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3124 data->start_class->flags &= ~ANYOF_EOS;
3125 }
8aa23a47 3126 cl_and(data->start_class, and_withp);
653099ff 3127 }
8aa23a47
YO
3128 flags &= ~SCF_DO_STCLASS;
3129 }
e52fc539 3130 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3131 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3132 I32 f = flags, pos_before = 0;
3133 regnode * const oscan = scan;
3134 struct regnode_charclass_class this_class;
3135 struct regnode_charclass_class *oclass = NULL;
3136 I32 next_is_eval = 0;
3137
3138 switch (PL_regkind[OP(scan)]) {
3139 case WHILEM: /* End of (?:...)* . */
3140 scan = NEXTOPER(scan);
3141 goto finish;
3142 case PLUS:
3143 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3144 next = NEXTOPER(scan);
3145 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3146 mincount = 1;
3147 maxcount = REG_INFTY;
3148 next = regnext(scan);
3149 scan = NEXTOPER(scan);
3150 goto do_curly;
3151 }
3152 }
3153 if (flags & SCF_DO_SUBSTR)
3154 data->pos_min++;
3155 min++;
3156 /* Fall through. */
3157 case STAR:
3158 if (flags & SCF_DO_STCLASS) {
3159 mincount = 0;
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3163 goto do_curly;
3164 }
3165 is_inf = is_inf_internal = 1;
3166 scan = regnext(scan);
c277df42 3167 if (flags & SCF_DO_SUBSTR) {
304ee84b 3168 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3169 data->longest = &(data->longest_float);
c277df42 3170 }
8aa23a47
YO
3171 goto optimize_curly_tail;
3172 case CURLY:
3173 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3174 && (scan->flags == stopparen))
3175 {
3176 mincount = 1;
3177 maxcount = 1;
3178 } else {
3179 mincount = ARG1(scan);
3180 maxcount = ARG2(scan);
653099ff 3181 }
8aa23a47
YO
3182 next = regnext(scan);
3183 if (OP(scan) == CURLYX) {
3184 I32 lp = (data ? *(data->last_closep) : 0);
3185 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3186 }
8aa23a47
YO
3187 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3188 next_is_eval = (OP(scan) == EVAL);
3189 do_curly:
3190 if (flags & SCF_DO_SUBSTR) {
304ee84b 3191 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3192 pos_before = data->pos_min;
b45f050a 3193 }
8aa23a47
YO
3194 if (data) {
3195 fl = data->flags;
3196 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3197 if (is_inf)
3198 data->flags |= SF_IS_INF;
3199 }
3200 if (flags & SCF_DO_STCLASS) {
3201 cl_init(pRExC_state, &this_class);
3202 oclass = data->start_class;
3203 data->start_class = &this_class;
3204 f |= SCF_DO_STCLASS_AND;
3205 f &= ~SCF_DO_STCLASS_OR;
3206 }
3207 /* These are the cases when once a subexpression
3208 fails at a particular position, it cannot succeed
3209 even after backtracking at the enclosing scope.
3210
3211 XXXX what if minimal match and we are at the
3212 initial run of {n,m}? */
3213 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3214 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3215
8aa23a47
YO
3216 /* This will finish on WHILEM, setting scan, or on NULL: */
3217 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3218 last, data, stopparen, recursed, NULL,
3219 (mincount == 0
3220 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3221
8aa23a47
YO
3222 if (flags & SCF_DO_STCLASS)
3223 data->start_class = oclass;
3224 if (mincount == 0 || minnext == 0) {
3225 if (flags & SCF_DO_STCLASS_OR) {
3226 cl_or(pRExC_state, data->start_class, &this_class);
3227 }
3228 else if (flags & SCF_DO_STCLASS_AND) {
3229 /* Switch to OR mode: cache the old value of
3230 * data->start_class */
3231 INIT_AND_WITHP;
3232 StructCopy(data->start_class, and_withp,
3233 struct regnode_charclass_class);
3234 flags &= ~SCF_DO_STCLASS_AND;
3235 StructCopy(&this_class, data->start_class,
3236 struct regnode_charclass_class);
3237 flags |= SCF_DO_STCLASS_OR;
3238 data->start_class->flags |= ANYOF_EOS;
3239 }
3240 } else { /* Non-zero len */
3241 if (flags & SCF_DO_STCLASS_OR) {
3242 cl_or(pRExC_state, data->start_class, &this_class);
3243 cl_and(data->start_class, and_withp);
3244 }
3245 else if (flags & SCF_DO_STCLASS_AND)
3246 cl_and(data->start_class, &this_class);
3247 flags &= ~SCF_DO_STCLASS;
3248 }
3249 if (!scan) /* It was not CURLYX, but CURLY. */
3250 scan = next;
3251 if ( /* ? quantifier ok, except for (?{ ... }) */
3252 (next_is_eval || !(mincount == 0 && maxcount == 1))
3253 && (minnext == 0) && (deltanext == 0)
3254 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3255 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3256 {
668c081a
NC
3257 ckWARNreg(RExC_parse,
3258 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3259 }
3260
3261 min += minnext * mincount;
3262 is_inf_internal |= ((maxcount == REG_INFTY
3263 && (minnext + deltanext) > 0)
3264 || deltanext == I32_MAX);
3265 is_inf |= is_inf_internal;
3266 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3267
3268 /* Try powerful optimization CURLYX => CURLYN. */
3269 if ( OP(oscan) == CURLYX && data
3270 && data->flags & SF_IN_PAR
3271 && !(data->flags & SF_HAS_EVAL)
3272 && !deltanext && minnext == 1 ) {
3273 /* Try to optimize to CURLYN. */
3274 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3275 regnode * const nxt1 = nxt;
497b47a8 3276#ifdef DEBUGGING
8aa23a47 3277 regnode *nxt2;
497b47a8 3278#endif
c277df42 3279
8aa23a47
YO
3280 /* Skip open. */
3281 nxt = regnext(nxt);
e52fc539 3282 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3283 && !(PL_regkind[OP(nxt)] == EXACT
3284 && STR_LEN(nxt) == 1))
3285 goto nogo;
497b47a8 3286#ifdef DEBUGGING
8aa23a47 3287 nxt2 = nxt;
497b47a8 3288#endif
8aa23a47
YO
3289 nxt = regnext(nxt);
3290 if (OP(nxt) != CLOSE)
3291 goto nogo;
3292 if (RExC_open_parens) {
3293 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3294 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3295 }
3296 /* Now we know that nxt2 is the only contents: */
3297 oscan->flags = (U8)ARG(nxt);
3298 OP(oscan) = CURLYN;
3299 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3300
c277df42 3301#ifdef DEBUGGING
8aa23a47 3302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3303 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3304 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3305 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3306 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3307 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3308#endif
8aa23a47
YO
3309 }
3310 nogo:
3311
3312 /* Try optimization CURLYX => CURLYM. */
3313 if ( OP(oscan) == CURLYX && data
3314 && !(data->flags & SF_HAS_PAR)
3315 && !(data->flags & SF_HAS_EVAL)
3316 && !deltanext /* atom is fixed width */
3317 && minnext != 0 /* CURLYM can't handle zero width */
3318 ) {
3319 /* XXXX How to optimize if data == 0? */
3320 /* Optimize to a simpler form. */
3321 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3322 regnode *nxt2;
3323
3324 OP(oscan) = CURLYM;
3325 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3326 && (OP(nxt2) != WHILEM))
3327 nxt = nxt2;
3328 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3329 /* Need to optimize away parenths. */
b3c0965f 3330 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3331 /* Set the parenth number. */
3332 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3333
8aa23a47
YO
3334 oscan->flags = (U8)ARG(nxt);
3335 if (RExC_open_parens) {
3336 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3337 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3338 }
8aa23a47
YO
3339 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3340 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3341
c277df42 3342#ifdef DEBUGGING
8aa23a47
YO
3343 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3344 OP(nxt + 1) = OPTIMIZED; /* was count. */
3345 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3346 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3347#endif
c277df42 3348#if 0
8aa23a47
YO
3349 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3350 regnode *nnxt = regnext(nxt1);
3351
3352 if (nnxt == nxt) {
3353 if (reg_off_by_arg[OP(nxt1)])
3354 ARG_SET(nxt1, nxt2 - nxt1);
3355 else if (nxt2 - nxt1 < U16_MAX)
3356 NEXT_OFF(nxt1) = nxt2 - nxt1;
3357 else
3358 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3359 }
8aa23a47 3360 nxt1 = nnxt;
c277df42 3361 }
5d1c421c 3362#endif
8aa23a47
YO
3363 /* Optimize again: */
3364 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3365 NULL, stopparen, recursed, NULL, 0,depth+1);
3366 }
3367 else
3368 oscan->flags = 0;
3369 }
3370 else if ((OP(oscan) == CURLYX)
3371 && (flags & SCF_WHILEM_VISITED_POS)
3372 /* See the comment on a similar expression above.
3373 However, this time it not a subexpression
3374 we care about, but the expression itself. */
3375 && (maxcount == REG_INFTY)
3376 && data && ++data->whilem_c < 16) {
3377 /* This stays as CURLYX, we can put the count/of pair. */
3378 /* Find WHILEM (as in regexec.c) */
3379 regnode *nxt = oscan + NEXT_OFF(oscan);
3380
3381 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3382 nxt += ARG(nxt);
3383 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3384 | (RExC_whilem_seen << 4)); /* On WHILEM */
3385 }
3386 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3387 pars++;
3388 if (flags & SCF_DO_SUBSTR) {
3389 SV *last_str = NULL;
3390 int counted = mincount != 0;
a0ed51b3 3391
8aa23a47
YO
3392 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3393#if defined(SPARC64_GCC_WORKAROUND)
3394 I32 b = 0;
3395 STRLEN l = 0;
3396 const char *s = NULL;
3397 I32 old = 0;
b515a41d 3398
8aa23a47
YO
3399 if (pos_before >= data->last_start_min)
3400 b = pos_before;
3401 else
3402 b = data->last_start_min;
b515a41d 3403
8aa23a47
YO
3404 l = 0;
3405 s = SvPV_const(data->last_found, l);
3406 old = b - data->last_start_min;
3407
3408#else
3409 I32 b = pos_before >= data->last_start_min
3410 ? pos_before : data->last_start_min;
3411 STRLEN l;
3412 const char * const s = SvPV_const(data->last_found, l);
3413 I32 old = b - data->last_start_min;
3414#endif
3415
3416 if (UTF)
3417 old = utf8_hop((U8*)s, old) - (U8*)s;
3418
3419 l -= old;
3420 /* Get the added string: */
740cce10 3421 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3422 if (deltanext == 0 && pos_before == b) {
3423 /* What was added is a constant string */
3424 if (mincount > 1) {
3425 SvGROW(last_str, (mincount * l) + 1);
3426 repeatcpy(SvPVX(last_str) + l,
3427 SvPVX_const(last_str), l, mincount - 1);
3428 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3429 /* Add additional parts. */
3430 SvCUR_set(data->last_found,
3431 SvCUR(data->last_found) - l);
3432 sv_catsv(data->last_found, last_str);
3433 {
3434 SV * sv = data->last_found;
3435 MAGIC *mg =
3436 SvUTF8(sv) && SvMAGICAL(sv) ?
3437 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3438 if (mg && mg->mg_len >= 0)
bd94e887 3439 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3440 }
8aa23a47 3441 data->last_end += l * (mincount - 1);
b515a41d 3442 }
8aa23a47
YO
3443 } else {
3444 /* start offset must point into the last copy */
3445 data->last_start_min += minnext * (mincount - 1);
3446 data->last_start_max += is_inf ? I32_MAX
3447 : (maxcount - 1) * (minnext + data->pos_delta);
3448 }
c277df42 3449 }
8aa23a47
YO
3450 /* It is counted once already... */
3451 data->pos_min += minnext * (mincount - counted);
3452 data->pos_delta += - counted * deltanext +
3453 (minnext + deltanext) * maxcount - minnext * mincount;
3454 if (mincount != maxcount) {
3455 /* Cannot extend fixed substrings found inside
3456 the group. */
304ee84b 3457 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3458 if (mincount && last_str) {
3459 SV * const sv = data->last_found;
3460 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3461 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3462
3463 if (mg)
3464 mg->mg_len = -1;
3465 sv_setsv(sv, last_str);
3466 data->last_end = data->pos_min;
3467 data->last_start_min =
3468 data->pos_min - CHR_SVLEN(last_str);
3469 data->last_start_max = is_inf
3470 ? I32_MAX
3471 : data->pos_min + data->pos_delta
3472 - CHR_SVLEN(last_str);
3473 }
3474 data->longest = &(data->longest_float);
3475 }
3476 SvREFCNT_dec(last_str);
c277df42 3477 }
8aa23a47
YO
3478 if (data && (fl & SF_HAS_EVAL))
3479 data->flags |= SF_HAS_EVAL;
3480 optimize_curly_tail:
3481 if (OP(oscan) != CURLYX) {
3482 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3483 && NEXT_OFF(next))
3484 NEXT_OFF(oscan) += NEXT_OFF(next);
3485 }
3486 continue;
3487 default: /* REF and CLUMP only? */
3488 if (flags & SCF_DO_SUBSTR) {
304ee84b 3489 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3490 data->longest = &(data->longest_float);
3491 }
3492 is_inf = is_inf_internal = 1;
3493 if (flags & SCF_DO_STCLASS_OR)
3494 cl_anything(pRExC_state, data->start_class);
3495 flags &= ~SCF_DO_STCLASS;
3496 break;
c277df42 3497 }
8aa23a47 3498 }
e1d1eefb
YO
3499 else if (OP(scan) == LNBREAK) {
3500 if (flags & SCF_DO_STCLASS) {
3501 int value = 0;
3502 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3503 if (flags & SCF_DO_STCLASS_AND) {
3504 for (value = 0; value < 256; value++)
e64b1bd1 3505 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3506 ANYOF_BITMAP_CLEAR(data->start_class, value);
3507 }
3508 else {
3509 for (value = 0; value < 256; value++)
e64b1bd1 3510 if (is_VERTWS_cp(value))
e1d1eefb
YO
3511 ANYOF_BITMAP_SET(data->start_class, value);
3512 }
3513 if (flags & SCF_DO_STCLASS_OR)
3514 cl_and(data->start_class, and_withp);
3515 flags &= ~SCF_DO_STCLASS;
3516 }
3517 min += 1;
f9a79580 3518 delta += 1;
e1d1eefb
YO
3519 if (flags & SCF_DO_SUBSTR) {
3520 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3521 data->pos_min += 1;
f9a79580 3522 data->pos_delta += 1;
e1d1eefb
YO
3523 data->longest = &(data->longest_float);
3524 }
3525
3526 }
f9a79580
RGS
3527 else if (OP(scan) == FOLDCHAR) {
3528 int d = ARG(scan)==0xDF ? 1 : 2;
3529 flags &= ~SCF_DO_STCLASS;
3530 min += 1;
3531 delta += d;
3532 if (flags & SCF_DO_SUBSTR) {
3533 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3534 data->pos_min += 1;
3535 data->pos_delta += d;
3536 data->longest = &(data->longest_float);
3537 }
3538 }
e52fc539 3539 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3540 int value = 0;
653099ff 3541
8aa23a47 3542 if (flags & SCF_DO_SUBSTR) {
304ee84b 3543 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3544 data->pos_min++;
3545 }
3546 min++;
3547 if (flags & SCF_DO_STCLASS) {
3548 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3549
8aa23a47
YO
3550 /* Some of the logic below assumes that switching
3551 locale on will only add false positives. */
3552 switch (PL_regkind[OP(scan)]) {
3553 case SANY:
3554 default:
3555 do_default:
3556 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3557 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3558 cl_anything(pRExC_state, data->start_class);
3559 break;
3560 case REG_ANY:
3561 if (OP(scan) == SANY)
3562 goto do_default;
3563 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3564 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3565 || (data->start_class->flags & ANYOF_CLASS));
3566 cl_anything(pRExC_state, data->start_class);
653099ff 3567 }
8aa23a47
YO
3568 if (flags & SCF_DO_STCLASS_AND || !value)
3569 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3570 break;
3571 case ANYOF:
3572 if (flags & SCF_DO_STCLASS_AND)
3573 cl_and(data->start_class,
3574 (struct regnode_charclass_class*)scan);
653099ff 3575 else
8aa23a47
YO
3576 cl_or(pRExC_state, data->start_class,
3577 (struct regnode_charclass_class*)scan);
3578 break;
3579 case ALNUM:
3580 if (flags & SCF_DO_STCLASS_AND) {
3581 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3583 for (value = 0; value < 256; value++)
3584 if (!isALNUM(value))
3585 ANYOF_BITMAP_CLEAR(data->start_class, value);
3586 }
653099ff 3587 }
8aa23a47
YO
3588 else {
3589 if (data->start_class->flags & ANYOF_LOCALE)
3590 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3591 else {
3592 for (value = 0; value < 256; value++)
3593 if (isALNUM(value))
3594 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3595 }
8aa23a47
YO
3596 }
3597 break;
3598 case ALNUML:
3599 if (flags & SCF_DO_STCLASS_AND) {
3600 if (data->start_class->flags & ANYOF_LOCALE)
3601 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3602 }
3603 else {
3604 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3605 data->start_class->flags |= ANYOF_LOCALE;
3606 }
3607 break;
3608 case NALNUM:
3609 if (flags & SCF_DO_STCLASS_AND) {
3610 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3611 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3612 for (value = 0; value < 256; value++)
3613 if (isALNUM(value))
3614 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3615 }
3616 }
8aa23a47
YO
3617 else {
3618 if (data->start_class->flags & ANYOF_LOCALE)
3619 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3620 else {
3621 for (value = 0; value < 256; value++)
3622 if (!isALNUM(value))
3623 ANYOF_BITMAP_SET(data->start_class, value);
3624 }
653099ff 3625 }
8aa23a47
YO
3626 break;
3627 case NALNUML:
3628 if (flags & SCF_DO_STCLASS_AND) {
3629 if (data->start_class->flags & ANYOF_LOCALE)
3630 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3631 }
8aa23a47
YO
3632 else {
3633 data->start_class->flags |= ANYOF_LOCALE;
3634 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3635 }
3636 break;
3637 case SPACE:
3638 if (flags & SCF_DO_STCLASS_AND) {
3639 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3640 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3641 for (value = 0; value < 256; value++)
3642 if (!isSPACE(value))
3643 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3644 }
3645 }
8aa23a47
YO
3646 else {
3647 if (data->start_class->flags & ANYOF_LOCALE)
3648 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3649 else {
3650 for (value = 0; value < 256; value++)
3651 if (isSPACE(value))
3652 ANYOF_BITMAP_SET(data->start_class, value);
3653 }
653099ff 3654 }
8aa23a47
YO
3655 break;
3656 case SPACEL:
3657 if (flags & SCF_DO_STCLASS_AND) {
3658 if (data->start_class->flags & ANYOF_LOCALE)
3659 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3660 }
3661 else {
3662 data->start_class->flags |= ANYOF_LOCALE;
3663 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3664 }
3665 break;
3666 case NSPACE:
3667 if (flags & SCF_DO_STCLASS_AND) {
3668 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3669 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3670 for (value = 0; value < 256; value++)
3671 if (isSPACE(value))
3672 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3673 }
8aa23a47
YO
3674 }
3675 else {
3676 if (data->start_class->flags & ANYOF_LOCALE)
3677 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3678 else {
3679 for (value = 0; value < 256; value++)
3680 if (!isSPACE(value))
3681 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3682 }
3683 }
8aa23a47
YO
3684 break;
3685 case NSPACEL:
3686 if (flags & SCF_DO_STCLASS_AND) {
3687 if (data->start_class->flags & ANYOF_LOCALE) {
3688 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3689 for (value = 0; value < 256; value++)
3690 if (!isSPACE(value))
3691 ANYOF_BITMAP_CLEAR(data->start_class, value);
3692 }
653099ff 3693 }
8aa23a47
YO
3694 else {
3695 data->start_class->flags |= ANYOF_LOCALE;
3696 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3697 }
3698 break;
3699 case DIGIT:
3700 if (flags & SCF_DO_STCLASS_AND) {
3701 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3702 for (value = 0; value < 256; value++)
3703 if (!isDIGIT(value))
3704 ANYOF_BITMAP_CLEAR(data->start_class, value);
3705 }
3706 else {
3707 if (data->start_class->flags & ANYOF_LOCALE)
3708 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3709 else {
3710 for (value = 0; value < 256; value++)
3711 if (isDIGIT(value))
3712 ANYOF_BITMAP_SET(data->start_class, value);
3713 }
3714 }
3715 break;
3716 case NDIGIT:
3717 if (flags & SCF_DO_STCLASS_AND) {
3718 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3719 for (value = 0; value < 256; value++)
3720 if (isDIGIT(value))
3721 ANYOF_BITMAP_CLEAR(data->start_class, value);
3722 }
3723 else {
3724 if (data->start_class->flags & ANYOF_LOCALE)
3725 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3726 else {
3727 for (value = 0; value < 256; value++)
3728 if (!isDIGIT(value))
3729 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3730 }
3731 }
8aa23a47 3732 break;
e1d1eefb
YO
3733 CASE_SYNST_FNC(VERTWS);
3734 CASE_SYNST_FNC(HORIZWS);
3735
8aa23a47
YO
3736 }
3737 if (flags & SCF_DO_STCLASS_OR)
3738 cl_and(data->start_class, and_withp);
3739 flags &= ~SCF_DO_STCLASS;
3740 }
3741 }
3742 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3743 data->flags |= (OP(scan) == MEOL
3744 ? SF_BEFORE_MEOL
3745 : SF_BEFORE_SEOL);
3746 }
3747 else if ( PL_regkind[OP(scan)] == BRANCHJ
3748 /* Lookbehind, or need to calculate parens/evals/stclass: */
3749 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3750 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3751 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3752 || OP(scan) == UNLESSM )
3753 {
3754 /* Negative Lookahead/lookbehind
3755 In this case we can't do fixed string optimisation.
3756 */
1de06328 3757
8aa23a47
YO
3758 I32 deltanext, minnext, fake = 0;
3759 regnode *nscan;
3760 struct regnode_charclass_class intrnl;
3761 int f = 0;
1de06328 3762
8aa23a47
YO
3763 data_fake.flags = 0;
3764 if (data) {
3765 data_fake.whilem_c = data->whilem_c;
3766 data_fake.last_closep = data->last_closep;
c277df42 3767 }
8aa23a47
YO
3768 else
3769 data_fake.last_closep = &fake;
58e23c8d 3770 data_fake.pos_delta = delta;
8aa23a47
YO
3771 if ( flags & SCF_DO_STCLASS && !scan->flags
3772 && OP(scan) == IFMATCH ) { /* Lookahead */
3773 cl_init(pRExC_state, &intrnl);
3774 data_fake.start_class = &intrnl;
3775 f |= SCF_DO_STCLASS_AND;
3776 }
3777 if (flags & SCF_WHILEM_VISITED_POS)
3778 f |= SCF_WHILEM_VISITED_POS;
3779 next = regnext(scan);
3780 nscan = NEXTOPER(NEXTOPER(scan));
3781 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3782 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3783 if (scan->flags) {
3784 if (deltanext) {
58e23c8d 3785 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3786 }
3787 else if (minnext > (I32)U8_MAX) {
58e23c8d 3788 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3789 }
3790 scan->flags = (U8)minnext;
3791 }
3792 if (data) {
3793 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3794 pars++;
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3798 }
3799 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3800 if (flags & SCF_DO_STCLASS_OR) {
3801 /* OR before, AND after: ideally we would recurse with
3802 * data_fake to get the AND applied by study of the
3803 * remainder of the pattern, and then derecurse;
3804 * *** HACK *** for now just treat as "no information".
3805 * See [perl #56690].
3806 */
3807 cl_init(pRExC_state, data->start_class);
3808 } else {
3809 /* AND before and after: combine and continue */
3810 const int was = (data->start_class->flags & ANYOF_EOS);
3811
3812 cl_and(data->start_class, &intrnl);
3813 if (was)
3814 data->start_class->flags |= ANYOF_EOS;
3815 }
8aa23a47 3816 }
cb434fcc 3817 }
8aa23a47
YO
3818#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3819 else {
3820 /* Positive Lookahead/lookbehind
3821 In this case we can do fixed string optimisation,
3822 but we must be careful about it. Note in the case of
3823 lookbehind the positions will be offset by the minimum
3824 length of the pattern, something we won't know about
3825 until after the recurse.
3826 */
3827 I32 deltanext, fake = 0;
3828 regnode *nscan;
3829 struct regnode_charclass_class intrnl;
3830 int f = 0;
3831 /* We use SAVEFREEPV so that when the full compile
3832 is finished perl will clean up the allocated
3833 minlens when its all done. This was we don't
3834 have to worry about freeing them when we know
3835 they wont be used, which would be a pain.
3836 */
3837 I32 *minnextp;
3838 Newx( minnextp, 1, I32 );
3839 SAVEFREEPV(minnextp);
3840
3841 if (data) {
3842 StructCopy(data, &data_fake, scan_data_t);
3843 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3844 f |= SCF_DO_SUBSTR;
3845 if (scan->flags)
304ee84b 3846 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3847 data_fake.last_found=newSVsv(data->last_found);
3848 }
3849 }
3850 else
3851 data_fake.last_closep = &fake;
3852 data_fake.flags = 0;
58e23c8d 3853 data_fake.pos_delta = delta;
8aa23a47
YO
3854 if (is_inf)
3855 data_fake.flags |= SF_IS_INF;
3856 if ( flags & SCF_DO_STCLASS && !scan->flags
3857 && OP(scan) == IFMATCH ) { /* Lookahead */
3858 cl_init(pRExC_state, &intrnl);
3859 data_fake.start_class = &intrnl;
3860 f |= SCF_DO_STCLASS_AND;
3861 }
3862 if (flags & SCF_WHILEM_VISITED_POS)
3863 f |= SCF_WHILEM_VISITED_POS;
3864 next = regnext(scan);
3865 nscan = NEXTOPER(NEXTOPER(scan));
3866
3867 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3868 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3869 if (scan->flags) {
3870 if (deltanext) {
58e23c8d 3871 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3872 }
3873 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3874 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3875 }
3876 scan->flags = (U8)*minnextp;
3877 }
3878
3879 *minnextp += min;
3880
3881 if (f & SCF_DO_STCLASS_AND) {
3882 const int was = (data->start_class->flags & ANYOF_EOS);
3883
3884 cl_and(data->start_class, &intrnl);
3885 if (was)
3886 data->start_class->flags |= ANYOF_EOS;
3887 }
3888 if (data) {
3889 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3890 pars++;
3891 if (data_fake.flags & SF_HAS_EVAL)
3892 data->flags |= SF_HAS_EVAL;
3893 data->whilem_c = data_fake.whilem_c;
3894 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3895 if (RExC_rx->minlen<*minnextp)
3896 RExC_rx->minlen=*minnextp;
304ee84b 3897 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3898 SvREFCNT_dec(data_fake.last_found);
3899
3900 if ( data_fake.minlen_fixed != minlenp )
3901 {
3902 data->offset_fixed= data_fake.offset_fixed;
3903 data->minlen_fixed= data_fake.minlen_fixed;
3904 data->lookbehind_fixed+= scan->flags;
3905 }
3906 if ( data_fake.minlen_float != minlenp )
3907 {
3908 data->minlen_float= data_fake.minlen_float;
3909 data->offset_float_min=data_fake.offset_float_min;
3910 data->offset_float_max=data_fake.offset_float_max;
3911 data->lookbehind_float+= scan->flags;
3912 }
3913 }
3914 }
3915
3916
40d049e4 3917 }
8aa23a47
YO
3918#endif
3919 }
3920 else if (OP(scan) == OPEN) {
3921 if (stopparen != (I32)ARG(scan))
3922 pars++;
3923 }
3924 else if (OP(scan) == CLOSE) {
3925 if (stopparen == (I32)ARG(scan)) {
3926 break;
3927 }
3928 if ((I32)ARG(scan) == is_par) {
3929 next = regnext(scan);
b515a41d 3930
8aa23a47
YO
3931 if ( next && (OP(next) != WHILEM) && next < last)
3932 is_par = 0; /* Disable optimization */
40d049e4 3933 }
8aa23a47
YO
3934 if (data)
3935 *(data->last_closep) = ARG(scan);
3936 }
3937 else if (OP(scan) == EVAL) {
c277df42
IZ
3938 if (data)
3939 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3940 }
3941 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3942 if (flags & SCF_DO_SUBSTR) {
304ee84b 3943 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3944 flags &= ~SCF_DO_SUBSTR;
40d049e4 3945 }
8aa23a47
YO
3946 if (data && OP(scan)==ACCEPT) {
3947 data->flags |= SCF_SEEN_ACCEPT;
3948 if (stopmin > min)
3949 stopmin = min;
e2e6a0f1 3950 }
8aa23a47
YO
3951 }
3952 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3953 {
0f5d15d6 3954 if (flags & SCF_DO_SUBSTR) {
304ee84b 3955 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3956 data->longest = &(data->longest_float);
3957 }
3958 is_inf = is_inf_internal = 1;
653099ff 3959 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3960 cl_anything(pRExC_state, data->start_class);
96776eda 3961 flags &= ~SCF_DO_STCLASS;
8aa23a47 3962 }
58e23c8d 3963 else if (OP(scan) == GPOS) {
bbe252da 3964 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3965 !(delta || is_inf || (data && data->pos_delta)))
3966 {
bbe252da
YO
3967 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3968 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3969 if (RExC_rx->gofs < (U32)min)
3970 RExC_rx->gofs = min;
3971 } else {
bbe252da 3972 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3973 RExC_rx->gofs = 0;
3974 }
3975 }
786e8c11 3976#ifdef TRIE_STUDY_OPT
40d049e4 3977#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3978 else if (PL_regkind[OP(scan)] == TRIE) {
3979 /* NOTE - There is similar code to this block above for handling
3980 BRANCH nodes on the initial study. If you change stuff here
3981 check there too. */
3982 regnode *trie_node= scan;
3983 regnode *tail= regnext(scan);
f8fc2ecf 3984 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3985 I32 max1 = 0, min1 = I32_MAX;
3986 struct regnode_charclass_class accum;
3987
3988 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3989 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3990 if (flags & SCF_DO_STCLASS)
3991 cl_init_zero(pRExC_state, &accum);
3992
3993 if (!trie->jump) {
3994 min1= trie->minlen;
3995 max1= trie->maxlen;
3996 } else {
3997 const regnode *nextbranch= NULL;
3998 U32 word;
3999
4000 for ( word=1 ; word <= trie->wordcount ; word++)
4001 {
4002 I32 deltanext=0, minnext=0, f = 0, fake;
4003 struct regnode_charclass_class this_class;
4004
4005 data_fake.flags = 0;
4006 if (data) {
4007 data_fake.whilem_c = data->whilem_c;
4008 data_fake.last_closep = data->last_closep;
4009 }
4010 else
4011 data_fake.last_closep = &fake;
58e23c8d 4012 data_fake.pos_delta = delta;
8aa23a47
YO
4013 if (flags & SCF_DO_STCLASS) {
4014 cl_init(pRExC_state, &this_class);
4015 data_fake.start_class = &this_class;
4016 f = SCF_DO_STCLASS_AND;
4017 }
4018 if (flags & SCF_WHILEM_VISITED_POS)
4019 f |= SCF_WHILEM_VISITED_POS;
4020
4021 if (trie->jump[word]) {
4022 if (!nextbranch)
4023 nextbranch = trie_node + trie->jump[0];
4024 scan= trie_node + trie->jump[word];
4025 /* We go from the jump point to the branch that follows
4026 it. Note this means we need the vestigal unused branches
4027 even though they arent otherwise used.
4028 */
4029 minnext = study_chunk(pRExC_state, &scan, minlenp,
4030 &deltanext, (regnode *)nextbranch, &data_fake,
4031 stopparen, recursed, NULL, f,depth+1);
4032 }
4033 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4034 nextbranch= regnext((regnode*)nextbranch);
4035
4036 if (min1 > (I32)(minnext + trie->minlen))
4037 min1 = minnext + trie->minlen;
4038 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4039 max1 = minnext + deltanext + trie->maxlen;
4040 if (deltanext == I32_MAX)
4041 is_inf = is_inf_internal = 1;
4042
4043 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4044 pars++;
4045 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4046 if ( stopmin > min + min1)
4047 stopmin = min + min1;
4048 flags &= ~SCF_DO_SUBSTR;
4049 if (data)
4050 data->flags |= SCF_SEEN_ACCEPT;
4051 }
4052 if (data) {
4053 if (data_fake.flags & SF_HAS_EVAL)
4054 data->flags |= SF_HAS_EVAL;
4055 data->whilem_c = data_fake.whilem_c;
4056 }
4057 if (flags & SCF_DO_STCLASS)
4058 cl_or(pRExC_state, &accum, &this_class);
4059 }
4060 }
4061 if (flags & SCF_DO_SUBSTR) {
4062 data->pos_min += min1;
4063 data->pos_delta += max1 - min1;
4064 if (max1 != min1 || is_inf)
4065 data->longest = &(data->longest_float);
4066 }
4067 min += min1;
4068 delta += max1 - min1;
4069 if (flags & SCF_DO_STCLASS_OR) {
4070 cl_or(pRExC_state, data->start_class, &accum);
4071 if (min1) {
4072 cl_and(data->start_class, and_withp);
4073 flags &= ~SCF_DO_STCLASS;
4074 }
4075 }
4076 else if (flags & SCF_DO_STCLASS_AND) {
4077 if (min1) {
4078 cl_and(data->start_class, &accum);
4079 flags &= ~SCF_DO_STCLASS;
4080 }
4081 else {
4082 /* Switch to OR mode: cache the old value of
4083 * data->start_class */
4084 INIT_AND_WITHP;
4085 StructCopy(data->start_class, and_withp,
4086 struct regnode_charclass_class);
4087 flags &= ~SCF_DO_STCLASS_AND;
4088 StructCopy(&accum, data->start_class,
4089 struct regnode_charclass_class);
4090 flags |= SCF_DO_STCLASS_OR;
4091 data->start_class->flags |= ANYOF_EOS;
4092 }
4093 }
4094 scan= tail;
4095 continue;
4096 }
786e8c11 4097#else
8aa23a47 4098 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4099 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4100 U8*bang=NULL;
4101
4102 min += trie->minlen;
4103 delta += (trie->maxlen - trie->minlen);
4104 flags &= ~SCF_DO_STCLASS; /* xxx */
4105 if (flags & SCF_DO_SUBSTR) {
304ee84b 4106 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4107 data->pos_min += trie->minlen;
4108 data->pos_delta += (trie->maxlen - trie->minlen);
4109 if (trie->maxlen != trie->minlen)
4110 data->longest = &(data->longest_float);
4111 }
4112 if (trie->jump) /* no more substrings -- for now /grr*/
4113 flags &= ~SCF_DO_SUBSTR;
b515a41d 4114 }
8aa23a47
YO
4115#endif /* old or new */
4116#endif /* TRIE_STUDY_OPT */
e1d1eefb 4117
8aa23a47
YO
4118 /* Else: zero-length, ignore. */
4119 scan = regnext(scan);
4120 }
4121 if (frame) {
4122 last = frame->last;
4123 scan = frame->next;
4124 stopparen = frame->stop;
4125 frame = frame->prev;
4126 goto fake_study_recurse;
c277df42
IZ
4127 }
4128
4129 finish:
8aa23a47 4130 assert(!frame);
304ee84b 4131 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4132
c277df42 4133 *scanp = scan;
aca2d497 4134 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4135 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4136 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4137 if (is_par > (I32)U8_MAX)
c277df42
IZ
4138 is_par = 0;
4139 if (is_par && pars==1 && data) {
4140 data->flags |= SF_IN_PAR;
4141 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4142 }
4143 else if (pars && data) {
c277df42
IZ
4144 data->flags |= SF_HAS_PAR;
4145 data->flags &= ~SF_IN_PAR;
4146 }
653099ff 4147 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4148 cl_and(data->start_class, and_withp);
786e8c11
YO
4149 if (flags & SCF_TRIE_RESTUDY)
4150 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4151
304ee84b 4152 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4153
e2e6a0f1 4154 return min < stopmin ? min : stopmin;
c277df42
IZ
4155}
4156
2eccd3b2
NC
4157STATIC U32
4158S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4159{
4a4e7719
NC
4160 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4161
7918f24d
NC
4162 PERL_ARGS_ASSERT_ADD_DATA;
4163
4a4e7719
NC
4164 Renewc(RExC_rxi->data,
4165 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4166 char, struct reg_data);
4167 if(count)
f8fc2ecf 4168 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4169 else
f8fc2ecf 4170 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4171 RExC_rxi->data->count = count + n;
4172 Copy(s, RExC_rxi->data->what + count, n, U8);
4173 return count;
c277df42
IZ
4174}
4175
f8149455 4176/*XXX: todo make this not included in a non debugging perl */
76234dfb 4177#ifndef PERL_IN_XSUB_RE
d88dccdf 4178void
864dbfa3 4179Perl_reginitcolors(pTHX)
d88dccdf 4180{
97aff369 4181 dVAR;
1df70142 4182 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4183 if (s) {
1df70142
AL
4184 char *t = savepv(s);
4185 int i = 0;
4186 PL_colors[0] = t;
d88dccdf 4187 while (++i < 6) {
1df70142
AL
4188 t = strchr(t, '\t');
4189 if (t) {
4190 *t = '\0';
4191 PL_colors[i] = ++t;
d88dccdf
IZ
4192 }
4193 else
1df70142 4194 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4195 }
4196 } else {
1df70142 4197 int i = 0;
b81d288d 4198 while (i < 6)
06b5626a 4199 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4200 }
4201 PL_colorset = 1;
4202}
76234dfb 4203#endif
8615cb43 4204
07be1b83 4205
786e8c11
YO
4206#ifdef TRIE_STUDY_OPT
4207#define CHECK_RESTUDY_GOTO \
4208 if ( \
4209 (data.flags & SCF_TRIE_RESTUDY) \
4210 && ! restudied++ \
4211 ) goto reStudy
4212#else
4213#define CHECK_RESTUDY_GOTO
4214#endif
f9f4320a 4215
a687059c 4216/*
e50aee73 4217 - pregcomp - compile a regular expression into internal code
a687059c
LW
4218 *
4219 * We can't allocate space until we know how big the compiled form will be,
4220 * but we can't compile it (and thus know how big it is) until we've got a
4221 * place to put the code. So we cheat: we compile it twice, once with code
4222 * generation turned off and size counting turned on, and once "for real".
4223 * This also means that we don't allocate space until we are sure that the
4224 * thing really will compile successfully, and we never have to move the
4225 * code and thus invalidate pointers into it. (Note that it has to be in
4226 * one piece because free() must be able to free it all.) [NB: not true in perl]
4227 *
4228 * Beware that the optimization-preparation code in here knows about some
4229 * of the structure of the compiled regexp. [I'll say.]
4230 */
b9b4dddf
YO
4231
4232
4233
f9f4320a 4234#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4235#define RE_ENGINE_PTR &PL_core_reg_engine
4236#else
f9f4320a
YO
4237extern const struct regexp_engine my_reg_engine;
4238#define RE_ENGINE_PTR &my_reg_engine
4239#endif
6d5c990f
RGS
4240
4241#ifndef PERL_IN_XSUB_RE
3ab4a224 4242REGEXP *
1593ad57 4243Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4244{
97aff369 4245 dVAR;
6d5c990f 4246 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4247
4248 PERL_ARGS_ASSERT_PREGCOMP;
4249
f9f4320a
YO
4250 /* Dispatch a request to compile a regexp to correct
4251 regexp engine. */
f9f4320a
YO
4252 if (table) {
4253 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4254 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4255 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4256 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4257 DEBUG_COMPILE_r({
8d8756e7 4258 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4259 SvIV(*ptr));
4260 });
3ab4a224 4261 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4262 }
b9b4dddf 4263 }
3ab4a224 4264 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4265}
6d5c990f 4266#endif
2a5d9b1d 4267
3ab4a224 4268REGEXP *
1593ad57 4269Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4270{
4271 dVAR;
288b8c02
NC
4272 REGEXP *rx;
4273 struct regexp *r;
f8fc2ecf 4274 register regexp_internal *ri;
3ab4a224 4275 STRLEN plen;
5d51ce98
KW
4276 char *exp;
4277 char* xend;
c277df42 4278 regnode *scan;
a0d0e21e 4279 I32 flags;
a0d0e21e
LW
4280 I32 minlen = 0;
4281 I32 sawplus = 0;
4282 I32 sawopen = 0;
bbd61b5f
KW
4283 U8 jump_ret = 0;
4284 dJMPENV;
2c2d71f5 4285 scan_data_t data;
830247a4 4286 RExC_state_t RExC_state;
be8e71aa 4287 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4288#ifdef TRIE_STUDY_OPT
5d51ce98 4289 int restudied;
07be1b83
YO
4290 RExC_state_t copyRExC_state;
4291#endif
2a5d9b1d 4292 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4293
4294 PERL_ARGS_ASSERT_RE_COMPILE;
4295
6d5c990f 4296 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4297
b9ad30b4 4298 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4299
02daf0ab 4300
bbd61b5f
KW
4301 /* Longjmp back to here if have to switch in midstream to utf8 */
4302 if (! RExC_orig_utf8) {
4303 JMPENV_PUSH(jump_ret);
4304 }
4305
5d51ce98
KW
4306 if (jump_ret == 0) { /* First time through */
4307 exp = SvPV(pattern, plen);
4308 xend = exp + plen;
4309
4310 DEBUG_COMPILE_r({
4311 SV *dsv= sv_newmortal();
4312 RE_PV_QUOTED_DECL(s, RExC_utf8,
4313 dsv, exp, plen, 60);
4314 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4315 PL_colors[4],PL_colors[5],s);
4316 });
4317 }
4318 else { /* longjumped back */
bbd61b5f
KW
4319 STRLEN len = plen;
4320
5d51ce98
KW
4321 /* If the cause for the longjmp was other than changing to utf8, pop
4322 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4323 if (jump_ret != UTF8_LONGJMP) {
4324 JMPENV_POP;
4325 JMPENV_JUMP(jump_ret);
4326 }
4327
595598ee
KW
4328 GET_RE_DEBUG_FLAGS;
4329
bbd61b5f
KW
4330 /* It's possible to write a regexp in ascii that represents Unicode
4331 codepoints outside of the byte range, such as via \x{100}. If we
4332 detect such a sequence we have to convert the entire pattern to utf8
4333 and then recompile, as our sizing calculation will have been based
4334 on 1 byte == 1 character, but we will need to use utf8 to encode
4335 at least some part of the pattern, and therefore must convert the whole
4336 thing.
4337 -- dmq */
4338 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4339 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4340 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4341 xend = exp + len;
4342 RExC_orig_utf8 = RExC_utf8 = 1;
4343 SAVEFREEPV(exp);
4344 }
4345
5d51ce98
KW
4346#ifdef TRIE_STUDY_OPT
4347 restudied = 0;
4348#endif
4349
02daf0ab 4350 RExC_precomp = exp;
c737faaf 4351 RExC_flags = pm_flags;
830247a4 4352 RExC_sawback = 0;
bbce6d69 4353
830247a4
IZ
4354 RExC_seen = 0;
4355 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4356 RExC_seen_evals = 0;
4357 RExC_extralen = 0;
c277df42 4358
bbce6d69 4359 /* First pass: determine size, legality. */
830247a4 4360 RExC_parse = exp;
fac92740 4361 RExC_start = exp;
830247a4
IZ
4362 RExC_end = xend;
4363 RExC_naughty = 0;
4364 RExC_npar = 1;
e2e6a0f1 4365 RExC_nestroot = 0;
830247a4
IZ
4366 RExC_size = 0L;
4367 RExC_emit = &PL_regdummy;
4368 RExC_whilem_seen = 0;
40d049e4
YO
4369 RExC_open_parens = NULL;
4370 RExC_close_parens = NULL;
4371 RExC_opend = NULL;
81714fb9 4372 RExC_paren_names = NULL;
1f1031fe
YO
4373#ifdef DEBUGGING
4374 RExC_paren_name_list = NULL;
4375#endif
40d049e4
YO
4376 RExC_recurse = NULL;
4377 RExC_recurse_count = 0;
81714fb9 4378
85ddcde9
JH
4379#if 0 /* REGC() is (currently) a NOP at the first pass.
4380 * Clever compilers notice this and complain. --jhi */
830247a4 4381 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4382#endif
3dab1dad
YO
4383 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4384 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4385 RExC_precomp = NULL;
a0d0e21e
LW
4386 return(NULL);
4387 }
bbd61b5f
KW
4388
4389 /* Here, finished first pass. Get rid of our setjmp, which we added for
4390 * efficiency only if the passed-in string wasn't in utf8, as shown by
4391 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4392 * 1 here even though the original string wasn't utf8, but in this case
4393 * there will have been a long jump */
4394 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4395 JMPENV_POP;
02daf0ab 4396 }
07be1b83 4397 DEBUG_PARSE_r({
81714fb9
YO
4398 PerlIO_printf(Perl_debug_log,
4399 "Required size %"IVdf" nodes\n"
4400 "Starting second pass (creation)\n",
4401 (IV)RExC_size);
07be1b83
YO
4402 RExC_lastnum=0;
4403 RExC_lastparse=NULL;
4404 });
c277df42
IZ
4405 /* Small enough for pointer-storage convention?
4406 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4407 if (RExC_size >= 0x10000L && RExC_extralen)
4408 RExC_size += RExC_extralen;
c277df42 4409 else
830247a4
IZ
4410 RExC_extralen = 0;
4411 if (RExC_whilem_seen > 15)
4412 RExC_whilem_seen = 15;
a0d0e21e 4413
f9f4320a
YO
4414 /* Allocate space and zero-initialize. Note, the two step process
4415 of zeroing when in debug mode, thus anything assigned has to
4416 happen after that */
d2f13c59 4417 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4418 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4419 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4420 char, regexp_internal);
4421 if ( r == NULL || ri == NULL )
b45f050a 4422 FAIL("Regexp out of space");
0f79a09d
GS
4423#ifdef DEBUGGING
4424 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4425 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4426#else
f8fc2ecf
YO
4427 /* bulk initialize base fields with 0. */
4428 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4429#endif
58e23c8d
YO
4430
4431 /* non-zero initialization begins here */
f8fc2ecf 4432 RXi_SET( r, ri );
f9f4320a 4433 r->engine= RE_ENGINE_PTR;
c737faaf 4434 r->extflags = pm_flags;
bcdf7404 4435 {
f7819f85 4436 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4437 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4438
4439 /* The caret is output if there are any defaults: if not all the STD
4440 * flags are set, or if no character set specifier is needed */
4441 bool has_default =
4442 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4443 || ! has_charset);
bcdf7404 4444 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4445 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4446 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4447 const char *fptr = STD_PAT_MODS; /*"msix"*/
4448 char *p;
fb85c044 4449 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4450 * on. If more precision is desired, we could do a population count of
4451 * the flags set. This could be done with a small lookup table, or by
4452 * shifting, masking and adding, or even, when available, assembly
4453 * language for a machine-language population count.
4454 * We never output a minus, as all those are defaults, so are
4455 * covered by the caret */
fb85c044 4456 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4457 + has_default /* If needs a caret */
4458 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4459 + (sizeof(STD_PAT_MODS) - 1)
4460 + (sizeof("(?:)") - 1);
4461
c5ea2ffa 4462 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4463 SvPOK_on(rx);
8f6ae13c 4464 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4465 *p++='('; *p++='?';
9de15fec
KW
4466
4467 /* If a default, cover it using the caret */
c5ea2ffa 4468 if (has_default) {
85508812 4469 *p++= DEFAULT_PAT_MOD;
fb85c044 4470 }
c5ea2ffa
KW
4471 if (has_charset) {
4472 if (r->extflags & RXf_PMf_LOCALE) {
4473 *p++ = LOCALE_PAT_MOD;
4474 } else {
4475 *p++ = UNICODE_PAT_MOD;
4476 }
9de15fec 4477 }
f7819f85
A
4478 if (has_p)
4479 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4480 {
bcdf7404 4481 char ch;
bcdf7404
YO
4482 while((ch = *fptr++)) {
4483 if(reganch & 1)
4484 *p++ = ch;
bcdf7404
YO
4485 reganch >>= 1;
4486 }
bcdf7404
YO
4487 }
4488
28d8d7f4 4489 *p++ = ':';
bb661a58 4490 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4491 assert ((RX_WRAPPED(rx) - p) < 16);
4492 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4493 p += plen;
bcdf7404 4494 if (has_runon)
28d8d7f4
YO
4495 *p++ = '\n';
4496 *p++ = ')';
4497 *p = 0;
fb85c044 4498 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4499 }
4500
bbe252da 4501 r->intflags = 0;
830247a4 4502 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4503
6bda09f9 4504 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4505 Newxz(RExC_open_parens, RExC_npar,regnode *);
4506 SAVEFREEPV(RExC_open_parens);
4507 Newxz(RExC_close_parens,RExC_npar,regnode *);
4508 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4509 }
4510
4511 /* Useful during FAIL. */
7122b237
YO
4512#ifdef RE_TRACK_PATTERN_OFFSETS
4513 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4514 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4515 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4516 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4517 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4518#endif
4519 SetProgLen(ri,RExC_size);
288b8c02 4520 RExC_rx_sv = rx;
830247a4 4521 RExC_rx = r;
f8fc2ecf 4522 RExC_rxi = ri;
bbce6d69 4523
4524 /* Second pass: emit code. */
c737faaf 4525 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4526 RExC_parse = exp;
4527 RExC_end = xend;
4528 RExC_naughty = 0;
4529 RExC_npar = 1;
f8fc2ecf
YO
4530 RExC_emit_start = ri->program;
4531 RExC_emit = ri->program;
3b57cd43
YO
4532 RExC_emit_bound = ri->program + RExC_size + 1;
4533
2cd61cdb 4534 /* Store the count of eval-groups for security checks: */
f8149455 4535 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4536 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4537 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4538 ReREFCNT_dec(rx);
a0d0e21e 4539 return(NULL);
80757612 4540 }
07be1b83
YO
4541 /* XXXX To minimize changes to RE engine we always allocate
4542 3-units-long substrs field. */
4543 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4544 if (RExC_recurse_count) {
4545 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4546 SAVEFREEPV(RExC_recurse);
4547 }
a0d0e21e 4548
07be1b83 4549reStudy:
1de06328 4550 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4551 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4552
07be1b83 4553#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4554 if (!restudied) {
4555 StructCopy(&zero_scan_data, &data, scan_data_t);
4556 copyRExC_state = RExC_state;
4557 } else {
5d458dd8 4558 U32 seen=RExC_seen;
07be1b83 4559 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4560
4561 RExC_state = copyRExC_state;
4562 if (seen & REG_TOP_LEVEL_BRANCHES)
4563 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4564 else
4565 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4566 if (data.last_found) {
07be1b83 4567 SvREFCNT_dec(data.longest_fixed);
07be1b83 4568 SvREFCNT_dec(data.longest_float);
07be1b83 4569 SvREFCNT_dec(data.last_found);
1de06328 4570 }
40d049e4 4571 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4572 }
40d049e4
YO
4573#else
4574 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4575#endif
fc8cd66c 4576
a0d0e21e 4577 /* Dig out information for optimizations. */
f7819f85 4578 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4579 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4580
a0ed51b3 4581 if (UTF)
8f6ae13c 4582 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4583 ri->regstclass = NULL;
830247a4 4584 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4585 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4586 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4587
1de06328
YO
4588 /* testing for BRANCH here tells us whether there is "must appear"
4589 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4590 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4591 I32 fake;
c5254dd6 4592 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4593 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4594 int stclass_flag;
07be1b83 4595 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4596 regnode *first= scan;
4597 regnode *first_next= regnext(first);
4598
639081d6
YO
4599 /*
4600 * Skip introductions and multiplicators >= 1
4601 * so that we can extract the 'meat' of the pattern that must
4602 * match in the large if() sequence following.
4603 * NOTE that EXACT is NOT covered here, as it is normally
4604 * picked up by the optimiser separately.
4605 *
4606 * This is unfortunate as the optimiser isnt handling lookahead
4607 * properly currently.
4608 *
4609 */
a0d0e21e 4610 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4611 /* An OR of *one* alternative - should not happen now. */
5339e136 4612 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4613 /* for now we can't handle lookbehind IFMATCH*/
4614 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4615 (OP(first) == PLUS) ||
4616 (OP(first) == MINMOD) ||
653099ff 4617 /* An {n,m} with n>0 */
5339e136
YO
4618 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4619 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4620 {
639081d6
YO
4621 /*
4622 * the only op that could be a regnode is PLUS, all the rest
4623 * will be regnode_1 or regnode_2.
4624 *
4625 */
a0d0e21e
LW
4626 if (OP(first) == PLUS)
4627 sawplus = 1;
4628 else
3dab1dad 4629 first += regarglen[OP(first)];
639081d6
YO
4630
4631 first = NEXTOPER(first);
5339e136 4632 first_next= regnext(first);
a687059c
LW
4633 }
4634
a0d0e21e
LW
4635 /* Starting-point info. */
4636 again:
786e8c11 4637 DEBUG_PEEP("first:",first,0);
07be1b83 4638 /* Ignore EXACT as we deal with it later. */
3dab1dad 4639 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4640 if (OP(first) == EXACT)
6f207bd3 4641 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4642 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4643 ri->regstclass = first;
b3c9acc1 4644 }
07be1b83 4645#ifdef TRIE_STCLASS
786e8c11 4646 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4647 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4648 {
786e8c11 4649 regnode *trie_op;
07be1b83 4650 /* this can happen only on restudy */
786e8c11 4651 if ( OP(first) == TRIE ) {
c944940b 4652 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4653 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4654 StructCopy(first,trieop,struct regnode_1);
4655 trie_op=(regnode *)trieop;
4656 } else {
c944940b 4657 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4658 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4659 StructCopy(first,trieop,struct regnode_charclass);
4660 trie_op=(regnode *)trieop;
4661 }
1de06328 4662 OP(trie_op)+=2;
786e8c11 4663 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4664 ri->regstclass = trie_op;
07be1b83
YO
4665 }
4666#endif
e52fc539 4667 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4668 ri->regstclass = first;
3dab1dad
YO
4669 else if (PL_regkind[OP(first)] == BOUND ||
4670 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4671 ri->regstclass = first;
3dab1dad 4672 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4673 r->extflags |= (OP(first) == MBOL
4674 ? RXf_ANCH_MBOL
cad2e5aa 4675 : (OP(first) == SBOL
bbe252da
YO
4676 ? RXf_ANCH_SBOL
4677 : RXf_ANCH_BOL));
a0d0e21e 4678 first = NEXTOPER(first);
774d564b 4679 goto again;
4680 }
4681 else if (OP(first) == GPOS) {
bbe252da 4682 r->extflags |= RXf_ANCH_GPOS;
774d564b 4683 first = NEXTOPER(first);
4684 goto again;
a0d0e21e 4685 }
cf2a2b69
YO
4686 else if ((!sawopen || !RExC_sawback) &&
4687 (OP(first) == STAR &&
3dab1dad 4688 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4689 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4690 {
4691 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4692 const int type =
4693 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4694 ? RXf_ANCH_MBOL
4695 : RXf_ANCH_SBOL;
4696 r->extflags |= type;
4697 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4698 first = NEXTOPER(first);
774d564b 4699 goto again;
a0d0e21e 4700 }
b81d288d 4701 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4702 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4703 /* x+ must match at the 1st pos of run of x's */
bbe252da 4704 r->intflags |= PREGf_SKIP;
a0d0e21e 4705
c277df42 4706 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4707#ifdef TRIE_STUDY_OPT
81714fb9 4708 DEBUG_PARSE_r(
be8e71aa
YO
4709 if (!restudied)
4710 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4711 (IV)(first - scan + 1))
4712 );
4713#else
81714fb9 4714 DEBUG_PARSE_r(
be8e71aa
YO
4715 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4716 (IV)(first - scan + 1))
4717 );
4718#endif
4719
4720
a0d0e21e
LW
4721 /*
4722 * If there's something expensive in the r.e., find the
4723 * longest literal string that must appear and make it the
4724 * regmust. Resolve ties in favor of later strings, since
4725 * the regstart check works with the beginning of the r.e.
4726 * and avoiding duplication strengthens checking. Not a
4727 * strong reason, but sufficient in the absence of others.
4728 * [Now we resolve ties in favor of the earlier string if
c277df42 4729 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4730 * earlier string may buy us something the later one won't.]
4731 */
de8c5301 4732
396482e1
GA
4733 data.longest_fixed = newSVpvs("");
4734 data.longest_float = newSVpvs("");
4735 data.last_found = newSVpvs("");
c277df42
IZ
4736 data.longest = &(data.longest_fixed);
4737 first = scan;
f8fc2ecf 4738 if (!ri->regstclass) {
830247a4 4739 cl_init(pRExC_state, &ch_class);
653099ff
GS
4740 data.start_class = &ch_class;
4741 stclass_flag = SCF_DO_STCLASS_AND;
4742 } else /* XXXX Check for BOUND? */
4743 stclass_flag = 0;
cb434fcc 4744 data.last_closep = &last_close;
de8c5301 4745
1de06328 4746 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4747 &data, -1, NULL, NULL,
4748 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4749
07be1b83 4750
786e8c11
YO
4751 CHECK_RESTUDY_GOTO;
4752
4753
830247a4 4754 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4755 && data.last_start_min == 0 && data.last_end > 0
830247a4 4756 && !RExC_seen_zerolen
2bf803e2 4757 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4758 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4759 r->extflags |= RXf_CHECK_ALL;
304ee84b 4760 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4761 SvREFCNT_dec(data.last_found);
4762
1de06328
YO
4763 /* Note that code very similar to this but for anchored string
4764 follows immediately below, changes may need to be made to both.
4765 Be careful.
4766 */
a0ed51b3 4767 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4768 if (longest_float_length
c277df42
IZ
4769 || (data.flags & SF_FL_BEFORE_EOL
4770 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4771 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4772 {
1182767e 4773 I32 t,ml;
cf93c79d 4774
1de06328 4775 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4776 && data.offset_fixed == data.offset_float_min
4777 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4778 goto remove_float; /* As in (a)+. */
4779
1de06328
YO
4780 /* copy the information about the longest float from the reg_scan_data
4781 over to the program. */
33b8afdf
JH
4782 if (SvUTF8(data.longest_float)) {
4783 r->float_utf8 = data.longest_float;
c445ea15 4784 r->float_substr = NULL;
33b8afdf
JH
4785 } else {
4786 r->float_substr = data.longest_float;
c445ea15 4787 r->float_utf8 = NULL;
33b8afdf 4788 }
1de06328
YO
4789 /* float_end_shift is how many chars that must be matched that
4790 follow this item. We calculate it ahead of time as once the
4791 lookbehind offset is added in we lose the ability to correctly
4792 calculate it.*/
4793 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4794 : (I32)longest_float_length;
1de06328
YO
4795 r->float_end_shift = ml - data.offset_float_min
4796 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4797 + data.lookbehind_float;
4798 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4799 r->float_max_offset = data.offset_float_max;
1182767e 4800 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4801 r->float_max_offset -= data.lookbehind_float;
4802
cf93c79d
IZ
4803 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4804 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4805 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4806 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4807 }
4808 else {
aca2d497 4809 remove_float:
c445ea15 4810 r->float_substr = r->float_utf8 = NULL;
c277df42 4811 SvREFCNT_dec(data.longest_float);
c5254dd6 4812 longest_float_length = 0;
a0d0e21e 4813 }
c277df42 4814
1de06328
YO
4815 /* Note that code very similar to this but for floating string
4816 is immediately above, changes may need to be made to both.
4817 Be careful.
4818 */
a0ed51b3 4819 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4820 if (longest_fixed_length
c277df42
IZ
4821 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4822 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4823 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4824 {
1182767e 4825 I32 t,ml;
cf93c79d 4826
1de06328
YO
4827 /* copy the information about the longest fixed
4828 from the reg_scan_data over to the program. */
33b8afdf
JH
4829 if (SvUTF8(data.longest_fixed)) {
4830 r->anchored_utf8 = data.longest_fixed;
c445ea15 4831 r->anchored_substr = NULL;
33b8afdf
JH
4832 } else {
4833 r->anchored_substr = data.longest_fixed;
c445ea15 4834 r->anchored_utf8 = NULL;
33b8afdf 4835 }
1de06328
YO
4836 /* fixed_end_shift is how many chars that must be matched that
4837 follow this item. We calculate it ahead of time as once the
4838 lookbehind offset is added in we lose the ability to correctly
4839 calculate it.*/
4840 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4841 : (I32)longest_fixed_length;
1de06328
YO
4842 r->anchored_end_shift = ml - data.offset_fixed
4843 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4844 + data.lookbehind_fixed;
4845 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4846
cf93c79d
IZ
4847 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4848 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4849 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4850 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4851 }
4852 else {
c445ea15 4853 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4854 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4855 longest_fixed_length = 0;
a0d0e21e 4856 }
f8fc2ecf
YO
4857 if (ri->regstclass
4858 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4859 ri->regstclass = NULL;
33b8afdf
JH
4860 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4861 && stclass_flag
653099ff 4862 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4863 && !cl_is_anything(data.start_class))
4864 {
2eccd3b2 4865 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4866
f8fc2ecf 4867 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4868 struct regnode_charclass_class);
4869 StructCopy(data.start_class,
f8fc2ecf 4870 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4871 struct regnode_charclass_class);
f8fc2ecf 4872 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4873 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4874 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4875 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4876 PerlIO_printf(Perl_debug_log,
a0288114 4877 "synthetic stclass \"%s\".\n",
3f7c398e 4878 SvPVX_const(sv));});
653099ff 4879 }
c277df42
IZ
4880
4881 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4882 if (longest_fixed_length > longest_float_length) {
1de06328 4883 r->check_end_shift = r->anchored_end_shift;
c277df42 4884 r->check_substr = r->anchored_substr;
33b8afdf 4885 r->check_utf8 = r->anchored_utf8;
c277df42 4886 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4887 if (r->extflags & RXf_ANCH_SINGLE)
4888 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4889 }
4890 else {
1de06328 4891 r->check_end_shift = r->float_end_shift;
c277df42 4892 r->check_substr = r->float_substr;
33b8afdf 4893 r->check_utf8 = r->float_utf8;
1de06328
YO
4894 r->check_offset_min = r->float_min_offset;
4895 r->check_offset_max = r->float_max_offset;
a0d0e21e 4896 }
30382c73
IZ
4897 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4898 This should be changed ASAP! */
bbe252da
YO
4899 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4900 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4901 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4902 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4903 }
1de06328
YO
4904 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4905 if ( (STRLEN)minlen < longest_float_length )
4906 minlen= longest_float_length;
4907 if ( (STRLEN)minlen < longest_fixed_length )
4908 minlen= longest_fixed_length;
4909 */
a0ed51b3
LW
4910 }
4911 else {
c277df42
IZ
4912 /* Several toplevels. Best we can is to set minlen. */
4913 I32 fake;
653099ff 4914 struct regnode_charclass_class ch_class;
cb434fcc 4915 I32 last_close = 0;
c277df42 4916
5d458dd8 4917 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4918
f8fc2ecf 4919 scan = ri->program + 1;
830247a4 4920 cl_init(pRExC_state, &ch_class);
653099ff 4921 data.start_class = &ch_class;
cb434fcc 4922 data.last_closep = &last_close;
07be1b83 4923
de8c5301 4924
1de06328 4925 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4926 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4927
786e8c11 4928 CHECK_RESTUDY_GOTO;
07be1b83 4929
33b8afdf 4930 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4931 = r->float_substr = r->float_utf8 = NULL;
653099ff 4932 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4933 && !cl_is_anything(data.start_class))
4934 {
2eccd3b2 4935 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4936
f8fc2ecf 4937 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4938 struct regnode_charclass_class);
4939 StructCopy(data.start_class,
f8fc2ecf 4940 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4941 struct regnode_charclass_class);
f8fc2ecf 4942 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4943 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4944 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4945 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4946 PerlIO_printf(Perl_debug_log,
a0288114 4947 "synthetic stclass \"%s\".\n",
3f7c398e 4948 SvPVX_const(sv));});
653099ff 4949 }
a0d0e21e
LW
4950 }
4951
1de06328
YO
4952 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4953 the "real" pattern. */
cf9788e3
RGS
4954 DEBUG_OPTIMISE_r({
4955 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4956 (IV)minlen, (IV)r->minlen);
cf9788e3 4957 });
de8c5301 4958 r->minlenret = minlen;
1de06328
YO
4959 if (r->minlen < minlen)
4960 r->minlen = minlen;
4961
b81d288d 4962 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4963 r->extflags |= RXf_GPOS_SEEN;
830247a4 4964 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4965 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4966 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4967 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4968 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4969 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4970 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4971 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4972 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4973 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4974 if (RExC_paren_names)
85fbaab2 4975 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4976 else
5daac39c 4977 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4978
7bd1e614 4979#ifdef STUPID_PATTERN_CHECKS
5509d87a 4980 if (RX_PRELEN(rx) == 0)
640f820d 4981 r->extflags |= RXf_NULL;
5509d87a 4982 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4983 /* XXX: this should happen BEFORE we compile */
4984 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4985 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4986 r->extflags |= RXf_WHITE;
5509d87a 4987 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4988 r->extflags |= RXf_START_ONLY;
f1b875a0 4989#else
5509d87a 4990 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4991 /* XXX: this should happen BEFORE we compile */
4992 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4993 else {
4994 regnode *first = ri->program + 1;
39aa8307
JH
4995 U8 fop = OP(first);
4996 U8 nop = OP(NEXTOPER(first));
7bd1e614 4997
640f820d
AB
4998 if (PL_regkind[fop] == NOTHING && nop == END)
4999 r->extflags |= RXf_NULL;
5000 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5001 r->extflags |= RXf_START_ONLY;
5002 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5003 r->extflags |= RXf_WHITE;
5004 }
f1b875a0 5005#endif
1f1031fe
YO
5006#ifdef DEBUGGING
5007 if (RExC_paren_names) {
af534a04 5008 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5009 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5010 } else
1f1031fe 5011#endif
cde0cee5 5012 ri->name_list_idx = 0;
1f1031fe 5013
40d049e4
YO
5014 if (RExC_recurse_count) {
5015 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5016 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5017 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5018 }
5019 }
f0ab9afb 5020 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5021 /* assume we don't need to swap parens around before we match */
5022
be8e71aa
YO
5023 DEBUG_DUMP_r({
5024 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5025 regdump(r);
5026 });
7122b237
YO
5027#ifdef RE_TRACK_PATTERN_OFFSETS
5028 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5029 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5030 U32 i;
5031 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5032 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5033 for (i = 1; i <= len; i++) {
7122b237 5034 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5035 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5036 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5037 }
5038 PerlIO_printf(Perl_debug_log, "\n");
5039 });
7122b237 5040#endif
288b8c02 5041 return rx;
a687059c
LW
5042}
5043
f9f4320a 5044#undef RE_ENGINE_PTR
3dab1dad 5045
93b32b6d 5046
81714fb9 5047SV*
192b9cd1
AB
5048Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5049 const U32 flags)
5050{
7918f24d
NC
5051 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5052
192b9cd1
AB
5053 PERL_UNUSED_ARG(value);
5054
f1b875a0 5055 if (flags & RXapif_FETCH) {
192b9cd1 5056 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5057 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5058 Perl_croak_no_modify(aTHX);
192b9cd1 5059 return NULL;
f1b875a0 5060 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5061 return reg_named_buff_exists(rx, key, flags)
5062 ? &PL_sv_yes
5063 : &PL_sv_no;
f1b875a0 5064 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5065 return reg_named_buff_all(rx, flags);
f1b875a0 5066 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5067 return reg_named_buff_scalar(rx, flags);
5068 } else {
5069 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5070 return NULL;
5071 }
5072}
5073
5074SV*
5075Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5076 const U32 flags)
5077{
7918f24d 5078 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5079 PERL_UNUSED_ARG(lastkey);
5080
f1b875a0 5081 if (flags & RXapif_FIRSTKEY)
192b9cd1 5082 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5083 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5084 return reg_named_buff_nextkey(rx, flags);
5085 else {
5086 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5087 return NULL;
5088 }
5089}
5090
5091SV*
288b8c02
NC
5092Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5093 const U32 flags)
81714fb9 5094{
44a2ac75
YO
5095 AV *retarray = NULL;
5096 SV *ret;
288b8c02 5097 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5098
5099 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5100
f1b875a0 5101 if (flags & RXapif_ALL)
44a2ac75 5102 retarray=newAV();
93b32b6d 5103
5daac39c
NC
5104 if (rx && RXp_PAREN_NAMES(rx)) {
5105 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5106 if (he_str) {
5107 IV i;
5108 SV* sv_dat=HeVAL(he_str);
5109 I32 *nums=(I32*)SvPVX(sv_dat);
5110 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5111 if ((I32)(rx->nparens) >= nums[i]
5112 && rx->offs[nums[i]].start != -1
5113 && rx->offs[nums[i]].end != -1)
93b32b6d 5114 {
49d7dfbc 5115 ret = newSVpvs("");
288b8c02 5116 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5117 if (!retarray)
5118 return ret;
5119 } else {
5120 ret = newSVsv(&PL_sv_undef);
5121 }
ec83ea38 5122 if (retarray)
93b32b6d 5123 av_push(retarray, ret);
81714fb9 5124 }
93b32b6d 5125 if (retarray)
ad64d0ec 5126 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5127 }
5128 }
5129 return NULL;
5130}
5131
5132bool
288b8c02 5133Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5134 const U32 flags)
5135{
288b8c02 5136 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5137
5138 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5139
5daac39c 5140 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5141 if (flags & RXapif_ALL) {
5daac39c 5142 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5143 } else {
288b8c02 5144 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5145 if (sv) {
5146 SvREFCNT_dec(sv);
192b9cd1
AB
5147 return TRUE;
5148 } else {
5149 return FALSE;
5150 }
5151 }
5152 } else {
5153 return FALSE;
5154 }
5155}
5156
5157SV*
288b8c02 5158Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5159{
288b8c02 5160 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5161
5162 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5163
5daac39c
NC
5164 if ( rx && RXp_PAREN_NAMES(rx) ) {
5165 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5166
288b8c02 5167 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5168 } else {
5169 return FALSE;
5170 }
192b9cd1
AB
5171}
5172
5173SV*
288b8c02 5174Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5175{
288b8c02 5176 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5177 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5178
5179 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5180
5daac39c
NC
5181 if (rx && RXp_PAREN_NAMES(rx)) {
5182 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5183 HE *temphe;
5184 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5185 IV i;
5186 IV parno = 0;
5187 SV* sv_dat = HeVAL(temphe);
5188 I32 *nums = (I32*)SvPVX(sv_dat);
5189 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5190 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5191 rx->offs[nums[i]].start != -1 &&
5192 rx->offs[nums[i]].end != -1)
5193 {
5194 parno = nums[i];
5195 break;
5196 }
5197 }
f1b875a0 5198 if (parno || flags & RXapif_ALL) {
a663657d 5199 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5200 }
81714fb9
YO
5201 }
5202 }
44a2ac75
YO
5203 return NULL;
5204}
5205
192b9cd1 5206SV*
288b8c02 5207Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5208{
5209 SV *ret;
5210 AV *av;
5211 I32 length;
288b8c02 5212 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5213
7918f24d
NC
5214 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5215
5daac39c 5216 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5217 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5218 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5219 } else if (flags & RXapif_ONE) {
288b8c02 5220 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5221 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5222 length = av_len(av);
ec83ea38 5223 SvREFCNT_dec(ret);
192b9cd1
AB
5224 return newSViv(length + 1);
5225 } else {
5226 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5227 return NULL;
5228 }
5229 }
5230 return &PL_sv_undef;
5231}
5232
5233SV*
288b8c02 5234Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5235{
288b8c02 5236 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5237 AV *av = newAV();
5238
7918f24d
NC
5239 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5240
5daac39c
NC
5241 if (rx && RXp_PAREN_NAMES(rx)) {
5242 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5243 HE *temphe;
5244 (void)hv_iterinit(hv);
5245 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5246 IV i;
5247 IV parno = 0;
5248 SV* sv_dat = HeVAL(temphe);
5249 I32 *nums = (I32*)SvPVX(sv_dat);
5250 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5251 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5252 rx->offs[nums[i]].start != -1 &&
5253 rx->offs[nums[i]].end != -1)
5254 {
5255 parno = nums[i];
5256 break;
5257 }
5258 }
f1b875a0 5259 if (parno || flags & RXapif_ALL) {
a663657d 5260 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5261 }
5262 }
5263 }
5264
ad64d0ec 5265 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5266}
5267
49d7dfbc 5268void
288b8c02
NC
5269Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5270 SV * const sv)
44a2ac75 5271{
288b8c02 5272 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5273 char *s = NULL;
a9d504c3 5274 I32 i = 0;
44a2ac75 5275 I32 s1, t1;
7918f24d
NC
5276
5277 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5278
cde0cee5
YO
5279 if (!rx->subbeg) {
5280 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5281 return;
cde0cee5
YO
5282 }
5283 else
f1b875a0 5284 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5285 /* $` */
f0ab9afb 5286 i = rx->offs[0].start;
cde0cee5 5287 s = rx->subbeg;
44a2ac75
YO
5288 }
5289 else
f1b875a0 5290 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5291 /* $' */
f0ab9afb
NC
5292 s = rx->subbeg + rx->offs[0].end;
5293 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5294 }
5295 else
5296 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5297 (s1 = rx->offs[paren].start) != -1 &&
5298 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5299 {
5300 /* $& $1 ... */
5301 i = t1 - s1;
5302 s = rx->subbeg + s1;
cde0cee5
YO
5303 } else {
5304 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5305 return;
cde0cee5
YO
5306 }
5307 assert(rx->sublen >= (s - rx->subbeg) + i );
5308 if (i >= 0) {
5309 const int oldtainted = PL_tainted;
5310 TAINT_NOT;
5311 sv_setpvn(sv, s, i);
5312 PL_tainted = oldtainted;
5313 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5314 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5315 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5316 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5317 {
5318 SvUTF8_on(sv);
5319 }
5320 else
5321 SvUTF8_off(sv);
5322 if (PL_tainting) {
07bc277f 5323 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5324 if (SvTYPE(sv) >= SVt_PVMG) {
5325 MAGIC* const mg = SvMAGIC(sv);
5326 MAGIC* mgt;
5327 PL_tainted = 1;
5328 SvMAGIC_set(sv, mg->mg_moremagic);
5329 SvTAINT(sv);
5330 if ((mgt = SvMAGIC(sv))) {
5331 mg->mg_moremagic = mgt;
5332 SvMAGIC_set(sv, mg);
44a2ac75 5333 }
cde0cee5
YO
5334 } else {
5335 PL_tainted = 1;
5336 SvTAINT(sv);
5337 }
5338 } else
5339 SvTAINTED_off(sv);
44a2ac75 5340 }
81714fb9 5341 } else {
44a2ac75 5342 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5343 return;
81714fb9
YO
5344 }
5345}
93b32b6d 5346
2fdbfb4d
AB
5347void
5348Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5349 SV const * const value)
5350{
7918f24d
NC
5351 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5352
2fdbfb4d
AB
5353 PERL_UNUSED_ARG(rx);
5354 PERL_UNUSED_ARG(paren);
5355 PERL_UNUSED_ARG(value);
5356
5357 if (!PL_localizing)
6ad8f254 5358 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5359}
5360
5361I32
288b8c02 5362Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5363 const I32 paren)
5364{
288b8c02 5365 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5366 I32 i;
5367 I32 s1, t1;
5368
7918f24d
NC
5369 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5370
2fdbfb4d
AB
5371 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5372 switch (paren) {
192b9cd1 5373 /* $` / ${^PREMATCH} */
f1b875a0 5374 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5375 if (rx->offs[0].start != -1) {
5376 i = rx->offs[0].start;
5377 if (i > 0) {
5378 s1 = 0;
5379 t1 = i;
5380 goto getlen;
5381 }
5382 }
5383 return 0;
192b9cd1 5384 /* $' / ${^POSTMATCH} */
f1b875a0 5385 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5386 if (rx->offs[0].end != -1) {
5387 i = rx->sublen - rx->offs[0].end;
5388 if (i > 0) {
5389 s1 = rx->offs[0].end;
5390 t1 = rx->sublen;
5391 goto getlen;
5392 }
5393 }
5394 return 0;
192b9cd1
AB
5395 /* $& / ${^MATCH}, $1, $2, ... */
5396 default:
2fdbfb4d
AB
5397 if (paren <= (I32)rx->nparens &&
5398 (s1 = rx->offs[paren].start) != -1 &&
5399 (t1 = rx->offs[paren].end) != -1)
5400 {
5401 i = t1 - s1;
5402 goto getlen;
5403 } else {
5404 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5405 report_uninit((const SV *)sv);
2fdbfb4d
AB
5406 return 0;
5407 }
5408 }
5409 getlen:
07bc277f 5410 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5411 const char * const s = rx->subbeg + s1;
5412 const U8 *ep;
5413 STRLEN el;
5414
5415 i = t1 - s1;
5416 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5417 i = el;
5418 }
5419 return i;
5420}
5421
fe578d7f 5422SV*
49d7dfbc 5423Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5424{
7918f24d 5425 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5426 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5427 if (0)
5428 return NULL;
5429 else
5430 return newSVpvs("Regexp");
fe578d7f 5431}
0a4db386 5432
894be9b7 5433/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5434 * If flags is REG_RSN_RETURN_NULL returns null.
5435 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5436 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5437 * to the parsed name as looked up in the RExC_paren_names hash.
5438 * If there is an error throws a vFAIL().. type exception.
894be9b7 5439 */
0a4db386
YO
5440
5441#define REG_RSN_RETURN_NULL 0
5442#define REG_RSN_RETURN_NAME 1
5443#define REG_RSN_RETURN_DATA 2
5444
894be9b7 5445STATIC SV*
7918f24d
NC
5446S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5447{
894be9b7 5448 char *name_start = RExC_parse;
1f1031fe 5449
7918f24d
NC
5450 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5451
1f1031fe
YO
5452 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5453 /* skip IDFIRST by using do...while */
5454 if (UTF)
5455 do {
5456 RExC_parse += UTF8SKIP(RExC_parse);
5457 } while (isALNUM_utf8((U8*)RExC_parse));
5458 else
5459 do {
5460 RExC_parse++;
5461 } while (isALNUM(*RExC_parse));
894be9b7 5462 }
1f1031fe 5463
0a4db386 5464 if ( flags ) {
59cd0e26
NC
5465 SV* sv_name
5466 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5467 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5468 if ( flags == REG_RSN_RETURN_NAME)
5469 return sv_name;
5470 else if (flags==REG_RSN_RETURN_DATA) {
5471 HE *he_str = NULL;
5472 SV *sv_dat = NULL;
5473 if ( ! sv_name ) /* should not happen*/
5474 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5475 if (RExC_paren_names)
5476 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5477 if ( he_str )
5478 sv_dat = HeVAL(he_str);
5479 if ( ! sv_dat )
5480 vFAIL("Reference to nonexistent named group");
5481 return sv_dat;
5482 }
5483 else {
5484 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5485 }
5486 /* NOT REACHED */
894be9b7 5487 }
0a4db386 5488 return NULL;
894be9b7
YO
5489}
5490
3dab1dad
YO
5491#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5492 int rem=(int)(RExC_end - RExC_parse); \
5493 int cut; \
5494 int num; \
5495 int iscut=0; \
5496 if (rem>10) { \
5497 rem=10; \
5498 iscut=1; \
5499 } \
5500 cut=10-rem; \
5501 if (RExC_lastparse!=RExC_parse) \
5502 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5503 rem, RExC_parse, \
5504 cut + 4, \
5505 iscut ? "..." : "<" \
5506 ); \
5507 else \
5508 PerlIO_printf(Perl_debug_log,"%16s",""); \
5509 \
5510 if (SIZE_ONLY) \
3b57cd43 5511 num = RExC_size + 1; \
3dab1dad
YO
5512 else \
5513 num=REG_NODE_NUM(RExC_emit); \
5514 if (RExC_lastnum!=num) \
0a4db386 5515 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5516 else \
0a4db386 5517 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5518 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5519 (int)((depth*2)), "", \
3dab1dad
YO
5520 (funcname) \
5521 ); \
5522 RExC_lastnum=num; \
5523 RExC_lastparse=RExC_parse; \
5524})
5525
07be1b83
YO
5526
5527
3dab1dad
YO
5528#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5529 DEBUG_PARSE_MSG((funcname)); \
5530 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5531})
6bda09f9
YO
5532#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5533 DEBUG_PARSE_MSG((funcname)); \
5534 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5535})
a687059c
LW
5536/*
5537 - reg - regular expression, i.e. main body or parenthesized thing
5538 *
5539 * Caller must absorb opening parenthesis.
5540 *
5541 * Combining parenthesis handling with the base level of regular expression
5542 * is a trifle forced, but the need to tie the tails of the branches to what
5543 * follows makes it hard to avoid.
5544 */
07be1b83
YO
5545#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5546#ifdef DEBUGGING
5547#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5548#else
5549#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5550#endif
3dab1dad 5551
76e3520e 5552STATIC regnode *
3dab1dad 5553S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5554 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5555{
27da23d5 5556 dVAR;
c277df42
IZ
5557 register regnode *ret; /* Will be the head of the group. */
5558 register regnode *br;
5559 register regnode *lastbr;
cbbf8932 5560 register regnode *ender = NULL;
a0d0e21e 5561 register I32 parno = 0;
cbbf8932 5562 I32 flags;
f7819f85 5563 U32 oregflags = RExC_flags;
6136c704
AL
5564 bool have_branch = 0;
5565 bool is_open = 0;
594d7033
YO
5566 I32 freeze_paren = 0;
5567 I32 after_freeze = 0;
9d1d55b5
JP
5568
5569 /* for (?g), (?gc), and (?o) warnings; warning
5570 about (?c) will warn about (?g) -- japhy */
5571
6136c704
AL
5572#define WASTED_O 0x01
5573#define WASTED_G 0x02
5574#define WASTED_C 0x04
5575#define WASTED_GC (0x02|0x04)
cbbf8932 5576 I32 wastedflags = 0x00;
9d1d55b5 5577
fac92740 5578 char * parse_start = RExC_parse; /* MJD */
a28509cc 5579 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5580
3dab1dad 5581 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5582
5583 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5584 DEBUG_PARSE("reg ");
5585
821b33a5 5586 *flagp = 0; /* Tentatively. */
a0d0e21e 5587
9d1d55b5 5588
a0d0e21e
LW
5589 /* Make an OPEN node, if parenthesized. */
5590 if (paren) {
e2e6a0f1
YO
5591 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5592 char *start_verb = RExC_parse;
5593 STRLEN verb_len = 0;
5594 char *start_arg = NULL;
5595 unsigned char op = 0;
5596 int argok = 1;
5597 int internal_argval = 0; /* internal_argval is only useful if !argok */
5598 while ( *RExC_parse && *RExC_parse != ')' ) {
5599 if ( *RExC_parse == ':' ) {
5600 start_arg = RExC_parse + 1;
5601 break;
5602 }
5603 RExC_parse++;
5604 }
5605 ++start_verb;
5606 verb_len = RExC_parse - start_verb;
5607 if ( start_arg ) {
5608 RExC_parse++;
5609 while ( *RExC_parse && *RExC_parse != ')' )
5610 RExC_parse++;
5611 if ( *RExC_parse != ')' )
5612 vFAIL("Unterminated verb pattern argument");
5613 if ( RExC_parse == start_arg )
5614 start_arg = NULL;
5615 } else {
5616 if ( *RExC_parse != ')' )
5617 vFAIL("Unterminated verb pattern");
5618 }
5d458dd8 5619
e2e6a0f1
YO
5620 switch ( *start_verb ) {
5621 case 'A': /* (*ACCEPT) */
568a785a 5622 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5623 op = ACCEPT;
5624 internal_argval = RExC_nestroot;
5625 }
5626 break;
5627 case 'C': /* (*COMMIT) */
568a785a 5628 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5629 op = COMMIT;
e2e6a0f1
YO
5630 break;
5631 case 'F': /* (*FAIL) */
568a785a 5632 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5633 op = OPFAIL;
5634 argok = 0;
5635 }
5636 break;
5d458dd8
YO
5637 case ':': /* (*:NAME) */
5638 case 'M': /* (*MARK:NAME) */
568a785a 5639 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5640 op = MARKPOINT;
5d458dd8
YO
5641 argok = -1;
5642 }
5643 break;
5644 case 'P': /* (*PRUNE) */
568a785a 5645 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5646 op = PRUNE;
e2e6a0f1 5647 break;
5d458dd8 5648 case 'S': /* (*SKIP) */
568a785a 5649 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5650 op = SKIP;
5651 break;
5652 case 'T': /* (*THEN) */
5653 /* [19:06] <TimToady> :: is then */
568a785a 5654 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5655 op = CUTGROUP;
5656 RExC_seen |= REG_SEEN_CUTGROUP;
5657 }
e2e6a0f1
YO
5658 break;
5659 }
5660 if ( ! op ) {
5661 RExC_parse++;
5662 vFAIL3("Unknown verb pattern '%.*s'",
5663 verb_len, start_verb);
5664 }
5665 if ( argok ) {
5666 if ( start_arg && internal_argval ) {
5667 vFAIL3("Verb pattern '%.*s' may not have an argument",
5668 verb_len, start_verb);
5669 } else if ( argok < 0 && !start_arg ) {
5670 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5671 verb_len, start_verb);
5672 } else {
5673 ret = reganode(pRExC_state, op, internal_argval);
5674 if ( ! internal_argval && ! SIZE_ONLY ) {
5675 if (start_arg) {
5676 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5677 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5678 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5679 ret->flags = 0;
5680 } else {
5681 ret->flags = 1;
5682 }
5683 }
5684 }
5685 if (!internal_argval)
5686 RExC_seen |= REG_SEEN_VERBARG;
5687 } else if ( start_arg ) {
5688 vFAIL3("Verb pattern '%.*s' may not have an argument",
5689 verb_len, start_verb);
5690 } else {
5691 ret = reg_node(pRExC_state, op);
5692 }
5693 nextchar(pRExC_state);
5694 return ret;
5695 } else
fac92740 5696 if (*RExC_parse == '?') { /* (?...) */
6136c704 5697 bool is_logical = 0;
a28509cc 5698 const char * const seqstart = RExC_parse;
fb85c044 5699 bool has_use_defaults = FALSE;
ca9dfc88 5700
830247a4
IZ
5701 RExC_parse++;
5702 paren = *RExC_parse++;
c277df42 5703 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5704 switch (paren) {
894be9b7 5705
1f1031fe
YO
5706 case 'P': /* (?P...) variants for those used to PCRE/Python */
5707 paren = *RExC_parse++;
5708 if ( paren == '<') /* (?P<...>) named capture */
5709 goto named_capture;
5710 else if (paren == '>') { /* (?P>name) named recursion */
5711 goto named_recursion;
5712 }
5713 else if (paren == '=') { /* (?P=...) named backref */
5714 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5715 you change this make sure you change that */
5716 char* name_start = RExC_parse;
5717 U32 num = 0;
5718 SV *sv_dat = reg_scan_name(pRExC_state,
5719 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5720 if (RExC_parse == name_start || *RExC_parse != ')')
5721 vFAIL2("Sequence %.3s... not terminated",parse_start);
5722
5723 if (!SIZE_ONLY) {
5724 num = add_data( pRExC_state, 1, "S" );
5725 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5726 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5727 }
5728 RExC_sawback = 1;
5729 ret = reganode(pRExC_state,
5730 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5731 num);
5732 *flagp |= HASWIDTH;
5733
5734 Set_Node_Offset(ret, parse_start+1);
5735 Set_Node_Cur_Length(ret); /* MJD */
5736
5737 nextchar(pRExC_state);
5738 return ret;
5739 }
57b84237
YO
5740 RExC_parse++;
5741 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5742 /*NOTREACHED*/
5743 case '<': /* (?<...) */
b81d288d 5744 if (*RExC_parse == '!')
c277df42 5745 paren = ',';
0a4db386 5746 else if (*RExC_parse != '=')
1f1031fe 5747 named_capture:
0a4db386 5748 { /* (?<...>) */
81714fb9 5749 char *name_start;
894be9b7 5750 SV *svname;
81714fb9
YO
5751 paren= '>';
5752 case '\'': /* (?'...') */
5753 name_start= RExC_parse;
0a4db386
YO
5754 svname = reg_scan_name(pRExC_state,
5755 SIZE_ONLY ? /* reverse test from the others */
5756 REG_RSN_RETURN_NAME :
5757 REG_RSN_RETURN_NULL);
57b84237
YO
5758 if (RExC_parse == name_start) {
5759 RExC_parse++;
5760 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5761 /*NOTREACHED*/
5762 }
81714fb9
YO
5763 if (*RExC_parse != paren)
5764 vFAIL2("Sequence (?%c... not terminated",
5765 paren=='>' ? '<' : paren);
5766 if (SIZE_ONLY) {
e62cc96a
YO
5767 HE *he_str;
5768 SV *sv_dat = NULL;
894be9b7
YO
5769 if (!svname) /* shouldnt happen */
5770 Perl_croak(aTHX_
5771 "panic: reg_scan_name returned NULL");
81714fb9
YO
5772 if (!RExC_paren_names) {
5773 RExC_paren_names= newHV();
ad64d0ec 5774 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5775#ifdef DEBUGGING
5776 RExC_paren_name_list= newAV();
ad64d0ec 5777 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5778#endif
81714fb9
YO
5779 }
5780 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5781 if ( he_str )
81714fb9 5782 sv_dat = HeVAL(he_str);
e62cc96a 5783 if ( ! sv_dat ) {
81714fb9 5784 /* croak baby croak */
e62cc96a
YO
5785 Perl_croak(aTHX_
5786 "panic: paren_name hash element allocation failed");
5787 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5788 /* (?|...) can mean we have dupes so scan to check
5789 its already been stored. Maybe a flag indicating
5790 we are inside such a construct would be useful,
5791 but the arrays are likely to be quite small, so
5792 for now we punt -- dmq */
5793 IV count = SvIV(sv_dat);
5794 I32 *pv = (I32*)SvPVX(sv_dat);
5795 IV i;
5796 for ( i = 0 ; i < count ; i++ ) {
5797 if ( pv[i] == RExC_npar ) {
5798 count = 0;
5799 break;
5800 }
5801 }
5802 if ( count ) {
5803 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5804 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5805 pv[count] = RExC_npar;
3a92e6ae 5806 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5807 }
81714fb9
YO
5808 } else {
5809 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5810 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5811 SvIOK_on(sv_dat);
3ec35e0f 5812 SvIV_set(sv_dat, 1);
e62cc96a 5813 }
1f1031fe
YO
5814#ifdef DEBUGGING
5815 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5816 SvREFCNT_dec(svname);
5817#endif
e62cc96a 5818
81714fb9
YO
5819 /*sv_dump(sv_dat);*/
5820 }
5821 nextchar(pRExC_state);
5822 paren = 1;
5823 goto capturing_parens;
5824 }
5825 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5826 RExC_parse++;
fac92740 5827 case '=': /* (?=...) */
89c6a13e 5828 RExC_seen_zerolen++;
5c3fa2e7 5829 break;
fac92740 5830 case '!': /* (?!...) */
830247a4 5831 RExC_seen_zerolen++;
e2e6a0f1
YO
5832 if (*RExC_parse == ')') {
5833 ret=reg_node(pRExC_state, OPFAIL);
5834 nextchar(pRExC_state);
5835 return ret;
5836 }
594d7033
YO
5837 break;
5838 case '|': /* (?|...) */
5839 /* branch reset, behave like a (?:...) except that
5840 buffers in alternations share the same numbers */
5841 paren = ':';
5842 after_freeze = freeze_paren = RExC_npar;
5843 break;
fac92740
MJD
5844 case ':': /* (?:...) */
5845 case '>': /* (?>...) */
a0d0e21e 5846 break;
fac92740
MJD
5847 case '$': /* (?$...) */
5848 case '@': /* (?@...) */
8615cb43 5849 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5850 break;
fac92740 5851 case '#': /* (?#...) */
830247a4
IZ
5852 while (*RExC_parse && *RExC_parse != ')')
5853 RExC_parse++;
5854 if (*RExC_parse != ')')
c277df42 5855 FAIL("Sequence (?#... not terminated");
830247a4 5856 nextchar(pRExC_state);
a0d0e21e
LW
5857 *flagp = TRYAGAIN;
5858 return NULL;
894be9b7
YO
5859 case '0' : /* (?0) */
5860 case 'R' : /* (?R) */
5861 if (*RExC_parse != ')')
6bda09f9 5862 FAIL("Sequence (?R) not terminated");
1a147d38 5863 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5864 *flagp |= POSTPONED;
7f69552c
YO
5865 nextchar(pRExC_state);
5866 return ret;
5867 /*notreached*/
894be9b7
YO
5868 { /* named and numeric backreferences */
5869 I32 num;
894be9b7
YO
5870 case '&': /* (?&NAME) */
5871 parse_start = RExC_parse - 1;
1f1031fe 5872 named_recursion:
894be9b7 5873 {
0a4db386
YO
5874 SV *sv_dat = reg_scan_name(pRExC_state,
5875 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5876 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5877 }
5878 goto gen_recurse_regop;
5879 /* NOT REACHED */
542fa716
YO
5880 case '+':
5881 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5882 RExC_parse++;
5883 vFAIL("Illegal pattern");
5884 }
5885 goto parse_recursion;
5886 /* NOT REACHED*/
5887 case '-': /* (?-1) */
5888 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5889 RExC_parse--; /* rewind to let it be handled later */
5890 goto parse_flags;
5891 }
5892 /*FALLTHROUGH */
6bda09f9
YO
5893 case '1': case '2': case '3': case '4': /* (?1) */
5894 case '5': case '6': case '7': case '8': case '9':
5895 RExC_parse--;
542fa716 5896 parse_recursion:
894be9b7
YO
5897 num = atoi(RExC_parse);
5898 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5899 if (*RExC_parse == '-')
5900 RExC_parse++;
6bda09f9
YO
5901 while (isDIGIT(*RExC_parse))
5902 RExC_parse++;
5903 if (*RExC_parse!=')')
5904 vFAIL("Expecting close bracket");
894be9b7
YO
5905
5906 gen_recurse_regop:
542fa716
YO
5907 if ( paren == '-' ) {
5908 /*
5909 Diagram of capture buffer numbering.
5910 Top line is the normal capture buffer numbers
5911 Botton line is the negative indexing as from
5912 the X (the (?-2))
5913
5914 + 1 2 3 4 5 X 6 7
5915 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5916 - 5 4 3 2 1 X x x
5917
5918 */
5919 num = RExC_npar + num;
5920 if (num < 1) {
5921 RExC_parse++;
5922 vFAIL("Reference to nonexistent group");
5923 }
5924 } else if ( paren == '+' ) {
5925 num = RExC_npar + num - 1;
5926 }
5927
1a147d38 5928 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5929 if (!SIZE_ONLY) {
5930 if (num > (I32)RExC_rx->nparens) {
5931 RExC_parse++;
5932 vFAIL("Reference to nonexistent group");
5933 }
40d049e4 5934 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5935 RExC_emit++;
226de585 5936 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5937 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5938 } else {
6bda09f9 5939 RExC_size++;
6bda09f9 5940 }
0a4db386 5941 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5942 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5943 Set_Node_Offset(ret, parse_start); /* MJD */
5944
a3b492c3 5945 *flagp |= POSTPONED;
6bda09f9
YO
5946 nextchar(pRExC_state);
5947 return ret;
894be9b7
YO
5948 } /* named and numeric backreferences */
5949 /* NOT REACHED */
5950
fac92740 5951 case '?': /* (??...) */
6136c704 5952 is_logical = 1;
57b84237
YO
5953 if (*RExC_parse != '{') {
5954 RExC_parse++;
5955 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5956 /*NOTREACHED*/
5957 }
a3b492c3 5958 *flagp |= POSTPONED;
830247a4 5959 paren = *RExC_parse++;
0f5d15d6 5960 /* FALL THROUGH */
fac92740 5961 case '{': /* (?{...}) */
c277df42 5962 {
2eccd3b2
NC
5963 I32 count = 1;
5964 U32 n = 0;
c277df42 5965 char c;
830247a4 5966 char *s = RExC_parse;
c277df42 5967
830247a4
IZ
5968 RExC_seen_zerolen++;
5969 RExC_seen |= REG_SEEN_EVAL;
5970 while (count && (c = *RExC_parse)) {
6136c704
AL
5971 if (c == '\\') {
5972 if (RExC_parse[1])
5973 RExC_parse++;
5974 }
b81d288d 5975 else if (c == '{')
c277df42 5976 count++;
b81d288d 5977 else if (c == '}')
c277df42 5978 count--;
830247a4 5979 RExC_parse++;
c277df42 5980 }
6136c704 5981 if (*RExC_parse != ')') {
b81d288d 5982 RExC_parse = s;
b45f050a
JF
5983 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5984 }
c277df42 5985 if (!SIZE_ONLY) {
f3548bdc 5986 PAD *pad;
6136c704
AL
5987 OP_4tree *sop, *rop;
5988 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5989
569233ed
SB
5990 ENTER;
5991 Perl_save_re_context(aTHX);
f3548bdc 5992 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5993 sop->op_private |= OPpREFCOUNTED;
5994 /* re_dup will OpREFCNT_inc */
5995 OpREFCNT_set(sop, 1);
569233ed 5996 LEAVE;
c277df42 5997
830247a4 5998 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5999 RExC_rxi->data->data[n] = (void*)rop;
6000 RExC_rxi->data->data[n+1] = (void*)sop;
6001 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6002 SvREFCNT_dec(sv);
a0ed51b3 6003 }
e24b16f9 6004 else { /* First pass */
830247a4 6005 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6006 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6007 /* No compiled RE interpolated, has runtime
6008 components ===> unsafe. */
6009 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6010 if (PL_tainting && PL_tainted)
cc6b7395 6011 FAIL("Eval-group in insecure regular expression");
54df2634 6012#if PERL_VERSION > 8
923e4eb5 6013 if (IN_PERL_COMPILETIME)
b5c19bd7 6014 PL_cv_has_eval = 1;
54df2634 6015#endif
c277df42 6016 }
b5c19bd7 6017
830247a4 6018 nextchar(pRExC_state);
6136c704 6019 if (is_logical) {
830247a4 6020 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6021 if (!SIZE_ONLY)
6022 ret->flags = 2;
3dab1dad 6023 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6024 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6025 return ret;
6026 }
ccb2c380
MP
6027 ret = reganode(pRExC_state, EVAL, n);
6028 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6029 Set_Node_Offset(ret, parse_start);
6030 return ret;
c277df42 6031 }
fac92740 6032 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6033 {
0a4db386 6034 int is_define= 0;
fac92740 6035 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6036 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6037 || RExC_parse[1] == '<'
830247a4 6038 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6039 I32 flag;
6040
830247a4 6041 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6042 if (!SIZE_ONLY)
6043 ret->flags = 1;
3dab1dad 6044 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6045 goto insert_if;
b81d288d 6046 }
a0ed51b3 6047 }
0a4db386
YO
6048 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6049 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6050 {
6051 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6052 char *name_start= RExC_parse++;
2eccd3b2 6053 U32 num = 0;
0a4db386
YO
6054 SV *sv_dat=reg_scan_name(pRExC_state,
6055 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6056 if (RExC_parse == name_start || *RExC_parse != ch)
6057 vFAIL2("Sequence (?(%c... not terminated",
6058 (ch == '>' ? '<' : ch));
6059 RExC_parse++;
6060 if (!SIZE_ONLY) {
6061 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6062 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6063 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6064 }
6065 ret = reganode(pRExC_state,NGROUPP,num);
6066 goto insert_if_check_paren;
6067 }
6068 else if (RExC_parse[0] == 'D' &&
6069 RExC_parse[1] == 'E' &&
6070 RExC_parse[2] == 'F' &&
6071 RExC_parse[3] == 'I' &&
6072 RExC_parse[4] == 'N' &&
6073 RExC_parse[5] == 'E')
6074 {
6075 ret = reganode(pRExC_state,DEFINEP,0);
6076 RExC_parse +=6 ;
6077 is_define = 1;
6078 goto insert_if_check_paren;
6079 }
6080 else if (RExC_parse[0] == 'R') {
6081 RExC_parse++;
6082 parno = 0;
6083 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6084 parno = atoi(RExC_parse++);
6085 while (isDIGIT(*RExC_parse))
6086 RExC_parse++;
6087 } else if (RExC_parse[0] == '&') {
6088 SV *sv_dat;
6089 RExC_parse++;
6090 sv_dat = reg_scan_name(pRExC_state,
6091 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6092 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6093 }
1a147d38 6094 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6095 goto insert_if_check_paren;
6096 }
830247a4 6097 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6098 /* (?(1)...) */
6136c704 6099 char c;
830247a4 6100 parno = atoi(RExC_parse++);
c277df42 6101
830247a4
IZ
6102 while (isDIGIT(*RExC_parse))
6103 RExC_parse++;
fac92740 6104 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6105
0a4db386 6106 insert_if_check_paren:
830247a4 6107 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6108 vFAIL("Switch condition not recognized");
c277df42 6109 insert_if:
3dab1dad
YO
6110 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6111 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6112 if (br == NULL)
830247a4 6113 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6114 else
3dab1dad 6115 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6116 c = *nextchar(pRExC_state);
d1b80229
IZ
6117 if (flags&HASWIDTH)
6118 *flagp |= HASWIDTH;
c277df42 6119 if (c == '|') {
0a4db386
YO
6120 if (is_define)
6121 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6122 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6123 regbranch(pRExC_state, &flags, 1,depth+1);
6124 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6125 if (flags&HASWIDTH)
6126 *flagp |= HASWIDTH;
830247a4 6127 c = *nextchar(pRExC_state);
a0ed51b3
LW
6128 }
6129 else
c277df42
IZ
6130 lastbr = NULL;
6131 if (c != ')')
8615cb43 6132 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6133 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6134 REGTAIL(pRExC_state, br, ender);
c277df42 6135 if (lastbr) {
3dab1dad
YO
6136 REGTAIL(pRExC_state, lastbr, ender);
6137 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6138 }
6139 else
3dab1dad 6140 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6141 RExC_size++; /* XXX WHY do we need this?!!
6142 For large programs it seems to be required
6143 but I can't figure out why. -- dmq*/
c277df42 6144 return ret;
a0ed51b3
LW
6145 }
6146 else {
830247a4 6147 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6148 }
6149 }
1b1626e4 6150 case 0:
830247a4 6151 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6152 vFAIL("Sequence (? incomplete");
1b1626e4 6153 break;
85508812
KW
6154 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6155 that follow */
fb85c044
KW
6156 has_use_defaults = TRUE;
6157 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9de15fec 6158 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
fb85c044 6159 goto parse_flags;
a0d0e21e 6160 default:
cde0cee5
YO
6161 --RExC_parse;
6162 parse_flags: /* (?i) */
6163 {
6164 U32 posflags = 0, negflags = 0;
6165 U32 *flagsp = &posflags;
9de15fec 6166 bool has_charset_modifier = 0;
cde0cee5
YO
6167
6168 while (*RExC_parse) {
6169 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6170 /* (?g), (?gc) and (?o) are useless here
6171 and must be globally applied -- japhy */
cde0cee5
YO
6172 switch (*RExC_parse) {
6173 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6174 case LOCALE_PAT_MOD:
6175 if (has_charset_modifier || flagsp == &negflags) {
6176 goto fail_modifiers;
6177 }
6178 *flagsp &= ~RXf_PMf_UNICODE;
6179 *flagsp |= RXf_PMf_LOCALE;
6180 has_charset_modifier = 1;
6181 break;
6182 case UNICODE_PAT_MOD:
6183 if (has_charset_modifier || flagsp == &negflags) {
6184 goto fail_modifiers;
6185 }
6186 *flagsp &= ~RXf_PMf_LOCALE;
6187 *flagsp |= RXf_PMf_UNICODE;
6188 has_charset_modifier = 1;
6189 break;
6190 case DUAL_PAT_MOD:
6191 if (has_use_defaults
6192 || has_charset_modifier
6193 || flagsp == &negflags)
6194 {
6195 goto fail_modifiers;
6196 }
6197 *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6198 has_charset_modifier = 1;
6199 break;
f7819f85
A
6200 case ONCE_PAT_MOD: /* 'o' */
6201 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6202 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6203 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6204 if (! (wastedflags & wflagbit) ) {
6205 wastedflags |= wflagbit;
6206 vWARN5(
6207 RExC_parse + 1,
6208 "Useless (%s%c) - %suse /%c modifier",
6209 flagsp == &negflags ? "?-" : "?",
6210 *RExC_parse,
6211 flagsp == &negflags ? "don't " : "",
6212 *RExC_parse
6213 );
6214 }
6215 }
cde0cee5
YO
6216 break;
6217
f7819f85 6218 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6219 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6220 if (! (wastedflags & WASTED_C) ) {
6221 wastedflags |= WASTED_GC;
9d1d55b5
JP
6222 vWARN3(
6223 RExC_parse + 1,
6224 "Useless (%sc) - %suse /gc modifier",
6225 flagsp == &negflags ? "?-" : "?",
6226 flagsp == &negflags ? "don't " : ""
6227 );
6228 }
6229 }
cde0cee5 6230 break;
f7819f85 6231 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6232 if (flagsp == &negflags) {
668c081a
NC
6233 if (SIZE_ONLY)
6234 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6235 } else {
6236 *flagsp |= RXf_PMf_KEEPCOPY;
6237 }
6238 break;
6239 case '-':
fb85c044
KW
6240 /* A flag is a default iff it is following a minus, so
6241 * if there is a minus, it means will be trying to
6242 * re-specify a default which is an error */
6243 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6244 fail_modifiers:
57b84237
YO
6245 RExC_parse++;
6246 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6247 /*NOTREACHED*/
6248 }
cde0cee5
YO
6249 flagsp = &negflags;
6250 wastedflags = 0; /* reset so (?g-c) warns twice */
6251 break;
6252 case ':':
6253 paren = ':';
6254 /*FALLTHROUGH*/
6255 case ')':
6256 RExC_flags |= posflags;
6257 RExC_flags &= ~negflags;
f7819f85
A
6258 if (paren != ':') {
6259 oregflags |= posflags;
6260 oregflags &= ~negflags;
6261 }
cde0cee5
YO
6262 nextchar(pRExC_state);
6263 if (paren != ':') {
6264 *flagp = TRYAGAIN;
6265 return NULL;
6266 } else {
6267 ret = NULL;
6268 goto parse_rest;
6269 }
6270 /*NOTREACHED*/
6271 default:
cde0cee5
YO
6272 RExC_parse++;
6273 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6274 /*NOTREACHED*/
6275 }
830247a4 6276 ++RExC_parse;
48c036b1 6277 }
cde0cee5 6278 }} /* one for the default block, one for the switch */
a0d0e21e 6279 }
fac92740 6280 else { /* (...) */
81714fb9 6281 capturing_parens:
830247a4
IZ
6282 parno = RExC_npar;
6283 RExC_npar++;
e2e6a0f1 6284
830247a4 6285 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6286 if (!SIZE_ONLY ){
6287 if (!RExC_nestroot)
6288 RExC_nestroot = parno;
c009da3d
YO
6289 if (RExC_seen & REG_SEEN_RECURSE
6290 && !RExC_open_parens[parno-1])
6291 {
e2e6a0f1 6292 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6293 "Setting open paren #%"IVdf" to %d\n",
6294 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6295 RExC_open_parens[parno-1]= ret;
6296 }
6bda09f9 6297 }
fac92740
MJD
6298 Set_Node_Length(ret, 1); /* MJD */
6299 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6300 is_open = 1;
a0d0e21e 6301 }
a0ed51b3 6302 }
fac92740 6303 else /* ! paren */
a0d0e21e 6304 ret = NULL;
cde0cee5
YO
6305
6306 parse_rest:
a0d0e21e 6307 /* Pick up the branches, linking them together. */
fac92740 6308 parse_start = RExC_parse; /* MJD */
3dab1dad 6309 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6310
6311 if (freeze_paren) {
6312 if (RExC_npar > after_freeze)
6313 after_freeze = RExC_npar;
6314 RExC_npar = freeze_paren;
6315 }
6316
fac92740 6317 /* branch_len = (paren != 0); */
2af232bd 6318
a0d0e21e
LW
6319 if (br == NULL)
6320 return(NULL);
830247a4
IZ
6321 if (*RExC_parse == '|') {
6322 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6323 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6324 }
fac92740 6325 else { /* MJD */
6bda09f9 6326 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6327 Set_Node_Length(br, paren != 0);
6328 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6329 }
c277df42
IZ
6330 have_branch = 1;
6331 if (SIZE_ONLY)
830247a4 6332 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6333 }
6334 else if (paren == ':') {
c277df42
IZ
6335 *flagp |= flags&SIMPLE;
6336 }
6136c704 6337 if (is_open) { /* Starts with OPEN. */
3dab1dad 6338 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6339 }
6340 else if (paren != '?') /* Not Conditional */
a0d0e21e 6341 ret = br;
8ae10a67 6342 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6343 lastbr = br;
830247a4
IZ
6344 while (*RExC_parse == '|') {
6345 if (!SIZE_ONLY && RExC_extralen) {
6346 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6347 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6348 }
6349 if (SIZE_ONLY)
830247a4
IZ
6350 RExC_extralen += 2; /* Account for LONGJMP. */
6351 nextchar(pRExC_state);
594d7033
YO
6352 if (freeze_paren) {
6353 if (RExC_npar > after_freeze)
6354 after_freeze = RExC_npar;
6355 RExC_npar = freeze_paren;
6356 }
3dab1dad 6357 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6358
a687059c 6359 if (br == NULL)
a0d0e21e 6360 return(NULL);
3dab1dad 6361 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6362 lastbr = br;
8ae10a67 6363 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6364 }
6365
c277df42
IZ
6366 if (have_branch || paren != ':') {
6367 /* Make a closing node, and hook it on the end. */
6368 switch (paren) {
6369 case ':':
830247a4 6370 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6371 break;
6372 case 1:
830247a4 6373 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6374 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6375 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6376 "Setting close paren #%"IVdf" to %d\n",
6377 (IV)parno, REG_NODE_NUM(ender)));
6378 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6379 if (RExC_nestroot == parno)
6380 RExC_nestroot = 0;
40d049e4 6381 }
fac92740
MJD
6382 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6383 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6384 break;
6385 case '<':
c277df42
IZ
6386 case ',':
6387 case '=':
6388 case '!':
c277df42 6389 *flagp &= ~HASWIDTH;
821b33a5
IZ
6390 /* FALL THROUGH */
6391 case '>':
830247a4 6392 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6393 break;
6394 case 0:
830247a4 6395 ender = reg_node(pRExC_state, END);
40d049e4
YO
6396 if (!SIZE_ONLY) {
6397 assert(!RExC_opend); /* there can only be one! */
6398 RExC_opend = ender;
6399 }
c277df42
IZ
6400 break;
6401 }
eaf3ca90 6402 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6403
9674d46a 6404 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6405 if (depth==1)
6406 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6407
c277df42 6408 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6409 for (br = ret; br; br = regnext(br)) {
6410 const U8 op = PL_regkind[OP(br)];
6411 if (op == BRANCH) {
07be1b83 6412 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6413 }
6414 else if (op == BRANCHJ) {
07be1b83 6415 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6416 }
c277df42
IZ
6417 }
6418 }
a0d0e21e 6419 }
c277df42
IZ
6420
6421 {
e1ec3a88
AL
6422 const char *p;
6423 static const char parens[] = "=!<,>";
c277df42
IZ
6424
6425 if (paren && (p = strchr(parens, paren))) {
eb160463 6426 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6427 int flag = (p - parens) > 1;
6428
6429 if (paren == '>')
6430 node = SUSPEND, flag = 0;
6bda09f9 6431 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6432 Set_Node_Cur_Length(ret);
6433 Set_Node_Offset(ret, parse_start + 1);
c277df42 6434 ret->flags = flag;
07be1b83 6435 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6436 }
a0d0e21e
LW
6437 }
6438
6439 /* Check for proper termination. */
ce3e6498 6440 if (paren) {
e2509266 6441 RExC_flags = oregflags;
830247a4
IZ
6442 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6443 RExC_parse = oregcomp_parse;
380a0633 6444 vFAIL("Unmatched (");
ce3e6498 6445 }
a0ed51b3 6446 }
830247a4
IZ
6447 else if (!paren && RExC_parse < RExC_end) {
6448 if (*RExC_parse == ')') {
6449 RExC_parse++;
380a0633 6450 vFAIL("Unmatched )");
a0ed51b3
LW
6451 }
6452 else
b45f050a 6453 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6454 /* NOTREACHED */
6455 }
594d7033
YO
6456 if (after_freeze)
6457 RExC_npar = after_freeze;
a0d0e21e 6458 return(ret);
a687059c
LW
6459}
6460
6461/*
6462 - regbranch - one alternative of an | operator
6463 *
6464 * Implements the concatenation operator.
6465 */
76e3520e 6466STATIC regnode *
3dab1dad 6467S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6468{
97aff369 6469 dVAR;
c277df42
IZ
6470 register regnode *ret;
6471 register regnode *chain = NULL;
6472 register regnode *latest;
6473 I32 flags = 0, c = 0;
3dab1dad 6474 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6475
6476 PERL_ARGS_ASSERT_REGBRANCH;
6477
3dab1dad 6478 DEBUG_PARSE("brnc");
02daf0ab 6479
b81d288d 6480 if (first)
c277df42
IZ
6481 ret = NULL;
6482 else {
b81d288d 6483 if (!SIZE_ONLY && RExC_extralen)
830247a4 6484 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6485 else {
830247a4 6486 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6487 Set_Node_Length(ret, 1);
6488 }
c277df42
IZ
6489 }
6490
b81d288d 6491 if (!first && SIZE_ONLY)
830247a4 6492 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6493
c277df42 6494 *flagp = WORST; /* Tentatively. */
a0d0e21e 6495
830247a4
IZ
6496 RExC_parse--;
6497 nextchar(pRExC_state);
6498 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6499 flags &= ~TRYAGAIN;
3dab1dad 6500 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6501 if (latest == NULL) {
6502 if (flags & TRYAGAIN)
6503 continue;
6504 return(NULL);
a0ed51b3
LW
6505 }
6506 else if (ret == NULL)
c277df42 6507 ret = latest;
8ae10a67 6508 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6509 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6510 *flagp |= flags&SPSTART;
6511 else {
830247a4 6512 RExC_naughty++;
3dab1dad 6513 REGTAIL(pRExC_state, chain, latest);
a687059c 6514 }
a0d0e21e 6515 chain = latest;
c277df42
IZ
6516 c++;
6517 }
6518 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6519 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6520 if (ret == NULL)
6521 ret = chain;
6522 }
6523 if (c == 1) {
6524 *flagp |= flags&SIMPLE;
a0d0e21e 6525 }
a687059c 6526
d4c19fe8 6527 return ret;
a687059c
LW
6528}
6529
6530/*
6531 - regpiece - something followed by possible [*+?]
6532 *
6533 * Note that the branching code sequences used for ? and the general cases
6534 * of * and + are somewhat optimized: they use the same NOTHING node as
6535 * both the endmarker for their branch list and the body of the last branch.
6536 * It might seem that this node could be dispensed with entirely, but the
6537 * endmarker role is not redundant.
6538 */
76e3520e 6539STATIC regnode *
3dab1dad 6540S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6541{
97aff369 6542 dVAR;
c277df42 6543 register regnode *ret;
a0d0e21e
LW
6544 register char op;
6545 register char *next;
6546 I32 flags;
1df70142 6547 const char * const origparse = RExC_parse;
a0d0e21e 6548 I32 min;
c277df42 6549 I32 max = REG_INFTY;
fac92740 6550 char *parse_start;
10edeb5d 6551 const char *maxpos = NULL;
3dab1dad 6552 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6553
6554 PERL_ARGS_ASSERT_REGPIECE;
6555
3dab1dad 6556 DEBUG_PARSE("piec");
a0d0e21e 6557
3dab1dad 6558 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6559 if (ret == NULL) {
6560 if (flags & TRYAGAIN)
6561 *flagp |= TRYAGAIN;
6562 return(NULL);
6563 }
6564
830247a4 6565 op = *RExC_parse;
a0d0e21e 6566
830247a4 6567 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6568 maxpos = NULL;
fac92740 6569 parse_start = RExC_parse; /* MJD */
830247a4 6570 next = RExC_parse + 1;
a0d0e21e
LW
6571 while (isDIGIT(*next) || *next == ',') {
6572 if (*next == ',') {
6573 if (maxpos)
6574 break;
6575 else
6576 maxpos = next;
a687059c 6577 }
a0d0e21e
LW
6578 next++;
6579 }
6580 if (*next == '}') { /* got one */
6581 if (!maxpos)
6582 maxpos = next;
830247a4
IZ
6583 RExC_parse++;
6584 min = atoi(RExC_parse);
a0d0e21e
LW
6585 if (*maxpos == ',')
6586 maxpos++;
6587 else
830247a4 6588 maxpos = RExC_parse;
a0d0e21e
LW
6589 max = atoi(maxpos);
6590 if (!max && *maxpos != '0')
c277df42
IZ
6591 max = REG_INFTY; /* meaning "infinity" */
6592 else if (max >= REG_INFTY)
8615cb43 6593 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6594 RExC_parse = next;
6595 nextchar(pRExC_state);
a0d0e21e
LW
6596
6597 do_curly:
6598 if ((flags&SIMPLE)) {
830247a4 6599 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6600 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6601 Set_Node_Offset(ret, parse_start+1); /* MJD */
6602 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6603 }
6604 else {
3dab1dad 6605 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6606
6607 w->flags = 0;
3dab1dad 6608 REGTAIL(pRExC_state, ret, w);
830247a4 6609 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6610 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6611 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6612 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6613 }
6bda09f9 6614 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6615 /* MJD hk */
6616 Set_Node_Offset(ret, parse_start+1);
2af232bd 6617 Set_Node_Length(ret,
fac92740 6618 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6619
830247a4 6620 if (!SIZE_ONLY && RExC_extralen)
c277df42 6621 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6622 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6623 if (SIZE_ONLY)
830247a4
IZ
6624 RExC_whilem_seen++, RExC_extralen += 3;
6625 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6626 }
c277df42 6627 ret->flags = 0;
a0d0e21e
LW
6628
6629 if (min > 0)
821b33a5
IZ
6630 *flagp = WORST;
6631 if (max > 0)
6632 *flagp |= HASWIDTH;
8fa23287 6633 if (max < min)
8615cb43 6634 vFAIL("Can't do {n,m} with n > m");
c277df42 6635 if (!SIZE_ONLY) {
eb160463
GS
6636 ARG1_SET(ret, (U16)min);
6637 ARG2_SET(ret, (U16)max);
a687059c 6638 }
a687059c 6639
a0d0e21e 6640 goto nest_check;
a687059c 6641 }
a0d0e21e 6642 }
a687059c 6643
a0d0e21e
LW
6644 if (!ISMULT1(op)) {
6645 *flagp = flags;
a687059c 6646 return(ret);
a0d0e21e 6647 }
bb20fd44 6648
c277df42 6649#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6650
6651 /* if this is reinstated, don't forget to put this back into perldiag:
6652
6653 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6654
6655 (F) The part of the regexp subject to either the * or + quantifier
6656 could match an empty string. The {#} shows in the regular
6657 expression about where the problem was discovered.
6658
6659 */
6660
bb20fd44 6661 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6662 vFAIL("Regexp *+ operand could be empty");
b81d288d 6663#endif
bb20fd44 6664
fac92740 6665 parse_start = RExC_parse;
830247a4 6666 nextchar(pRExC_state);
a0d0e21e 6667
821b33a5 6668 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6669
6670 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6671 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6672 ret->flags = 0;
830247a4 6673 RExC_naughty += 4;
a0d0e21e
LW
6674 }
6675 else if (op == '*') {
6676 min = 0;
6677 goto do_curly;
a0ed51b3
LW
6678 }
6679 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6680 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6681 ret->flags = 0;
830247a4 6682 RExC_naughty += 3;
a0d0e21e
LW
6683 }
6684 else if (op == '+') {
6685 min = 1;
6686 goto do_curly;
a0ed51b3
LW
6687 }
6688 else if (op == '?') {
a0d0e21e
LW
6689 min = 0; max = 1;
6690 goto do_curly;
6691 }
6692 nest_check:
668c081a
NC
6693 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6694 ckWARN3reg(RExC_parse,
6695 "%.*s matches null string many times",
6696 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6697 origparse);
a0d0e21e
LW
6698 }
6699
b9b4dddf 6700 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6701 nextchar(pRExC_state);
6bda09f9 6702 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6703 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6704 }
b9b4dddf
YO
6705#ifndef REG_ALLOW_MINMOD_SUSPEND
6706 else
6707#endif
6708 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6709 regnode *ender;
6710 nextchar(pRExC_state);
6711 ender = reg_node(pRExC_state, SUCCEED);
6712 REGTAIL(pRExC_state, ret, ender);
6713 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6714 ret->flags = 0;
6715 ender = reg_node(pRExC_state, TAIL);
6716 REGTAIL(pRExC_state, ret, ender);
6717 /*ret= ender;*/
6718 }
6719
6720 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6721 RExC_parse++;
b45f050a
JF
6722 vFAIL("Nested quantifiers");
6723 }
a0d0e21e
LW
6724
6725 return(ret);
a687059c
LW
6726}
6727
fc8cd66c
YO
6728
6729/* reg_namedseq(pRExC_state,UVp)
6730
6731 This is expected to be called by a parser routine that has
afefe6bf 6732 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6733 expected to point at the first char following the N at the time
6734 of the call.
ff3f963a
KW
6735
6736 The \N may be inside (indicated by valuep not being NULL) or outside a
6737 character class.
6738
6739 \N may begin either a named sequence, or if outside a character class, mean
6740 to match a non-newline. For non single-quoted regexes, the tokenizer has
6741 attempted to decide which, and in the case of a named sequence converted it
6742 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6743 where c1... are the characters in the sequence. For single-quoted regexes,
6744 the tokenizer passes the \N sequence through unchanged; this code will not
6745 attempt to determine this nor expand those. The net effect is that if the
6746 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6747 signals that this \N occurrence means to match a non-newline.
6748
6749 Only the \N{U+...} form should occur in a character class, for the same
6750 reason that '.' inside a character class means to just match a period: it
6751 just doesn't make sense.
fc8cd66c
YO
6752
6753 If valuep is non-null then it is assumed that we are parsing inside
6754 of a charclass definition and the first codepoint in the resolved
6755 string is returned via *valuep and the routine will return NULL.
6756 In this mode if a multichar string is returned from the charnames
ff3f963a 6757 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6758 sequence will be examined. If the string returned is zero length
6759 then the value of *valuep is undefined and NON-NULL will
6760 be returned to indicate failure. (This will NOT be a valid pointer
6761 to a regnode.)
6762
ff3f963a
KW
6763 If valuep is null then it is assumed that we are parsing normal text and a
6764 new EXACT node is inserted into the program containing the resolved string,
6765 and a pointer to the new node is returned. But if the string is zero length
6766 a NOTHING node is emitted instead.
afefe6bf 6767
fc8cd66c 6768 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6769 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6770 */
6771STATIC regnode *
afefe6bf 6772S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6773{
c3c41406 6774 char * endbrace; /* '}' following the name */
fc8cd66c 6775 regnode *ret = NULL;
ff3f963a
KW
6776#ifdef DEBUGGING
6777 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6778#endif
c3c41406 6779 char* p;
ff3f963a
KW
6780
6781 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6782
6783 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6784
6785 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6786
6787 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6788 * modifier. The other meaning does not */
6789 p = (RExC_flags & RXf_PMf_EXTENDED)
6790 ? regwhite( pRExC_state, RExC_parse )
6791 : RExC_parse;
7918f24d 6792
ff3f963a 6793 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6794 * [^\n]. The former is assumed when it can't be the latter. */
6795 if (*p != '{' || regcurly(p)) {
6796 RExC_parse = p;
ff3f963a 6797 if (valuep) {
afefe6bf 6798 /* no bare \N in a charclass */
ff3f963a
KW
6799 vFAIL("\\N in a character class must be a named character: \\N{...}");
6800 }
afefe6bf
RGS
6801 nextchar(pRExC_state);
6802 ret = reg_node(pRExC_state, REG_ANY);
6803 *flagp |= HASWIDTH|SIMPLE;
6804 RExC_naughty++;
6805 RExC_parse--;
6806 Set_Node_Length(ret, 1); /* MJD */
6807 return ret;
fc8cd66c 6808 }
a4893424 6809
c3c41406
KW
6810 /* Here, we have decided it should be a named sequence */
6811
6812 /* The test above made sure that the next real character is a '{', but
6813 * under the /x modifier, it could be separated by space (or a comment and
6814 * \n) and this is not allowed (for consistency with \x{...} and the
6815 * tokenizer handling of \N{NAME}). */
6816 if (*RExC_parse != '{') {
6817 vFAIL("Missing braces on \\N{}");
6818 }
6819
ff3f963a 6820 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6821
6822 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6823 || ! (endbrace == RExC_parse /* nothing between the {} */
6824 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6825 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6826 {
6827 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6828 vFAIL("\\N{NAME} must be resolved by the lexer");
6829 }
6830
ff3f963a
KW
6831 if (endbrace == RExC_parse) { /* empty: \N{} */
6832 if (! valuep) {
6833 RExC_parse = endbrace + 1;
6834 return reg_node(pRExC_state,NOTHING);
a4893424 6835 }
fc8cd66c 6836
ff3f963a
KW
6837 if (SIZE_ONLY) {
6838 ckWARNreg(RExC_parse,
6839 "Ignoring zero length \\N{} in character class"
6840 );
6841 RExC_parse = endbrace + 1;
6842 }
6843 *valuep = 0;
6844 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6845 }
ff3f963a 6846
62fed28b 6847 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6848 RExC_parse += 2; /* Skip past the 'U+' */
6849
6850 if (valuep) { /* In a bracketed char class */
6851 /* We only pay attention to the first char of
6852 multichar strings being returned. I kinda wonder
6853 if this makes sense as it does change the behaviour
6854 from earlier versions, OTOH that behaviour was broken
6855 as well. XXX Solution is to recharacterize as
6856 [rest-of-class]|multi1|multi2... */
6857
6858 STRLEN length_of_hex;
6859 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6860 | PERL_SCAN_DISALLOW_PREFIX
6861 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6862
37820adc
KW
6863 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6864 if (endchar < endbrace) {
ff3f963a
KW
6865 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6866 }
ff3f963a
KW
6867
6868 length_of_hex = (STRLEN)(endchar - RExC_parse);
6869 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6870
6871 /* The tokenizer should have guaranteed validity, but it's possible to
6872 * bypass it by using single quoting, so check */
c3c41406
KW
6873 if (length_of_hex == 0
6874 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6875 {
6876 RExC_parse += length_of_hex; /* Includes all the valid */
6877 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6878 ? UTF8SKIP(RExC_parse)
6879 : 1;
6880 /* Guard against malformed utf8 */
6881 if (RExC_parse >= endchar) RExC_parse = endchar;
6882 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6883 }
6884
6885 RExC_parse = endbrace + 1;
6886 if (endchar == endbrace) return NULL;
6887
6888 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6889 }
ff3f963a
KW
6890 else { /* Not a char class */
6891 char *s; /* String to put in generated EXACT node */
fda99bee 6892 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
6893 char *endchar; /* Points to '.' or '}' ending cur char in the input
6894 stream */
6895
6896 ret = reg_node(pRExC_state,
6897 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6898 s= STRING(ret);
6899
6900 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6901 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 6902 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
6903 * limit are dropped. The limit could be relaxed should it become
6904 * desirable by reparsing this as (?:\N{NAME}), so could generate
6905 * multiple EXACT nodes, as is done for just regular input. But this
6906 * is primarily a named character, and not intended to be a huge long
6907 * string, so 255 bytes should be good enough */
6908 while (1) {
c3c41406 6909 STRLEN length_of_hex;
ff3f963a
KW
6910 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6911 | PERL_SCAN_DISALLOW_PREFIX
6912 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6913 UV cp; /* Ord of current character */
6914
6915 /* Code points are separated by dots. If none, there is only one
6916 * code point, and is terminated by the brace */
37820adc 6917 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
6918
6919 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
6920 length_of_hex = (STRLEN)(endchar - RExC_parse);
6921 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6922 if ( length_of_hex == 0
6923 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 6924 {
c3c41406
KW
6925 RExC_parse += length_of_hex; /* Includes all the valid */
6926 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6927 ? UTF8SKIP(RExC_parse)
6928 : 1;
6929 /* Guard against malformed utf8 */
6930 if (RExC_parse >= endchar) RExC_parse = endchar;
6931 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6932 }
6933
6934 if (! FOLD) { /* Not folding, just append to the string */
6935 STRLEN unilen;
6936
6937 /* Quit before adding this character if would exceed limit */
6938 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 6939
ff3f963a
KW
6940 unilen = reguni(pRExC_state, cp, s);
6941 if (unilen > 0) {
6942 s += unilen;
6943 len += unilen;
6944 }
6945 } else { /* Folding, output the folded equivalent */
6946 STRLEN foldlen,numlen;
6947 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6948 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6949
6950 /* Quit before exceeding size limit */
6951 if (len + foldlen > U8_MAX) break;
6952
6953 for (foldbuf = tmpbuf;
6954 foldlen;
6955 foldlen -= numlen)
6956 {
6957 cp = utf8_to_uvchr(foldbuf, &numlen);
6958 if (numlen > 0) {
6959 const STRLEN unilen = reguni(pRExC_state, cp, s);
6960 s += unilen;
6961 len += unilen;
6962 /* In EBCDIC the numlen and unilen can differ. */
6963 foldbuf += numlen;
6964 if (numlen >= foldlen)
6965 break;
6966 }
6967 else
6968 break; /* "Can't happen." */
6969 }
6970 }
6971
6972 /* Point to the beginning of the next character in the sequence. */
6973 RExC_parse = endchar + 1;
6974
6975 /* Quit if no more characters */
6976 if (RExC_parse >= endbrace) break;
6977 }
6978
6979
6980 if (SIZE_ONLY) {
6981 if (RExC_parse < endbrace) {
6982 ckWARNreg(RExC_parse - 1,
6983 "Using just the first characters returned by \\N{}");
6984 }
6985
6986 RExC_size += STR_SZ(len);
6987 } else {
6988 STR_LEN(ret) = len;
6989 RExC_emit += STR_SZ(len);
6990 }
6991
6992 RExC_parse = endbrace + 1;
6993
6994 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6995 with malformed in t/re/pat_advanced.t */
6996 RExC_parse --;
6997 Set_Node_Cur_Length(ret); /* MJD */
6998 nextchar(pRExC_state);
6999 }
7000
7001 return ret;
fc8cd66c
YO
7002}
7003
7004
9e08bc66
TS
7005/*
7006 * reg_recode
7007 *
7008 * It returns the code point in utf8 for the value in *encp.
7009 * value: a code value in the source encoding
7010 * encp: a pointer to an Encode object
7011 *
7012 * If the result from Encode is not a single character,
7013 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7014 */
7015STATIC UV
7016S_reg_recode(pTHX_ const char value, SV **encp)
7017{
7018 STRLEN numlen = 1;
59cd0e26 7019 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7020 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7021 const STRLEN newlen = SvCUR(sv);
7022 UV uv = UNICODE_REPLACEMENT;
7023
7918f24d
NC
7024 PERL_ARGS_ASSERT_REG_RECODE;
7025
9e08bc66
TS
7026 if (newlen)
7027 uv = SvUTF8(sv)
7028 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7029 : *(U8*)s;
7030
7031 if (!newlen || numlen != newlen) {
7032 uv = UNICODE_REPLACEMENT;
c86f7df5 7033 *encp = NULL;
9e08bc66
TS
7034 }
7035 return uv;
7036}
7037
fc8cd66c 7038
a687059c
LW
7039/*
7040 - regatom - the lowest level
ee9b8eae
YO
7041
7042 Try to identify anything special at the start of the pattern. If there
7043 is, then handle it as required. This may involve generating a single regop,
7044 such as for an assertion; or it may involve recursing, such as to
7045 handle a () structure.
7046
7047 If the string doesn't start with something special then we gobble up
7048 as much literal text as we can.
7049
7050 Once we have been able to handle whatever type of thing started the
7051 sequence, we return.
7052
7053 Note: we have to be careful with escapes, as they can be both literal
7054 and special, and in the case of \10 and friends can either, depending
7055 on context. Specifically there are two seperate switches for handling
7056 escape sequences, with the one for handling literal escapes requiring
7057 a dummy entry for all of the special escapes that are actually handled
7058 by the other.
7059*/
7060
76e3520e 7061STATIC regnode *
3dab1dad 7062S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7063{
97aff369 7064 dVAR;
cbbf8932 7065 register regnode *ret = NULL;
a0d0e21e 7066 I32 flags;
45948336 7067 char *parse_start = RExC_parse;
3dab1dad
YO
7068 GET_RE_DEBUG_FLAGS_DECL;
7069 DEBUG_PARSE("atom");
a0d0e21e
LW
7070 *flagp = WORST; /* Tentatively. */
7071
7918f24d 7072 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7073
a0d0e21e 7074tryagain:
f9a79580 7075 switch ((U8)*RExC_parse) {
a0d0e21e 7076 case '^':
830247a4
IZ
7077 RExC_seen_zerolen++;
7078 nextchar(pRExC_state);
bbe252da 7079 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7080 ret = reg_node(pRExC_state, MBOL);
bbe252da 7081 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7082 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7083 else
830247a4 7084 ret = reg_node(pRExC_state, BOL);
fac92740 7085 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7086 break;
7087 case '$':
830247a4 7088 nextchar(pRExC_state);
b81d288d 7089 if (*RExC_parse)
830247a4 7090 RExC_seen_zerolen++;
bbe252da 7091 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7092 ret = reg_node(pRExC_state, MEOL);
bbe252da 7093 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7094 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7095 else
830247a4 7096 ret = reg_node(pRExC_state, EOL);
fac92740 7097 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7098 break;
7099 case '.':
830247a4 7100 nextchar(pRExC_state);
bbe252da 7101 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7102 ret = reg_node(pRExC_state, SANY);
7103 else
7104 ret = reg_node(pRExC_state, REG_ANY);
7105 *flagp |= HASWIDTH|SIMPLE;
830247a4 7106 RExC_naughty++;
fac92740 7107 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7108 break;
7109 case '[':
b45f050a 7110 {
3dab1dad
YO
7111 char * const oregcomp_parse = ++RExC_parse;
7112 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7113 if (*RExC_parse != ']') {
7114 RExC_parse = oregcomp_parse;
b45f050a
JF
7115 vFAIL("Unmatched [");
7116 }
830247a4 7117 nextchar(pRExC_state);
a0d0e21e 7118 *flagp |= HASWIDTH|SIMPLE;
fac92740 7119 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7120 break;
b45f050a 7121 }
a0d0e21e 7122 case '(':
830247a4 7123 nextchar(pRExC_state);
3dab1dad 7124 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7125 if (ret == NULL) {
bf93d4cc 7126 if (flags & TRYAGAIN) {
830247a4 7127 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7128 /* Make parent create an empty node if needed. */
7129 *flagp |= TRYAGAIN;
7130 return(NULL);
7131 }
a0d0e21e 7132 goto tryagain;
bf93d4cc 7133 }
a0d0e21e
LW
7134 return(NULL);
7135 }
a3b492c3 7136 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7137 break;
7138 case '|':
7139 case ')':
7140 if (flags & TRYAGAIN) {
7141 *flagp |= TRYAGAIN;
7142 return NULL;
7143 }
b45f050a 7144 vFAIL("Internal urp");
a0d0e21e
LW
7145 /* Supposed to be caught earlier. */
7146 break;
85afd4ae 7147 case '{':
830247a4
IZ
7148 if (!regcurly(RExC_parse)) {
7149 RExC_parse++;
85afd4ae
CS
7150 goto defchar;
7151 }
7152 /* FALL THROUGH */
a0d0e21e
LW
7153 case '?':
7154 case '+':
7155 case '*':
830247a4 7156 RExC_parse++;
b45f050a 7157 vFAIL("Quantifier follows nothing");
a0d0e21e 7158 break;
f9a79580
RGS
7159 case 0xDF:
7160 case 0xC3:
7161 case 0xCE:
a0a388a1 7162 do_foldchar:
56d400ed 7163 if (!LOC && FOLD) {
e64b1bd1 7164 U32 len,cp;
7cf3a6a3 7165 len=0; /* silence a spurious compiler warning */
56d400ed 7166 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7167 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7168 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7169 ret = reganode(pRExC_state, FOLDCHAR, cp);
7170 Set_Node_Length(ret, 1); /* MJD */
7171 nextchar(pRExC_state); /* kill whitespace under /x */
7172 return ret;
7173 }
7174 }
7175 goto outer_default;
a0d0e21e 7176 case '\\':
ee9b8eae
YO
7177 /* Special Escapes
7178
7179 This switch handles escape sequences that resolve to some kind
7180 of special regop and not to literal text. Escape sequnces that
7181 resolve to literal text are handled below in the switch marked
7182 "Literal Escapes".
7183
7184 Every entry in this switch *must* have a corresponding entry
7185 in the literal escape switch. However, the opposite is not
7186 required, as the default for this switch is to jump to the
7187 literal text handling code.
7188 */
a0a388a1
YO
7189 switch ((U8)*++RExC_parse) {
7190 case 0xDF:
7191 case 0xC3:
7192 case 0xCE:
7193 goto do_foldchar;
ee9b8eae 7194 /* Special Escapes */
a0d0e21e 7195 case 'A':
830247a4
IZ
7196 RExC_seen_zerolen++;
7197 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7198 *flagp |= SIMPLE;
ee9b8eae 7199 goto finish_meta_pat;
a0d0e21e 7200 case 'G':
830247a4
IZ
7201 ret = reg_node(pRExC_state, GPOS);
7202 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7203 *flagp |= SIMPLE;
ee9b8eae
YO
7204 goto finish_meta_pat;
7205 case 'K':
7206 RExC_seen_zerolen++;
7207 ret = reg_node(pRExC_state, KEEPS);
7208 *flagp |= SIMPLE;
37923168
RGS
7209 /* XXX:dmq : disabling in-place substitution seems to
7210 * be necessary here to avoid cases of memory corruption, as
7211 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7212 */
7213 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7214 goto finish_meta_pat;
a0d0e21e 7215 case 'Z':
830247a4 7216 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7217 *flagp |= SIMPLE;
a1917ab9 7218 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7219 goto finish_meta_pat;
b85d18e9 7220 case 'z':
830247a4 7221 ret = reg_node(pRExC_state, EOS);
b85d18e9 7222 *flagp |= SIMPLE;
830247a4 7223 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7224 goto finish_meta_pat;
4a2d328f 7225 case 'C':
f33976b4
DB
7226 ret = reg_node(pRExC_state, CANY);
7227 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7228 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7229 goto finish_meta_pat;
a0ed51b3 7230 case 'X':
830247a4 7231 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7232 *flagp |= HASWIDTH;
ee9b8eae 7233 goto finish_meta_pat;
a0d0e21e 7234 case 'w':
eb160463 7235 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7236 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7237 goto finish_meta_pat;
a0d0e21e 7238 case 'W':
eb160463 7239 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7240 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7241 goto finish_meta_pat;
a0d0e21e 7242 case 'b':
830247a4
IZ
7243 RExC_seen_zerolen++;
7244 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7245 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7246 *flagp |= SIMPLE;
ee9b8eae 7247 goto finish_meta_pat;
a0d0e21e 7248 case 'B':
830247a4
IZ
7249 RExC_seen_zerolen++;
7250 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7251 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7252 *flagp |= SIMPLE;
ee9b8eae 7253 goto finish_meta_pat;
a0d0e21e 7254 case 's':
eb160463 7255 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7256 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7257 goto finish_meta_pat;
a0d0e21e 7258 case 'S':
eb160463 7259 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7260 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7261 goto finish_meta_pat;
a0d0e21e 7262 case 'd':
ffc61ed2 7263 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7264 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7265 goto finish_meta_pat;
a0d0e21e 7266 case 'D':
ffc61ed2 7267 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7268 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7269 goto finish_meta_pat;
e1d1eefb
YO
7270 case 'R':
7271 ret = reg_node(pRExC_state, LNBREAK);
7272 *flagp |= HASWIDTH|SIMPLE;
7273 goto finish_meta_pat;
7274 case 'h':
7275 ret = reg_node(pRExC_state, HORIZWS);
7276 *flagp |= HASWIDTH|SIMPLE;
7277 goto finish_meta_pat;
7278 case 'H':
7279 ret = reg_node(pRExC_state, NHORIZWS);
7280 *flagp |= HASWIDTH|SIMPLE;
7281 goto finish_meta_pat;
ee9b8eae 7282 case 'v':
e1d1eefb
YO
7283 ret = reg_node(pRExC_state, VERTWS);
7284 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7285 goto finish_meta_pat;
7286 case 'V':
e1d1eefb
YO
7287 ret = reg_node(pRExC_state, NVERTWS);
7288 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7289 finish_meta_pat:
830247a4 7290 nextchar(pRExC_state);
fac92740 7291 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7292 break;
a14b48bc
LW
7293 case 'p':
7294 case 'P':
3568d838 7295 {
3dab1dad 7296 char* const oldregxend = RExC_end;
d008bc60 7297#ifdef DEBUGGING
ccb2c380 7298 char* parse_start = RExC_parse - 2;
d008bc60 7299#endif
a14b48bc 7300
830247a4 7301 if (RExC_parse[1] == '{') {
3568d838 7302 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7303 RExC_end = strchr(RExC_parse, '}');
7304 if (!RExC_end) {
3dab1dad 7305 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7306 RExC_parse += 2;
7307 RExC_end = oldregxend;
0da60cf5 7308 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7309 }
830247a4 7310 RExC_end++;
a14b48bc 7311 }
af6f566e 7312 else {
830247a4 7313 RExC_end = RExC_parse + 2;
af6f566e
HS
7314 if (RExC_end > oldregxend)
7315 RExC_end = oldregxend;
7316 }
830247a4 7317 RExC_parse--;
a14b48bc 7318
3dab1dad 7319 ret = regclass(pRExC_state,depth+1);
a14b48bc 7320
830247a4
IZ
7321 RExC_end = oldregxend;
7322 RExC_parse--;
ccb2c380
MP
7323
7324 Set_Node_Offset(ret, parse_start + 2);
7325 Set_Node_Cur_Length(ret);
830247a4 7326 nextchar(pRExC_state);
a14b48bc
LW
7327 *flagp |= HASWIDTH|SIMPLE;
7328 }
7329 break;
fc8cd66c 7330 case 'N':
afefe6bf 7331 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7332 multicharacter. join_exact() will join them up later on.
7333 Also this makes sure that things like /\N{BLAH}+/ and
7334 \N{BLAH} being multi char Just Happen. dmq*/
7335 ++RExC_parse;
afefe6bf 7336 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7337 break;
0a4db386 7338 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7339 parse_named_seq:
81714fb9
YO
7340 {
7341 char ch= RExC_parse[1];
1f1031fe
YO
7342 if (ch != '<' && ch != '\'' && ch != '{') {
7343 RExC_parse++;
7344 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7345 } else {
1f1031fe
YO
7346 /* this pretty much dupes the code for (?P=...) in reg(), if
7347 you change this make sure you change that */
81714fb9 7348 char* name_start = (RExC_parse += 2);
2eccd3b2 7349 U32 num = 0;
0a4db386
YO
7350 SV *sv_dat = reg_scan_name(pRExC_state,
7351 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7352 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7353 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7354 vFAIL2("Sequence %.3s... not terminated",parse_start);
7355
7356 if (!SIZE_ONLY) {
7357 num = add_data( pRExC_state, 1, "S" );
7358 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7359 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7360 }
7361
81714fb9
YO
7362 RExC_sawback = 1;
7363 ret = reganode(pRExC_state,
7364 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7365 num);
7366 *flagp |= HASWIDTH;
1f1031fe 7367
81714fb9
YO
7368 /* override incorrect value set in reganode MJD */
7369 Set_Node_Offset(ret, parse_start+1);
7370 Set_Node_Cur_Length(ret); /* MJD */
7371 nextchar(pRExC_state);
1f1031fe 7372
81714fb9
YO
7373 }
7374 break;
1f1031fe 7375 }
2bf803e2 7376 case 'g':
a0d0e21e
LW
7377 case '1': case '2': case '3': case '4':
7378 case '5': case '6': case '7': case '8': case '9':
7379 {
c74340f9 7380 I32 num;
2bf803e2
YO
7381 bool isg = *RExC_parse == 'g';
7382 bool isrel = 0;
7383 bool hasbrace = 0;
7384 if (isg) {
c74340f9 7385 RExC_parse++;
2bf803e2
YO
7386 if (*RExC_parse == '{') {
7387 RExC_parse++;
7388 hasbrace = 1;
7389 }
7390 if (*RExC_parse == '-') {
7391 RExC_parse++;
7392 isrel = 1;
7393 }
1f1031fe
YO
7394 if (hasbrace && !isDIGIT(*RExC_parse)) {
7395 if (isrel) RExC_parse--;
7396 RExC_parse -= 2;
7397 goto parse_named_seq;
7398 } }
c74340f9 7399 num = atoi(RExC_parse);
b72d83b2
RGS
7400 if (isg && num == 0)
7401 vFAIL("Reference to invalid group 0");
c74340f9 7402 if (isrel) {
5624f11d 7403 num = RExC_npar - num;
c74340f9
YO
7404 if (num < 1)
7405 vFAIL("Reference to nonexistent or unclosed group");
7406 }
2bf803e2 7407 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7408 goto defchar;
7409 else {
3dab1dad 7410 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7411 while (isDIGIT(*RExC_parse))
7412 RExC_parse++;
1f1031fe
YO
7413 if (parse_start == RExC_parse - 1)
7414 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7415 if (hasbrace) {
7416 if (*RExC_parse != '}')
7417 vFAIL("Unterminated \\g{...} pattern");
7418 RExC_parse++;
7419 }
c74340f9
YO
7420 if (!SIZE_ONLY) {
7421 if (num > (I32)RExC_rx->nparens)
7422 vFAIL("Reference to nonexistent group");
c74340f9 7423 }
830247a4 7424 RExC_sawback = 1;
eb160463
GS
7425 ret = reganode(pRExC_state,
7426 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7427 num);
a0d0e21e 7428 *flagp |= HASWIDTH;
2af232bd 7429
fac92740 7430 /* override incorrect value set in reganode MJD */
2af232bd 7431 Set_Node_Offset(ret, parse_start+1);
fac92740 7432 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7433 RExC_parse--;
7434 nextchar(pRExC_state);
a0d0e21e
LW
7435 }
7436 }
7437 break;
7438 case '\0':
830247a4 7439 if (RExC_parse >= RExC_end)
b45f050a 7440 FAIL("Trailing \\");
a0d0e21e
LW
7441 /* FALL THROUGH */
7442 default:
a0288114 7443 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7444 back into the quick-grab loop below */
45948336 7445 parse_start--;
a0d0e21e
LW
7446 goto defchar;
7447 }
7448 break;
4633a7c4
LW
7449
7450 case '#':
bbe252da 7451 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7452 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7453 goto tryagain;
7454 }
7455 /* FALL THROUGH */
7456
f9a79580
RGS
7457 default:
7458 outer_default:{
ba210ebe 7459 register STRLEN len;
58ae7d3f 7460 register UV ender;
a0d0e21e 7461 register char *p;
3dab1dad 7462 char *s;
80aecb99 7463 STRLEN foldlen;
89ebb4a3 7464 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7465
7466 parse_start = RExC_parse - 1;
a0d0e21e 7467
830247a4 7468 RExC_parse++;
a0d0e21e
LW
7469
7470 defchar:
58ae7d3f 7471 ender = 0;
eb160463
GS
7472 ret = reg_node(pRExC_state,
7473 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7474 s = STRING(ret);
830247a4
IZ
7475 for (len = 0, p = RExC_parse - 1;
7476 len < 127 && p < RExC_end;
a0d0e21e
LW
7477 len++)
7478 {
3dab1dad 7479 char * const oldp = p;
5b5a24f7 7480
bbe252da 7481 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7482 p = regwhite( pRExC_state, p );
f9a79580
RGS
7483 switch ((U8)*p) {
7484 case 0xDF:
7485 case 0xC3:
7486 case 0xCE:
56d400ed 7487 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7488 goto normal_default;
a0d0e21e
LW
7489 case '^':
7490 case '$':
7491 case '.':
7492 case '[':
7493 case '(':
7494 case ')':
7495 case '|':
7496 goto loopdone;
7497 case '\\':
ee9b8eae
YO
7498 /* Literal Escapes Switch
7499
7500 This switch is meant to handle escape sequences that
7501 resolve to a literal character.
7502
7503 Every escape sequence that represents something
7504 else, like an assertion or a char class, is handled
7505 in the switch marked 'Special Escapes' above in this
7506 routine, but also has an entry here as anything that
7507 isn't explicitly mentioned here will be treated as
7508 an unescaped equivalent literal.
7509 */
7510
a0a388a1 7511 switch ((U8)*++p) {
ee9b8eae 7512 /* These are all the special escapes. */
a0a388a1
YO
7513 case 0xDF:
7514 case 0xC3:
7515 case 0xCE:
7516 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7517 goto normal_default;
ee9b8eae
YO
7518 case 'A': /* Start assertion */
7519 case 'b': case 'B': /* Word-boundary assertion*/
7520 case 'C': /* Single char !DANGEROUS! */
7521 case 'd': case 'D': /* digit class */
7522 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7523 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7524 case 'k': case 'K': /* named backref, keep marker */
7525 case 'N': /* named char sequence */
38a44b82 7526 case 'p': case 'P': /* Unicode property */
e1d1eefb 7527 case 'R': /* LNBREAK */
ee9b8eae 7528 case 's': case 'S': /* space class */
e1d1eefb 7529 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7530 case 'w': case 'W': /* word class */
7531 case 'X': /* eXtended Unicode "combining character sequence" */
7532 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7533 --p;
7534 goto loopdone;
ee9b8eae
YO
7535
7536 /* Anything after here is an escape that resolves to a
7537 literal. (Except digits, which may or may not)
7538 */
a0d0e21e
LW
7539 case 'n':
7540 ender = '\n';
7541 p++;
a687059c 7542 break;
a0d0e21e
LW
7543 case 'r':
7544 ender = '\r';
7545 p++;
a687059c 7546 break;
a0d0e21e
LW
7547 case 't':
7548 ender = '\t';
7549 p++;
a687059c 7550 break;
a0d0e21e
LW
7551 case 'f':
7552 ender = '\f';
7553 p++;
a687059c 7554 break;
a0d0e21e 7555 case 'e':
c7f1f016 7556 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7557 p++;
a687059c 7558 break;
a0d0e21e 7559 case 'a':
c7f1f016 7560 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7561 p++;
a687059c 7562 break;
f0a2b745
KW
7563 case 'o':
7564 {
7565 STRLEN brace_len = len;
00c0cb6d 7566 UV result;
454155d9
KW
7567 const char* error_msg;
7568
7569 bool valid = grok_bslash_o(p,
7570 &result,
7571 &brace_len,
7572 &error_msg,
7573 1);
7574 p += brace_len;
7575 if (! valid) {
7576 RExC_parse = p; /* going to die anyway; point
7577 to exact spot of failure */
f0a2b745
KW
7578 vFAIL(error_msg);
7579 }
00c0cb6d
DG
7580 else
7581 {
7582 ender = result;
7583 }
f0a2b745
KW
7584 if (PL_encoding && ender < 0x100) {
7585 goto recode_encoding;
7586 }
7587 if (ender > 0xff) {
62fed28b 7588 REQUIRE_UTF8;
f0a2b745
KW
7589 }
7590 break;
7591 }
a0d0e21e 7592 case 'x':
a0ed51b3 7593 if (*++p == '{') {
1df70142 7594 char* const e = strchr(p, '}');
b81d288d 7595
b45f050a 7596 if (!e) {
830247a4 7597 RExC_parse = p + 1;
b45f050a
JF
7598 vFAIL("Missing right brace on \\x{}");
7599 }
de5f0749 7600 else {
a4c04bdc
NC
7601 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7602 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7603 STRLEN numlen = e - p - 1;
53305cf1 7604 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7605 if (ender > 0xff)
62fed28b 7606 REQUIRE_UTF8;
a0ed51b3
LW
7607 p = e + 1;
7608 }
a0ed51b3
LW
7609 }
7610 else {
a4c04bdc 7611 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7612 STRLEN numlen = 2;
53305cf1 7613 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7614 p += numlen;
7615 }
9e08bc66
TS
7616 if (PL_encoding && ender < 0x100)
7617 goto recode_encoding;
a687059c 7618 break;
a0d0e21e
LW
7619 case 'c':
7620 p++;
f9d13529 7621 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7622 break;
a0d0e21e
LW
7623 case '0': case '1': case '2': case '3':case '4':
7624 case '5': case '6': case '7': case '8':case '9':
7625 if (*p == '0' ||
ca67da41 7626 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7627 {
7628 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7629 STRLEN numlen = 3;
53305cf1 7630 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7631 if (ender > 0xff) {
62fed28b 7632 REQUIRE_UTF8;
609122bd 7633 }
a0d0e21e
LW
7634 p += numlen;
7635 }
7636 else {
7637 --p;
7638 goto loopdone;
a687059c 7639 }
9e08bc66
TS
7640 if (PL_encoding && ender < 0x100)
7641 goto recode_encoding;
7642 break;
7643 recode_encoding:
7644 {
7645 SV* enc = PL_encoding;
7646 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7647 if (!enc && SIZE_ONLY)
7648 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7649 REQUIRE_UTF8;
9e08bc66 7650 }
a687059c 7651 break;
a0d0e21e 7652 case '\0':
830247a4 7653 if (p >= RExC_end)
b45f050a 7654 FAIL("Trailing \\");
a687059c 7655 /* FALL THROUGH */
a0d0e21e 7656 default:
668c081a
NC
7657 if (!SIZE_ONLY&& isALPHA(*p))
7658 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7659 goto normal_default;
a0d0e21e
LW
7660 }
7661 break;
a687059c 7662 default:
a0ed51b3 7663 normal_default:
fd400ab9 7664 if (UTF8_IS_START(*p) && UTF) {
1df70142 7665 STRLEN numlen;
5e12f4fb 7666 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7667 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7668 p += numlen;
7669 }
7670 else
7671 ender = *p++;
a0d0e21e 7672 break;
a687059c 7673 }
bcdf7404
YO
7674 if ( RExC_flags & RXf_PMf_EXTENDED)
7675 p = regwhite( pRExC_state, p );
60a8b682
JH
7676 if (UTF && FOLD) {
7677 /* Prime the casefolded buffer. */
ac7e0132 7678 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7679 }
bcdf7404 7680 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7681 if (len)
7682 p = oldp;
16ea2a2e 7683 else if (UTF) {
80aecb99 7684 if (FOLD) {
60a8b682 7685 /* Emit all the Unicode characters. */
1df70142 7686 STRLEN numlen;
80aecb99
JH
7687 for (foldbuf = tmpbuf;
7688 foldlen;
7689 foldlen -= numlen) {
7690 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7691 if (numlen > 0) {
71207a34 7692 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7693 s += unilen;
7694 len += unilen;
7695 /* In EBCDIC the numlen
7696 * and unilen can differ. */
9dc45d57 7697 foldbuf += numlen;
47654450
JH
7698 if (numlen >= foldlen)
7699 break;
9dc45d57
JH
7700 }
7701 else
7702 break; /* "Can't happen." */
80aecb99
JH
7703 }
7704 }
7705 else {
71207a34 7706 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7707 if (unilen > 0) {
0ebc6274
JH
7708 s += unilen;
7709 len += unilen;
9dc45d57 7710 }
80aecb99 7711 }
a0ed51b3 7712 }
a0d0e21e
LW
7713 else {
7714 len++;
eb160463 7715 REGC((char)ender, s++);
a0d0e21e
LW
7716 }
7717 break;
a687059c 7718 }
16ea2a2e 7719 if (UTF) {
80aecb99 7720 if (FOLD) {
60a8b682 7721 /* Emit all the Unicode characters. */
1df70142 7722 STRLEN numlen;
80aecb99
JH
7723 for (foldbuf = tmpbuf;
7724 foldlen;
7725 foldlen -= numlen) {
7726 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7727 if (numlen > 0) {
71207a34 7728 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7729 len += unilen;
7730 s += unilen;
7731 /* In EBCDIC the numlen
7732 * and unilen can differ. */
9dc45d57 7733 foldbuf += numlen;
47654450
JH
7734 if (numlen >= foldlen)
7735 break;
9dc45d57
JH
7736 }
7737 else
7738 break;
80aecb99
JH
7739 }
7740 }
7741 else {
71207a34 7742 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7743 if (unilen > 0) {
0ebc6274
JH
7744 s += unilen;
7745 len += unilen;
9dc45d57 7746 }
80aecb99
JH
7747 }
7748 len--;
a0ed51b3
LW
7749 }
7750 else
eb160463 7751 REGC((char)ender, s++);
a0d0e21e
LW
7752 }
7753 loopdone:
830247a4 7754 RExC_parse = p - 1;
fac92740 7755 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7756 nextchar(pRExC_state);
793db0cb
JH
7757 {
7758 /* len is STRLEN which is unsigned, need to copy to signed */
7759 IV iv = len;
7760 if (iv < 0)
7761 vFAIL("Internal disaster");
7762 }
a0d0e21e
LW
7763 if (len > 0)
7764 *flagp |= HASWIDTH;
090f7165 7765 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7766 *flagp |= SIMPLE;
3dab1dad 7767
cd439c50 7768 if (SIZE_ONLY)
830247a4 7769 RExC_size += STR_SZ(len);
3dab1dad
YO
7770 else {
7771 STR_LEN(ret) = len;
830247a4 7772 RExC_emit += STR_SZ(len);
07be1b83 7773 }
3dab1dad 7774 }
a0d0e21e
LW
7775 break;
7776 }
a687059c 7777
a0d0e21e 7778 return(ret);
a687059c
LW
7779}
7780
873ef191 7781STATIC char *
bcdf7404 7782S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7783{
bcdf7404 7784 const char *e = RExC_end;
7918f24d
NC
7785
7786 PERL_ARGS_ASSERT_REGWHITE;
7787
5b5a24f7
CS
7788 while (p < e) {
7789 if (isSPACE(*p))
7790 ++p;
7791 else if (*p == '#') {
bcdf7404 7792 bool ended = 0;
5b5a24f7 7793 do {
bcdf7404
YO
7794 if (*p++ == '\n') {
7795 ended = 1;
7796 break;
7797 }
7798 } while (p < e);
7799 if (!ended)
7800 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7801 }
7802 else
7803 break;
7804 }
7805 return p;
7806}
7807
b8c5462f
JH
7808/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7809 Character classes ([:foo:]) can also be negated ([:^foo:]).
7810 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7811 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7812 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7813
7814#define POSIXCC_DONE(c) ((c) == ':')
7815#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7816#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7817
b8c5462f 7818STATIC I32
830247a4 7819S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7820{
97aff369 7821 dVAR;
936ed897 7822 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7823
7918f24d
NC
7824 PERL_ARGS_ASSERT_REGPPOSIXCC;
7825
830247a4 7826 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7827 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7828 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7829 const char c = UCHARAT(RExC_parse);
097eb12c 7830 char* const s = RExC_parse++;
b81d288d 7831
9a86a77b 7832 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7833 RExC_parse++;
7834 if (RExC_parse == RExC_end)
620e46c5 7835 /* Grandfather lone [:, [=, [. */
830247a4 7836 RExC_parse = s;
620e46c5 7837 else {
3dab1dad 7838 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7839 assert(*t == c);
7840
9a86a77b 7841 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7842 const char *posixcc = s + 1;
830247a4 7843 RExC_parse++; /* skip over the ending ] */
3dab1dad 7844
b8c5462f 7845 if (*s == ':') {
1df70142
AL
7846 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7847 const I32 skip = t - posixcc;
80916619
NC
7848
7849 /* Initially switch on the length of the name. */
7850 switch (skip) {
7851 case 4:
3dab1dad
YO
7852 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7853 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7854 break;
80916619
NC
7855 case 5:
7856 /* Names all of length 5. */
7857 /* alnum alpha ascii blank cntrl digit graph lower
7858 print punct space upper */
7859 /* Offset 4 gives the best switch position. */
7860 switch (posixcc[4]) {
7861 case 'a':
3dab1dad
YO
7862 if (memEQ(posixcc, "alph", 4)) /* alpha */
7863 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7864 break;
7865 case 'e':
3dab1dad
YO
7866 if (memEQ(posixcc, "spac", 4)) /* space */
7867 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7868 break;
7869 case 'h':
3dab1dad
YO
7870 if (memEQ(posixcc, "grap", 4)) /* graph */
7871 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7872 break;
7873 case 'i':
3dab1dad
YO
7874 if (memEQ(posixcc, "asci", 4)) /* ascii */
7875 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7876 break;
7877 case 'k':
3dab1dad
YO
7878 if (memEQ(posixcc, "blan", 4)) /* blank */
7879 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7880 break;
7881 case 'l':
3dab1dad
YO
7882 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7883 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7884 break;
7885 case 'm':
3dab1dad
YO
7886 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7887 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7888 break;
7889 case 'r':
3dab1dad
YO
7890 if (memEQ(posixcc, "lowe", 4)) /* lower */
7891 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7892 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7893 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7894 break;
7895 case 't':
3dab1dad
YO
7896 if (memEQ(posixcc, "digi", 4)) /* digit */
7897 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7898 else if (memEQ(posixcc, "prin", 4)) /* print */
7899 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7900 else if (memEQ(posixcc, "punc", 4)) /* punct */
7901 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7902 break;
b8c5462f
JH
7903 }
7904 break;
80916619 7905 case 6:
3dab1dad
YO
7906 if (memEQ(posixcc, "xdigit", 6))
7907 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7908 break;
7909 }
80916619
NC
7910
7911 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7912 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7913 t - s - 1, s + 1);
80916619
NC
7914 assert (posixcc[skip] == ':');
7915 assert (posixcc[skip+1] == ']');
b45f050a 7916 } else if (!SIZE_ONLY) {
b8c5462f 7917 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7918
830247a4 7919 /* adjust RExC_parse so the warning shows after
b45f050a 7920 the class closes */
9a86a77b 7921 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7922 RExC_parse++;
b45f050a
JF
7923 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7924 }
b8c5462f
JH
7925 } else {
7926 /* Maternal grandfather:
7927 * "[:" ending in ":" but not in ":]" */
830247a4 7928 RExC_parse = s;
767d463e 7929 }
620e46c5
JH
7930 }
7931 }
7932
b8c5462f
JH
7933 return namedclass;
7934}
7935
7936STATIC void
830247a4 7937S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7938{
97aff369 7939 dVAR;
7918f24d
NC
7940
7941 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7942
3dab1dad 7943 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7944 const char *s = RExC_parse;
7945 const char c = *s++;
b8c5462f 7946
3dab1dad 7947 while (isALNUM(*s))
b8c5462f
JH
7948 s++;
7949 if (*s && c == *s && s[1] == ']') {
668c081a
NC
7950 ckWARN3reg(s+2,
7951 "POSIX syntax [%c %c] belongs inside character classes",
7952 c, c);
b45f050a
JF
7953
7954 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7955 if (POSIXCC_NOTYET(c)) {
830247a4 7956 /* adjust RExC_parse so the error shows after
b45f050a 7957 the class closes */
9a86a77b 7958 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7959 NOOP;
b45f050a
JF
7960 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7961 }
b8c5462f
JH
7962 }
7963 }
620e46c5
JH
7964}
7965
7f6f358c 7966
89836f1f
YO
7967#define _C_C_T_(NAME,TEST,WORD) \
7968ANYOF_##NAME: \
7969 if (LOC) \
7970 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7971 else { \
7972 for (value = 0; value < 256; value++) \
7973 if (TEST) \
7974 ANYOF_BITMAP_SET(ret, value); \
7975 } \
7976 yesno = '+'; \
7977 what = WORD; \
7978 break; \
7979case ANYOF_N##NAME: \
7980 if (LOC) \
7981 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7982 else { \
7983 for (value = 0; value < 256; value++) \
7984 if (!TEST) \
7985 ANYOF_BITMAP_SET(ret, value); \
7986 } \
7987 yesno = '!'; \
7988 what = WORD; \
7989 break
7990
e1d1eefb
YO
7991#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7992ANYOF_##NAME: \
7993 for (value = 0; value < 256; value++) \
7994 if (TEST) \
7995 ANYOF_BITMAP_SET(ret, value); \
7996 yesno = '+'; \
7997 what = WORD; \
7998 break; \
7999case ANYOF_N##NAME: \
8000 for (value = 0; value < 256; value++) \
8001 if (!TEST) \
8002 ANYOF_BITMAP_SET(ret, value); \
8003 yesno = '!'; \
8004 what = WORD; \
8005 break
89836f1f 8006
da7fcca4
YO
8007/*
8008 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8009 so that it is possible to override the option here without having to
8010 rebuild the entire core. as we are required to do if we change regcomp.h
8011 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8012*/
8013#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8014#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8015#endif
8016
8017#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8018#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8019#else
8020#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8021#endif
8022
7f6f358c
YO
8023/*
8024 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8025 matches the pattern or if the pattern matches a single char only and
8026 that char is < 256 and we are case insensitive then we produce an
8027 EXACT node instead.
7f6f358c 8028*/
89836f1f 8029
76e3520e 8030STATIC regnode *
3dab1dad 8031S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8032{
97aff369 8033 dVAR;
9a86a77b 8034 register UV nextvalue;
3568d838 8035 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8036 register IV range = 0;
e1d1eefb 8037 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8038 register regnode *ret;
ba210ebe 8039 STRLEN numlen;
ffc61ed2 8040 IV namedclass;
cbbf8932 8041 char *rangebegin = NULL;
936ed897 8042 bool need_class = 0;
c445ea15 8043 SV *listsv = NULL;
ffc61ed2 8044 UV n;
9e55ce06 8045 bool optimize_invert = TRUE;
cbbf8932 8046 AV* unicode_alternate = NULL;
1b2d223b
JH
8047#ifdef EBCDIC
8048 UV literal_endpoint = 0;
8049#endif
7f6f358c 8050 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 8051
3dab1dad 8052 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8053 case we need to change the emitted regop to an EXACT. */
07be1b83 8054 const char * orig_parse = RExC_parse;
72f13be8 8055 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8056
8057 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8058#ifndef DEBUGGING
8059 PERL_UNUSED_ARG(depth);
8060#endif
72f13be8 8061
3dab1dad 8062 DEBUG_PARSE("clas");
7f6f358c
YO
8063
8064 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8065 ret = reganode(pRExC_state, ANYOF, 0);
8066
8067 if (!SIZE_ONLY)
8068 ANYOF_FLAGS(ret) = 0;
8069
9a86a77b 8070 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8071 RExC_naughty++;
8072 RExC_parse++;
8073 if (!SIZE_ONLY)
8074 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8075 }
a0d0e21e 8076
73060fc4 8077 if (SIZE_ONLY) {
830247a4 8078 RExC_size += ANYOF_SKIP;
73060fc4
JH
8079 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8080 }
936ed897 8081 else {
830247a4 8082 RExC_emit += ANYOF_SKIP;
936ed897
IZ
8083 if (FOLD)
8084 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8085 if (LOC)
8086 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 8087 ANYOF_BITMAP_ZERO(ret);
396482e1 8088 listsv = newSVpvs("# comment\n");
a0d0e21e 8089 }
b8c5462f 8090
9a86a77b
JH
8091 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8092
b938889d 8093 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8094 checkposixcc(pRExC_state);
b8c5462f 8095
f064b6ad
HS
8096 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8097 if (UCHARAT(RExC_parse) == ']')
8098 goto charclassloop;
ffc61ed2 8099
fc8cd66c 8100parseit:
9a86a77b 8101 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8102
8103 charclassloop:
8104
8105 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8106
73b437c8 8107 if (!range)
830247a4 8108 rangebegin = RExC_parse;
ffc61ed2 8109 if (UTF) {
5e12f4fb 8110 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8111 RExC_end - RExC_parse,
9f7f3913 8112 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8113 RExC_parse += numlen;
8114 }
8115 else
8116 value = UCHARAT(RExC_parse++);
7f6f358c 8117
9a86a77b
JH
8118 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8119 if (value == '[' && POSIXCC(nextvalue))
830247a4 8120 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8121 else if (value == '\\') {
ffc61ed2 8122 if (UTF) {
5e12f4fb 8123 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8124 RExC_end - RExC_parse,
9f7f3913 8125 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8126 RExC_parse += numlen;
8127 }
8128 else
8129 value = UCHARAT(RExC_parse++);
470c3474 8130 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8131 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8132 * be a problem later if we want switch on Unicode.
8133 * A similar issue a little bit later when switching on
8134 * namedclass. --jhi */
ffc61ed2 8135 switch ((I32)value) {
b8c5462f
JH
8136 case 'w': namedclass = ANYOF_ALNUM; break;
8137 case 'W': namedclass = ANYOF_NALNUM; break;
8138 case 's': namedclass = ANYOF_SPACE; break;
8139 case 'S': namedclass = ANYOF_NSPACE; break;
8140 case 'd': namedclass = ANYOF_DIGIT; break;
8141 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8142 case 'v': namedclass = ANYOF_VERTWS; break;
8143 case 'V': namedclass = ANYOF_NVERTWS; break;
8144 case 'h': namedclass = ANYOF_HORIZWS; break;
8145 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8146 case 'N': /* Handle \N{NAME} in class */
8147 {
8148 /* We only pay attention to the first char of
8149 multichar strings being returned. I kinda wonder
8150 if this makes sense as it does change the behaviour
8151 from earlier versions, OTOH that behaviour was broken
8152 as well. */
8153 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8154 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8155 goto parseit;
8156 }
8157 value= v;
8158 }
8159 break;
ffc61ed2
JH
8160 case 'p':
8161 case 'P':
3dab1dad
YO
8162 {
8163 char *e;
af6f566e 8164 if (RExC_parse >= RExC_end)
2a4859cd 8165 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8166 if (*RExC_parse == '{') {
1df70142 8167 const U8 c = (U8)value;
ffc61ed2
JH
8168 e = strchr(RExC_parse++, '}');
8169 if (!e)
0da60cf5 8170 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8171 while (isSPACE(UCHARAT(RExC_parse)))
8172 RExC_parse++;
8173 if (e == RExC_parse)
0da60cf5 8174 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8175 n = e - RExC_parse;
ab13f0c7
JH
8176 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8177 n--;
ffc61ed2
JH
8178 }
8179 else {
8180 e = RExC_parse;
8181 n = 1;
8182 }
8183 if (!SIZE_ONLY) {
ab13f0c7
JH
8184 if (UCHARAT(RExC_parse) == '^') {
8185 RExC_parse++;
8186 n--;
8187 value = value == 'p' ? 'P' : 'p'; /* toggle */
8188 while (isSPACE(UCHARAT(RExC_parse))) {
8189 RExC_parse++;
8190 n--;
8191 }
8192 }
097eb12c
AL
8193 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8194 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8195 }
8196 RExC_parse = e + 1;
8197 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8198 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8199 }
f81125e2 8200 break;
b8c5462f
JH
8201 case 'n': value = '\n'; break;
8202 case 'r': value = '\r'; break;
8203 case 't': value = '\t'; break;
8204 case 'f': value = '\f'; break;
8205 case 'b': value = '\b'; break;
c7f1f016
NIS
8206 case 'e': value = ASCII_TO_NATIVE('\033');break;
8207 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8208 case 'o':
8209 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8210 {
8211 const char* error_msg;
8212 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8213 &value,
8214 &numlen,
454155d9
KW
8215 &error_msg,
8216 SIZE_ONLY);
8217 RExC_parse += numlen;
8218 if (! valid) {
8219 vFAIL(error_msg);
8220 }
f0a2b745 8221 }
f0a2b745
KW
8222 if (PL_encoding && value < 0x100) {
8223 goto recode_encoding;
8224 }
8225 break;
b8c5462f 8226 case 'x':
ffc61ed2 8227 if (*RExC_parse == '{') {
a4c04bdc
NC
8228 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8229 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8230 char * const e = strchr(RExC_parse++, '}');
b81d288d 8231 if (!e)
ffc61ed2 8232 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8233
8234 numlen = e - RExC_parse;
8235 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8236 RExC_parse = e + 1;
8237 }
8238 else {
a4c04bdc 8239 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8240 numlen = 2;
8241 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8242 RExC_parse += numlen;
8243 }
9e08bc66
TS
8244 if (PL_encoding && value < 0x100)
8245 goto recode_encoding;
b8c5462f
JH
8246 break;
8247 case 'c':
f9d13529 8248 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8249 break;
8250 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8251 case '5': case '6': case '7':
9e08bc66 8252 {
c99e91e9
KW
8253 /* Take 1-3 octal digits */
8254 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8255 numlen = 3;
8256 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8257 RExC_parse += numlen;
8258 if (PL_encoding && value < 0x100)
8259 goto recode_encoding;
8260 break;
8261 }
8262 recode_encoding:
8263 {
8264 SV* enc = PL_encoding;
8265 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8266 if (!enc && SIZE_ONLY)
8267 ckWARNreg(RExC_parse,
8268 "Invalid escape in the specified encoding");
9e08bc66
TS
8269 break;
8270 }
1028017a 8271 default:
c99e91e9
KW
8272 /* Allow \_ to not give an error */
8273 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8274 ckWARN2reg(RExC_parse,
8275 "Unrecognized escape \\%c in character class passed through",
8276 (int)value);
c99e91e9 8277 }
1028017a 8278 break;
b8c5462f 8279 }
ffc61ed2 8280 } /* end of \blah */
1b2d223b
JH
8281#ifdef EBCDIC
8282 else
8283 literal_endpoint++;
8284#endif
ffc61ed2
JH
8285
8286 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8287
8288 if (!SIZE_ONLY && !need_class)
936ed897 8289 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8290
936ed897 8291 need_class = 1;
ffc61ed2
JH
8292
8293 /* a bad range like a-\d, a-[:digit:] ? */
8294 if (range) {
73b437c8 8295 if (!SIZE_ONLY) {
668c081a
NC
8296 const int w =
8297 RExC_parse >= rangebegin ?
8298 RExC_parse - rangebegin : 0;
8299 ckWARN4reg(RExC_parse,
b45f050a 8300 "False [] range \"%*.*s\"",
097eb12c 8301 w, w, rangebegin);
668c081a 8302
3568d838
JH
8303 if (prevvalue < 256) {
8304 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8305 ANYOF_BITMAP_SET(ret, '-');
8306 }
8307 else {
8308 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8309 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8310 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8311 }
b8c5462f 8312 }
ffc61ed2
JH
8313
8314 range = 0; /* this was not a true range */
73b437c8 8315 }
ffc61ed2 8316
89836f1f
YO
8317
8318
73b437c8 8319 if (!SIZE_ONLY) {
c49a72a9
NC
8320 const char *what = NULL;
8321 char yesno = 0;
8322
3568d838
JH
8323 if (namedclass > OOB_NAMEDCLASS)
8324 optimize_invert = FALSE;
e2962f66
JH
8325 /* Possible truncation here but in some 64-bit environments
8326 * the compiler gets heartburn about switch on 64-bit values.
8327 * A similar issue a little earlier when switching on value.
98f323fa 8328 * --jhi */
e2962f66 8329 switch ((I32)namedclass) {
da7fcca4
YO
8330
8331 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8332 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8333 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8334 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8335 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8336 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8337 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8338 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8339 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8340 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8341#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8342 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8343 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8344#else
8345 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8346 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8347#endif
89836f1f 8348 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8349 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8350 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8351 case ANYOF_ASCII:
8352 if (LOC)
936ed897 8353 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8354 else {
c7f1f016 8355#ifndef EBCDIC
1ba5c669
JH
8356 for (value = 0; value < 128; value++)
8357 ANYOF_BITMAP_SET(ret, value);
8358#else /* EBCDIC */
ffbc6a93 8359 for (value = 0; value < 256; value++) {
3a3c4447
JH
8360 if (isASCII(value))
8361 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8362 }
1ba5c669 8363#endif /* EBCDIC */
73b437c8 8364 }
c49a72a9
NC
8365 yesno = '+';
8366 what = "ASCII";
73b437c8
JH
8367 break;
8368 case ANYOF_NASCII:
8369 if (LOC)
936ed897 8370 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8371 else {
c7f1f016 8372#ifndef EBCDIC
1ba5c669
JH
8373 for (value = 128; value < 256; value++)
8374 ANYOF_BITMAP_SET(ret, value);
8375#else /* EBCDIC */
ffbc6a93 8376 for (value = 0; value < 256; value++) {
3a3c4447
JH
8377 if (!isASCII(value))
8378 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8379 }
1ba5c669 8380#endif /* EBCDIC */
73b437c8 8381 }
c49a72a9
NC
8382 yesno = '!';
8383 what = "ASCII";
89836f1f 8384 break;
ffc61ed2
JH
8385 case ANYOF_DIGIT:
8386 if (LOC)
8387 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8388 else {
8389 /* consecutive digits assumed */
8390 for (value = '0'; value <= '9'; value++)
8391 ANYOF_BITMAP_SET(ret, value);
8392 }
c49a72a9 8393 yesno = '+';
da7fcca4 8394 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8395 break;
8396 case ANYOF_NDIGIT:
8397 if (LOC)
8398 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8399 else {
8400 /* consecutive digits assumed */
8401 for (value = 0; value < '0'; value++)
8402 ANYOF_BITMAP_SET(ret, value);
8403 for (value = '9' + 1; value < 256; value++)
8404 ANYOF_BITMAP_SET(ret, value);
8405 }
c49a72a9 8406 yesno = '!';
da7fcca4 8407 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8408 break;
f81125e2
JP
8409 case ANYOF_MAX:
8410 /* this is to handle \p and \P */
8411 break;
73b437c8 8412 default:
b45f050a 8413 vFAIL("Invalid [::] class");
73b437c8 8414 break;
b8c5462f 8415 }
c49a72a9
NC
8416 if (what) {
8417 /* Strings such as "+utf8::isWord\n" */
8418 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8419 }
b8c5462f 8420 if (LOC)
936ed897 8421 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8422 continue;
a0d0e21e 8423 }
ffc61ed2
JH
8424 } /* end of namedclass \blah */
8425
a0d0e21e 8426 if (range) {
eb160463 8427 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8428 const int w = RExC_parse - rangebegin;
8429 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8430 range = 0; /* not a valid range */
73b437c8 8431 }
a0d0e21e
LW
8432 }
8433 else {
3568d838 8434 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8435 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8436 RExC_parse[1] != ']') {
8437 RExC_parse++;
ffc61ed2
JH
8438
8439 /* a bad range like \w-, [:word:]- ? */
8440 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8441 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8442 const int w =
afd78fd5
JH
8443 RExC_parse >= rangebegin ?
8444 RExC_parse - rangebegin : 0;
830247a4 8445 vWARN4(RExC_parse,
b45f050a 8446 "False [] range \"%*.*s\"",
097eb12c 8447 w, w, rangebegin);
afd78fd5 8448 }
73b437c8 8449 if (!SIZE_ONLY)
936ed897 8450 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8451 } else
ffc61ed2
JH
8452 range = 1; /* yeah, it's a range! */
8453 continue; /* but do it the next time */
a0d0e21e 8454 }
a687059c 8455 }
ffc61ed2 8456
93733859 8457 /* now is the next time */
07be1b83 8458 /*stored += (value - prevvalue + 1);*/
ae5c130c 8459 if (!SIZE_ONLY) {
3568d838 8460 if (prevvalue < 256) {
1df70142 8461 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8462 IV i;
3568d838 8463#ifdef EBCDIC
1b2d223b
JH
8464 /* In EBCDIC [\x89-\x91] should include
8465 * the \x8e but [i-j] should not. */
8466 if (literal_endpoint == 2 &&
8467 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8468 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8469 {
3568d838
JH
8470 if (isLOWER(prevvalue)) {
8471 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8472 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8473 stored++;
ffc61ed2 8474 ANYOF_BITMAP_SET(ret, i);
2670d666 8475 }
ffc61ed2 8476 } else {
3568d838 8477 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8478 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8479 stored++;
ffc61ed2 8480 ANYOF_BITMAP_SET(ret, i);
2670d666 8481 }
ffc61ed2 8482 }
8ada0baa 8483 }
ffc61ed2 8484 else
8ada0baa 8485#endif
07be1b83
YO
8486 for (i = prevvalue; i <= ceilvalue; i++) {
8487 if (!ANYOF_BITMAP_TEST(ret,i)) {
8488 stored++;
8489 ANYOF_BITMAP_SET(ret, i);
8490 }
8491 }
3568d838 8492 }
a5961de5 8493 if (value > 255 || UTF) {
1df70142
AL
8494 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8495 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8496 stored+=2; /* can't optimize this class */
ffc61ed2 8497 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8498 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8499 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8500 prevnatvalue, natvalue);
8501 }
8502 else if (prevnatvalue == natvalue) {
8503 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8504 if (FOLD) {
89ebb4a3 8505 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8506 STRLEN foldlen;
1df70142 8507 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8508
e294cc5d
JH
8509#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8510 if (RExC_precomp[0] == ':' &&
8511 RExC_precomp[1] == '[' &&
8512 (f == 0xDF || f == 0x92)) {
8513 f = NATIVE_TO_UNI(f);
8514 }
8515#endif
c840d2a2
JH
8516 /* If folding and foldable and a single
8517 * character, insert also the folded version
8518 * to the charclass. */
9e55ce06 8519 if (f != value) {
e294cc5d
JH
8520#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8521 if ((RExC_precomp[0] == ':' &&
8522 RExC_precomp[1] == '[' &&
8523 (f == 0xA2 &&
8524 (value == 0xFB05 || value == 0xFB06))) ?
8525 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8526 foldlen == (STRLEN)UNISKIP(f) )
8527#else
eb160463 8528 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8529#endif
9e55ce06
JH
8530 Perl_sv_catpvf(aTHX_ listsv,
8531 "%04"UVxf"\n", f);
8532 else {
8533 /* Any multicharacter foldings
8534 * require the following transform:
8535 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8536 * where E folds into "pq" and F folds
8537 * into "rst", all other characters
8538 * fold to single characters. We save
8539 * away these multicharacter foldings,
8540 * to be later saved as part of the
8541 * additional "s" data. */
8542 SV *sv;
8543
8544 if (!unicode_alternate)
8545 unicode_alternate = newAV();
740cce10
NC
8546 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8547 TRUE);
9e55ce06
JH
8548 av_push(unicode_alternate, sv);
8549 }
8550 }
254ba52a 8551
60a8b682
JH
8552 /* If folding and the value is one of the Greek
8553 * sigmas insert a few more sigmas to make the
8554 * folding rules of the sigmas to work right.
8555 * Note that not all the possible combinations
8556 * are handled here: some of them are handled
9e55ce06
JH
8557 * by the standard folding rules, and some of
8558 * them (literal or EXACTF cases) are handled
8559 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8560 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8561 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8562 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8563 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8564 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8565 }
8566 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8567 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8568 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8569 }
8570 }
ffc61ed2 8571 }
1b2d223b
JH
8572#ifdef EBCDIC
8573 literal_endpoint = 0;
8574#endif
8ada0baa 8575 }
ffc61ed2
JH
8576
8577 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8578 }
ffc61ed2 8579
936ed897 8580 if (need_class) {
4f66b38d 8581 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8582 if (SIZE_ONLY)
830247a4 8583 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8584 else
830247a4 8585 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8586 }
ffc61ed2 8587
7f6f358c
YO
8588
8589 if (SIZE_ONLY)
8590 return ret;
8591 /****** !SIZE_ONLY AFTER HERE *********/
8592
3b57cd43 8593 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8594 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8595 ) {
8596 /* optimize single char class to an EXACT node
8597 but *only* when its not a UTF/high char */
07be1b83
YO
8598 const char * cur_parse= RExC_parse;
8599 RExC_emit = (regnode *)orig_emit;
8600 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8601 ret = reg_node(pRExC_state,
8602 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8603 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8604 *STRING(ret)= (char)value;
8605 STR_LEN(ret)= 1;
8606 RExC_emit += STR_SZ(1);
ef8d46e8 8607 SvREFCNT_dec(listsv);
7f6f358c
YO
8608 return ret;
8609 }
ae5c130c 8610 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8611 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8612 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8613 ) {
a0ed51b3 8614 for (value = 0; value < 256; ++value) {
936ed897 8615 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8616 UV fold = PL_fold[value];
ffc61ed2
JH
8617
8618 if (fold != value)
8619 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8620 }
8621 }
936ed897 8622 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8623 }
ffc61ed2 8624
ae5c130c 8625 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8626 if (optimize_invert &&
ffc61ed2
JH
8627 /* If the only flag is inversion. */
8628 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8629 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8630 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8631 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8632 }
7f6f358c 8633 {
097eb12c 8634 AV * const av = newAV();
ffc61ed2 8635 SV *rv;
9e55ce06 8636 /* The 0th element stores the character class description
6a0407ee 8637 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8638 * to initialize the appropriate swash (which gets stored in
8639 * the 1st element), and also useful for dumping the regnode.
8640 * The 2nd element stores the multicharacter foldings,
6a0407ee 8641 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8642 av_store(av, 0, listsv);
8643 av_store(av, 1, NULL);
ad64d0ec
NC
8644 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8645 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8646 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8647 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8648 ARG_SET(ret, n);
a0ed51b3 8649 }
a0ed51b3
LW
8650 return ret;
8651}
89836f1f
YO
8652#undef _C_C_T_
8653
a0ed51b3 8654
bcdf7404
YO
8655/* reg_skipcomment()
8656
8657 Absorbs an /x style # comments from the input stream.
8658 Returns true if there is more text remaining in the stream.
8659 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8660 terminates the pattern without including a newline.
8661
8662 Note its the callers responsibility to ensure that we are
8663 actually in /x mode
8664
8665*/
8666
8667STATIC bool
8668S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8669{
8670 bool ended = 0;
7918f24d
NC
8671
8672 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8673
bcdf7404
YO
8674 while (RExC_parse < RExC_end)
8675 if (*RExC_parse++ == '\n') {
8676 ended = 1;
8677 break;
8678 }
8679 if (!ended) {
8680 /* we ran off the end of the pattern without ending
8681 the comment, so we have to add an \n when wrapping */
8682 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8683 return 0;
8684 } else
8685 return 1;
8686}
8687
8688/* nextchar()
8689
8690 Advance that parse position, and optionall absorbs
8691 "whitespace" from the inputstream.
8692
8693 Without /x "whitespace" means (?#...) style comments only,
8694 with /x this means (?#...) and # comments and whitespace proper.
8695
8696 Returns the RExC_parse point from BEFORE the scan occurs.
8697
8698 This is the /x friendly way of saying RExC_parse++.
8699*/
8700
76e3520e 8701STATIC char*
830247a4 8702S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8703{
097eb12c 8704 char* const retval = RExC_parse++;
a0d0e21e 8705
7918f24d
NC
8706 PERL_ARGS_ASSERT_NEXTCHAR;
8707
4633a7c4 8708 for (;;) {
830247a4
IZ
8709 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8710 RExC_parse[2] == '#') {
e994fd66
AE
8711 while (*RExC_parse != ')') {
8712 if (RExC_parse == RExC_end)
8713 FAIL("Sequence (?#... not terminated");
830247a4 8714 RExC_parse++;
e994fd66 8715 }
830247a4 8716 RExC_parse++;
4633a7c4
LW
8717 continue;
8718 }
bbe252da 8719 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8720 if (isSPACE(*RExC_parse)) {
8721 RExC_parse++;
748a9306
LW
8722 continue;
8723 }
830247a4 8724 else if (*RExC_parse == '#') {
bcdf7404
YO
8725 if ( reg_skipcomment( pRExC_state ) )
8726 continue;
748a9306 8727 }
748a9306 8728 }
4633a7c4 8729 return retval;
a0d0e21e 8730 }
a687059c
LW
8731}
8732
8733/*
c277df42 8734- reg_node - emit a node
a0d0e21e 8735*/
76e3520e 8736STATIC regnode * /* Location. */
830247a4 8737S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8738{
97aff369 8739 dVAR;
c277df42 8740 register regnode *ptr;
504618e9 8741 regnode * const ret = RExC_emit;
07be1b83 8742 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8743
7918f24d
NC
8744 PERL_ARGS_ASSERT_REG_NODE;
8745
c277df42 8746 if (SIZE_ONLY) {
830247a4
IZ
8747 SIZE_ALIGN(RExC_size);
8748 RExC_size += 1;
a0d0e21e
LW
8749 return(ret);
8750 }
3b57cd43
YO
8751 if (RExC_emit >= RExC_emit_bound)
8752 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8753
c277df42 8754 NODE_ALIGN_FILL(ret);
a0d0e21e 8755 ptr = ret;
c277df42 8756 FILL_ADVANCE_NODE(ptr, op);
7122b237 8757#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8758 if (RExC_offsets) { /* MJD */
07be1b83 8759 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8760 "reg_node", __LINE__,
13d6edb4 8761 PL_reg_name[op],
07be1b83
YO
8762 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8763 ? "Overwriting end of array!\n" : "OK",
8764 (UV)(RExC_emit - RExC_emit_start),
8765 (UV)(RExC_parse - RExC_start),
8766 (UV)RExC_offsets[0]));
ccb2c380 8767 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8768 }
7122b237 8769#endif
830247a4 8770 RExC_emit = ptr;
a0d0e21e 8771 return(ret);
a687059c
LW
8772}
8773
8774/*
a0d0e21e
LW
8775- reganode - emit a node with an argument
8776*/
76e3520e 8777STATIC regnode * /* Location. */
830247a4 8778S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8779{
97aff369 8780 dVAR;
c277df42 8781 register regnode *ptr;
504618e9 8782 regnode * const ret = RExC_emit;
07be1b83 8783 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8784
7918f24d
NC
8785 PERL_ARGS_ASSERT_REGANODE;
8786
c277df42 8787 if (SIZE_ONLY) {
830247a4
IZ
8788 SIZE_ALIGN(RExC_size);
8789 RExC_size += 2;
6bda09f9
YO
8790 /*
8791 We can't do this:
8792
8793 assert(2==regarglen[op]+1);
8794
8795 Anything larger than this has to allocate the extra amount.
8796 If we changed this to be:
8797
8798 RExC_size += (1 + regarglen[op]);
8799
8800 then it wouldn't matter. Its not clear what side effect
8801 might come from that so its not done so far.
8802 -- dmq
8803 */
a0d0e21e
LW
8804 return(ret);
8805 }
3b57cd43
YO
8806 if (RExC_emit >= RExC_emit_bound)
8807 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8808
c277df42 8809 NODE_ALIGN_FILL(ret);
a0d0e21e 8810 ptr = ret;
c277df42 8811 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8812#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8813 if (RExC_offsets) { /* MJD */
07be1b83 8814 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8815 "reganode",
ccb2c380 8816 __LINE__,
13d6edb4 8817 PL_reg_name[op],
07be1b83 8818 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8819 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8820 (UV)(RExC_emit - RExC_emit_start),
8821 (UV)(RExC_parse - RExC_start),
8822 (UV)RExC_offsets[0]));
ccb2c380 8823 Set_Cur_Node_Offset;
fac92740 8824 }
7122b237 8825#endif
830247a4 8826 RExC_emit = ptr;
a0d0e21e 8827 return(ret);
fe14fcc3
LW
8828}
8829
8830/*
cd439c50 8831- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8832*/
71207a34
AL
8833STATIC STRLEN
8834S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8835{
97aff369 8836 dVAR;
7918f24d
NC
8837
8838 PERL_ARGS_ASSERT_REGUNI;
8839
71207a34 8840 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8841}
8842
8843/*
a0d0e21e
LW
8844- reginsert - insert an operator in front of already-emitted operand
8845*
8846* Means relocating the operand.
8847*/
76e3520e 8848STATIC void
6bda09f9 8849S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8850{
97aff369 8851 dVAR;
c277df42
IZ
8852 register regnode *src;
8853 register regnode *dst;
8854 register regnode *place;
504618e9 8855 const int offset = regarglen[(U8)op];
6bda09f9 8856 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8857 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8858
8859 PERL_ARGS_ASSERT_REGINSERT;
def51078 8860 PERL_UNUSED_ARG(depth);
22c35a8c 8861/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8862 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8863 if (SIZE_ONLY) {
6bda09f9 8864 RExC_size += size;
a0d0e21e
LW
8865 return;
8866 }
a687059c 8867
830247a4 8868 src = RExC_emit;
6bda09f9 8869 RExC_emit += size;
830247a4 8870 dst = RExC_emit;
40d049e4 8871 if (RExC_open_parens) {
6bda09f9 8872 int paren;
3b57cd43 8873 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8874 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8875 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8876 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8877 RExC_open_parens[paren] += size;
8878 } else {
3b57cd43 8879 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8880 }
8881 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8882 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8883 RExC_close_parens[paren] += size;
8884 } else {
3b57cd43 8885 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8886 }
8887 }
6bda09f9 8888 }
40d049e4 8889
fac92740 8890 while (src > opnd) {
c277df42 8891 StructCopy(--src, --dst, regnode);
7122b237 8892#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8893 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8894 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8895 "reg_insert",
ccb2c380 8896 __LINE__,
13d6edb4 8897 PL_reg_name[op],
07be1b83
YO
8898 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8899 ? "Overwriting end of array!\n" : "OK",
8900 (UV)(src - RExC_emit_start),
8901 (UV)(dst - RExC_emit_start),
8902 (UV)RExC_offsets[0]));
ccb2c380
MP
8903 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8904 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8905 }
7122b237 8906#endif
fac92740
MJD
8907 }
8908
a0d0e21e
LW
8909
8910 place = opnd; /* Op node, where operand used to be. */
7122b237 8911#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8912 if (RExC_offsets) { /* MJD */
07be1b83 8913 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8914 "reginsert",
ccb2c380 8915 __LINE__,
13d6edb4 8916 PL_reg_name[op],
07be1b83 8917 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8918 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8919 (UV)(place - RExC_emit_start),
8920 (UV)(RExC_parse - RExC_start),
786e8c11 8921 (UV)RExC_offsets[0]));
ccb2c380 8922 Set_Node_Offset(place, RExC_parse);
45948336 8923 Set_Node_Length(place, 1);
fac92740 8924 }
7122b237 8925#endif
c277df42
IZ
8926 src = NEXTOPER(place);
8927 FILL_ADVANCE_NODE(place, op);
8928 Zero(src, offset, regnode);
a687059c
LW
8929}
8930
8931/*
c277df42 8932- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8933- SEE ALSO: regtail_study
a0d0e21e 8934*/
097eb12c 8935/* TODO: All three parms should be const */
76e3520e 8936STATIC void
3dab1dad 8937S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8938{
97aff369 8939 dVAR;
c277df42 8940 register regnode *scan;
72f13be8 8941 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8942
8943 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8944#ifndef DEBUGGING
8945 PERL_UNUSED_ARG(depth);
8946#endif
a0d0e21e 8947
c277df42 8948 if (SIZE_ONLY)
a0d0e21e
LW
8949 return;
8950
8951 /* Find last node. */
8952 scan = p;
8953 for (;;) {
504618e9 8954 regnode * const temp = regnext(scan);
3dab1dad
YO
8955 DEBUG_PARSE_r({
8956 SV * const mysv=sv_newmortal();
8957 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8958 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8959 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8960 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8961 (temp == NULL ? "->" : ""),
13d6edb4 8962 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8963 );
3dab1dad
YO
8964 });
8965 if (temp == NULL)
8966 break;
8967 scan = temp;
8968 }
8969
8970 if (reg_off_by_arg[OP(scan)]) {
8971 ARG_SET(scan, val - scan);
8972 }
8973 else {
8974 NEXT_OFF(scan) = val - scan;
8975 }
8976}
8977
07be1b83 8978#ifdef DEBUGGING
3dab1dad
YO
8979/*
8980- regtail_study - set the next-pointer at the end of a node chain of p to val.
8981- Look for optimizable sequences at the same time.
8982- currently only looks for EXACT chains.
07be1b83
YO
8983
8984This is expermental code. The idea is to use this routine to perform
8985in place optimizations on branches and groups as they are constructed,
8986with the long term intention of removing optimization from study_chunk so
8987that it is purely analytical.
8988
8989Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8990to control which is which.
8991
3dab1dad
YO
8992*/
8993/* TODO: All four parms should be const */
07be1b83 8994
3dab1dad
YO
8995STATIC U8
8996S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8997{
8998 dVAR;
8999 register regnode *scan;
07be1b83
YO
9000 U8 exact = PSEUDO;
9001#ifdef EXPERIMENTAL_INPLACESCAN
9002 I32 min = 0;
9003#endif
3dab1dad
YO
9004 GET_RE_DEBUG_FLAGS_DECL;
9005
7918f24d
NC
9006 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9007
07be1b83 9008
3dab1dad
YO
9009 if (SIZE_ONLY)
9010 return exact;
9011
9012 /* Find last node. */
9013
9014 scan = p;
9015 for (;;) {
9016 regnode * const temp = regnext(scan);
07be1b83
YO
9017#ifdef EXPERIMENTAL_INPLACESCAN
9018 if (PL_regkind[OP(scan)] == EXACT)
9019 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9020 return EXACT;
9021#endif
3dab1dad
YO
9022 if ( exact ) {
9023 switch (OP(scan)) {
9024 case EXACT:
9025 case EXACTF:
9026 case EXACTFL:
9027 if( exact == PSEUDO )
9028 exact= OP(scan);
07be1b83
YO
9029 else if ( exact != OP(scan) )
9030 exact= 0;
3dab1dad
YO
9031 case NOTHING:
9032 break;
9033 default:
9034 exact= 0;
9035 }
9036 }
9037 DEBUG_PARSE_r({
9038 SV * const mysv=sv_newmortal();
9039 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9040 regprop(RExC_rx, mysv, scan);
eaf3ca90 9041 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9042 SvPV_nolen_const(mysv),
eaf3ca90 9043 REG_NODE_NUM(scan),
13d6edb4 9044 PL_reg_name[exact]);
3dab1dad 9045 });
a0d0e21e
LW
9046 if (temp == NULL)
9047 break;
9048 scan = temp;
9049 }
07be1b83
YO
9050 DEBUG_PARSE_r({
9051 SV * const mysv_val=sv_newmortal();
9052 DEBUG_PARSE_MSG("");
9053 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9054 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9055 SvPV_nolen_const(mysv_val),
9056 (IV)REG_NODE_NUM(val),
9057 (IV)(val - scan)
07be1b83
YO
9058 );
9059 });
c277df42
IZ
9060 if (reg_off_by_arg[OP(scan)]) {
9061 ARG_SET(scan, val - scan);
a0ed51b3
LW
9062 }
9063 else {
c277df42
IZ
9064 NEXT_OFF(scan) = val - scan;
9065 }
3dab1dad
YO
9066
9067 return exact;
a687059c 9068}
07be1b83 9069#endif
a687059c
LW
9070
9071/*
fd181c75 9072 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9073 */
f7819f85 9074#ifdef DEBUGGING
c33269f7 9075static void
7918f24d
NC
9076S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9077{
f7819f85
A
9078 int bit;
9079 int set=0;
7918f24d 9080
f7819f85
A
9081 for (bit=0; bit<32; bit++) {
9082 if (flags & (1<<bit)) {
9083 if (!set++ && lead)
9084 PerlIO_printf(Perl_debug_log, "%s",lead);
9085 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9086 }
9087 }
9088 if (lead) {
9089 if (set)
9090 PerlIO_printf(Perl_debug_log, "\n");
9091 else
9092 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9093 }
9094}
9095#endif
9096
a687059c 9097void
097eb12c 9098Perl_regdump(pTHX_ const regexp *r)
a687059c 9099{
35ff7856 9100#ifdef DEBUGGING
97aff369 9101 dVAR;
c445ea15 9102 SV * const sv = sv_newmortal();
ab3bbdeb 9103 SV *dsv= sv_newmortal();
f8fc2ecf 9104 RXi_GET_DECL(r,ri);
f7819f85 9105 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9106
7918f24d
NC
9107 PERL_ARGS_ASSERT_REGDUMP;
9108
f8fc2ecf 9109 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9110
9111 /* Header fields of interest. */
ab3bbdeb
YO
9112 if (r->anchored_substr) {
9113 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9114 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9115 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9116 "anchored %s%s at %"IVdf" ",
9117 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9118 (IV)r->anchored_offset);
ab3bbdeb
YO
9119 } else if (r->anchored_utf8) {
9120 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9121 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9122 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9123 "anchored utf8 %s%s at %"IVdf" ",
9124 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9125 (IV)r->anchored_offset);
ab3bbdeb
YO
9126 }
9127 if (r->float_substr) {
9128 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9129 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9130 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9131 "floating %s%s at %"IVdf"..%"UVuf" ",
9132 s, RE_SV_TAIL(r->float_substr),
7b0972df 9133 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9134 } else if (r->float_utf8) {
9135 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9136 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9137 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9138 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9139 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9140 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9141 }
33b8afdf 9142 if (r->check_substr || r->check_utf8)
b81d288d 9143 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9144 (const char *)
9145 (r->check_substr == r->float_substr
9146 && r->check_utf8 == r->float_utf8
9147 ? "(checking floating" : "(checking anchored"));
bbe252da 9148 if (r->extflags & RXf_NOSCAN)
c277df42 9149 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9150 if (r->extflags & RXf_CHECK_ALL)
c277df42 9151 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9152 if (r->check_substr || r->check_utf8)
c277df42
IZ
9153 PerlIO_printf(Perl_debug_log, ") ");
9154
f8fc2ecf
YO
9155 if (ri->regstclass) {
9156 regprop(r, sv, ri->regstclass);
1de06328 9157 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9158 }
bbe252da 9159 if (r->extflags & RXf_ANCH) {
774d564b 9160 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9161 if (r->extflags & RXf_ANCH_BOL)
774d564b 9162 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9163 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9164 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9165 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9166 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9167 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9168 PerlIO_printf(Perl_debug_log, "(GPOS)");
9169 PerlIO_putc(Perl_debug_log, ' ');
9170 }
bbe252da 9171 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9172 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9173 if (r->intflags & PREGf_SKIP)
760ac839 9174 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9175 if (r->intflags & PREGf_IMPLICIT)
760ac839 9176 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9177 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9178 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9179 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9180 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9181 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9182#else
7918f24d 9183 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9184 PERL_UNUSED_CONTEXT;
65e66c80 9185 PERL_UNUSED_ARG(r);
17c3b450 9186#endif /* DEBUGGING */
a687059c
LW
9187}
9188
9189/*
a0d0e21e
LW
9190- regprop - printable representation of opcode
9191*/
3339dfd8
YO
9192#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9193STMT_START { \
9194 if (do_sep) { \
9195 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9196 if (flags & ANYOF_INVERT) \
9197 /*make sure the invert info is in each */ \
9198 sv_catpvs(sv, "^"); \
9199 do_sep = 0; \
9200 } \
9201} STMT_END
9202
46fc3d4c 9203void
32fc9b6a 9204Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9205{
35ff7856 9206#ifdef DEBUGGING
97aff369 9207 dVAR;
9b155405 9208 register int k;
f8fc2ecf 9209 RXi_GET_DECL(prog,progi);
1de06328 9210 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9211
7918f24d 9212 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9213
76f68e9b 9214 sv_setpvs(sv, "");
8aa23a47 9215
03363afd 9216 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9217 /* It would be nice to FAIL() here, but this may be called from
9218 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9219 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9220 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9221
3dab1dad 9222 k = PL_regkind[OP(o)];
9b155405 9223
2a782b5b 9224 if (k == EXACT) {
f92a2122 9225 sv_catpvs(sv, " ");
ab3bbdeb
YO
9226 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9227 * is a crude hack but it may be the best for now since
9228 * we have no flag "this EXACTish node was UTF-8"
9229 * --jhi */
f92a2122
NC
9230 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9231 PERL_PV_ESCAPE_UNI_DETECT |
9232 PERL_PV_PRETTY_ELLIPSES |
9233 PERL_PV_PRETTY_LTGT |
9234 PERL_PV_PRETTY_NOCLEAR
9235 );
bb263b4e 9236 } else if (k == TRIE) {
3dab1dad 9237 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9238 * progi->data isn't available here */
1de06328 9239 const char op = OP(o);
647f639f 9240 const U32 n = ARG(o);
1de06328 9241 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9242 (reg_ac_data *)progi->data->data[n] :
1de06328 9243 NULL;
3251b653
NC
9244 const reg_trie_data * const trie
9245 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9246
13d6edb4 9247 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9248 DEBUG_TRIE_COMPILE_r(
9249 Perl_sv_catpvf(aTHX_ sv,
9250 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9251 (UV)trie->startstate,
1e2e3d02 9252 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9253 (UV)trie->wordcount,
9254 (UV)trie->minlen,
9255 (UV)trie->maxlen,
9256 (UV)TRIE_CHARCOUNT(trie),
9257 (UV)trie->uniquecharcount
9258 )
9259 );
9260 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9261 int i;
9262 int rangestart = -1;
f46cb337 9263 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9264 sv_catpvs(sv, "[");
1de06328
YO
9265 for (i = 0; i <= 256; i++) {
9266 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9267 if (rangestart == -1)
9268 rangestart = i;
9269 } else if (rangestart != -1) {
9270 if (i <= rangestart + 3)
9271 for (; rangestart < i; rangestart++)
9272 put_byte(sv, rangestart);
9273 else {
9274 put_byte(sv, rangestart);
9275 sv_catpvs(sv, "-");
9276 put_byte(sv, i - 1);
9277 }
9278 rangestart = -1;
9279 }
9280 }
f3a2811a 9281 sv_catpvs(sv, "]");
1de06328
YO
9282 }
9283
a3621e74 9284 } else if (k == CURLY) {
cb434fcc 9285 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9286 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9287 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9288 }
2c2d71f5
JH
9289 else if (k == WHILEM && o->flags) /* Ordinal/of */
9290 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9291 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9292 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9293 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9294 if ( k != REF || OP(o) < NREF) {
502c6561 9295 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9296 SV **name= av_fetch(list, ARG(o), 0 );
9297 if (name)
9298 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9299 }
9300 else {
502c6561 9301 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9302 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9303 I32 *nums=(I32*)SvPVX(sv_dat);
9304 SV **name= av_fetch(list, nums[0], 0 );
9305 I32 n;
9306 if (name) {
9307 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9308 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9309 (n ? "," : ""), (IV)nums[n]);
9310 }
9311 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9312 }
1f1031fe 9313 }
ee9b8eae 9314 }
1f1031fe 9315 } else if (k == GOSUB)
6bda09f9 9316 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9317 else if (k == VERB) {
9318 if (!o->flags)
9319 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9320 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9321 } else if (k == LOGICAL)
04ebc1ab 9322 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9323 else if (k == FOLDCHAR)
df44d732 9324 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9325 else if (k == ANYOF) {
9326 int i, rangestart = -1;
2d03de9c 9327 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9328 int do_sep = 0;
0bd48802
AL
9329
9330 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9331 static const char * const anyofs[] = {
653099ff
GS
9332 "\\w",
9333 "\\W",
9334 "\\s",
9335 "\\S",
9336 "\\d",
9337 "\\D",
9338 "[:alnum:]",
9339 "[:^alnum:]",
9340 "[:alpha:]",
9341 "[:^alpha:]",
9342 "[:ascii:]",
9343 "[:^ascii:]",
24d786f4
YO
9344 "[:cntrl:]",
9345 "[:^cntrl:]",
653099ff
GS
9346 "[:graph:]",
9347 "[:^graph:]",
9348 "[:lower:]",
9349 "[:^lower:]",
9350 "[:print:]",
9351 "[:^print:]",
9352 "[:punct:]",
9353 "[:^punct:]",
9354 "[:upper:]",
aaa51d5e 9355 "[:^upper:]",
653099ff 9356 "[:xdigit:]",
aaa51d5e
JF
9357 "[:^xdigit:]",
9358 "[:space:]",
9359 "[:^space:]",
9360 "[:blank:]",
9361 "[:^blank:]"
653099ff
GS
9362 };
9363
19860706 9364 if (flags & ANYOF_LOCALE)
396482e1 9365 sv_catpvs(sv, "{loc}");
19860706 9366 if (flags & ANYOF_FOLD)
396482e1 9367 sv_catpvs(sv, "{i}");
653099ff 9368 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9369 if (flags & ANYOF_INVERT)
396482e1 9370 sv_catpvs(sv, "^");
3339dfd8
YO
9371
9372 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9373 for (i = 0; i <= 256; i++) {
9374 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9375 if (rangestart == -1)
9376 rangestart = i;
9377 } else if (rangestart != -1) {
9378 if (i <= rangestart + 3)
9379 for (; rangestart < i; rangestart++)
653099ff 9380 put_byte(sv, rangestart);
ffc61ed2
JH
9381 else {
9382 put_byte(sv, rangestart);
396482e1 9383 sv_catpvs(sv, "-");
ffc61ed2 9384 put_byte(sv, i - 1);
653099ff 9385 }
24d786f4 9386 do_sep = 1;
ffc61ed2 9387 rangestart = -1;
653099ff 9388 }
847a199f 9389 }
3339dfd8
YO
9390
9391 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9392 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9393 if (o->flags & ANYOF_CLASS)
bb7a0f54 9394 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9395 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9396 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9397 do_sep = 1;
9398 }
9399
3339dfd8
YO
9400 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9401
9402 /* output information about the unicode matching */
ffc61ed2 9403 if (flags & ANYOF_UNICODE)
396482e1 9404 sv_catpvs(sv, "{unicode}");
1aa99e6b 9405 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9406 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9407
9408 {
9409 SV *lv;
32fc9b6a 9410 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9411
ffc61ed2
JH
9412 if (lv) {
9413 if (sw) {
89ebb4a3 9414 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9415
ffc61ed2 9416 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9417 uvchr_to_utf8(s, i);
ffc61ed2 9418
3568d838 9419 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9420 if (rangestart == -1)
9421 rangestart = i;
9422 } else if (rangestart != -1) {
ffc61ed2
JH
9423 if (i <= rangestart + 3)
9424 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9425 const U8 * const e = uvchr_to_utf8(s,rangestart);
9426 U8 *p;
9427 for(p = s; p < e; p++)
ffc61ed2
JH
9428 put_byte(sv, *p);
9429 }
9430 else {
2d03de9c
AL
9431 const U8 *e = uvchr_to_utf8(s,rangestart);
9432 U8 *p;
9433 for (p = s; p < e; p++)
ffc61ed2 9434 put_byte(sv, *p);
396482e1 9435 sv_catpvs(sv, "-");
2d03de9c
AL
9436 e = uvchr_to_utf8(s, i-1);
9437 for (p = s; p < e; p++)
1df70142 9438 put_byte(sv, *p);
ffc61ed2
JH
9439 }
9440 rangestart = -1;
9441 }
19860706 9442 }
ffc61ed2 9443
396482e1 9444 sv_catpvs(sv, "..."); /* et cetera */
19860706 9445 }
fde631ed 9446
ffc61ed2 9447 {
2e0de35c 9448 char *s = savesvpv(lv);
c445ea15 9449 char * const origs = s;
b81d288d 9450
3dab1dad
YO
9451 while (*s && *s != '\n')
9452 s++;
b81d288d 9453
ffc61ed2 9454 if (*s == '\n') {
2d03de9c 9455 const char * const t = ++s;
ffc61ed2
JH
9456
9457 while (*s) {
9458 if (*s == '\n')
9459 *s = ' ';
9460 s++;
9461 }
9462 if (s[-1] == ' ')
9463 s[-1] = 0;
9464
9465 sv_catpv(sv, t);
fde631ed 9466 }
b81d288d 9467
ffc61ed2 9468 Safefree(origs);
fde631ed
JH
9469 }
9470 }
653099ff 9471 }
ffc61ed2 9472
653099ff
GS
9473 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9474 }
9b155405 9475 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9476 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9477#else
96a5add6 9478 PERL_UNUSED_CONTEXT;
65e66c80
SP
9479 PERL_UNUSED_ARG(sv);
9480 PERL_UNUSED_ARG(o);
f9049ba1 9481 PERL_UNUSED_ARG(prog);
17c3b450 9482#endif /* DEBUGGING */
35ff7856 9483}
a687059c 9484
cad2e5aa 9485SV *
288b8c02 9486Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9487{ /* Assume that RE_INTUIT is set */
97aff369 9488 dVAR;
288b8c02 9489 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9490 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9491
9492 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9493 PERL_UNUSED_CONTEXT;
9494
a3621e74 9495 DEBUG_COMPILE_r(
cfd0369c 9496 {
2d03de9c 9497 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9498 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9499
9500 if (!PL_colorset) reginitcolors();
9501 PerlIO_printf(Perl_debug_log,
a0288114 9502 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9503 PL_colors[4],
9504 prog->check_substr ? "" : "utf8 ",
9505 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9506 s,
9507 PL_colors[1],
9508 (strlen(s) > 60 ? "..." : ""));
9509 } );
9510
33b8afdf 9511 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9512}
9513
84da74a7 9514/*
f8149455 9515 pregfree()
84da74a7 9516
f8149455
YO
9517 handles refcounting and freeing the perl core regexp structure. When
9518 it is necessary to actually free the structure the first thing it
9519 does is call the 'free' method of the regexp_engine associated to to
9520 the regexp, allowing the handling of the void *pprivate; member
9521 first. (This routine is not overridable by extensions, which is why
9522 the extensions free is called first.)
9523
9524 See regdupe and regdupe_internal if you change anything here.
84da74a7 9525*/
f8149455 9526#ifndef PERL_IN_XSUB_RE
2b69d0c2 9527void
84679df5 9528Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9529{
288b8c02
NC
9530 SvREFCNT_dec(r);
9531}
9532
9533void
9534Perl_pregfree2(pTHX_ REGEXP *rx)
9535{
27da23d5 9536 dVAR;
288b8c02 9537 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9538 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9539
7918f24d
NC
9540 PERL_ARGS_ASSERT_PREGFREE2;
9541
28d8d7f4
YO
9542 if (r->mother_re) {
9543 ReREFCNT_dec(r->mother_re);
9544 } else {
288b8c02 9545 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9546 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9547 }
9548 if (r->substrs) {
ef8d46e8
VP
9549 SvREFCNT_dec(r->anchored_substr);
9550 SvREFCNT_dec(r->anchored_utf8);
9551 SvREFCNT_dec(r->float_substr);
9552 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9553 Safefree(r->substrs);
9554 }
288b8c02 9555 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9556#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9557 SvREFCNT_dec(r->saved_copy);
ed252734 9558#endif
f0ab9afb 9559 Safefree(r->offs);
f8149455 9560}
28d8d7f4
YO
9561
9562/* reg_temp_copy()
9563
9564 This is a hacky workaround to the structural issue of match results
9565 being stored in the regexp structure which is in turn stored in
9566 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9567 could be PL_curpm in multiple contexts, and could require multiple
9568 result sets being associated with the pattern simultaneously, such
9569 as when doing a recursive match with (??{$qr})
9570
9571 The solution is to make a lightweight copy of the regexp structure
9572 when a qr// is returned from the code executed by (??{$qr}) this
9573 lightweight copy doesnt actually own any of its data except for
9574 the starp/end and the actual regexp structure itself.
9575
9576*/
9577
9578
84679df5 9579REGEXP *
f0826785 9580Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9581{
f0826785 9582 struct regexp *ret;
288b8c02 9583 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9584 register const I32 npar = r->nparens+1;
7918f24d
NC
9585
9586 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9587
f0826785
BM
9588 if (!ret_x)
9589 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9590 ret = (struct regexp *)SvANY(ret_x);
9591
288b8c02 9592 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9593 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9594 by pointing directly at the buffer, but flagging that the allocated
9595 space in the copy is zero. As we've just done a struct copy, it's now
9596 a case of zero-ing that, rather than copying the current length. */
9597 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9598 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9599 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9600 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9601 SvLEN_set(ret_x, 0);
b9ad13ac 9602 SvSTASH_set(ret_x, NULL);
703c388d 9603 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9604 Newx(ret->offs, npar, regexp_paren_pair);
9605 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9606 if (r->substrs) {
28d8d7f4 9607 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9608 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9609
9610 SvREFCNT_inc_void(ret->anchored_substr);
9611 SvREFCNT_inc_void(ret->anchored_utf8);
9612 SvREFCNT_inc_void(ret->float_substr);
9613 SvREFCNT_inc_void(ret->float_utf8);
9614
9615 /* check_substr and check_utf8, if non-NULL, point to either their
9616 anchored or float namesakes, and don't hold a second reference. */
486913e4 9617 }
288b8c02 9618 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9619#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9620 ret->saved_copy = NULL;
28d8d7f4 9621#endif
288b8c02 9622 ret->mother_re = rx;
28d8d7f4 9623
288b8c02 9624 return ret_x;
28d8d7f4 9625}
f8149455
YO
9626#endif
9627
9628/* regfree_internal()
9629
9630 Free the private data in a regexp. This is overloadable by
9631 extensions. Perl takes care of the regexp structure in pregfree(),
9632 this covers the *pprivate pointer which technically perldoesnt
9633 know about, however of course we have to handle the
9634 regexp_internal structure when no extension is in use.
9635
9636 Note this is called before freeing anything in the regexp
9637 structure.
9638 */
9639
9640void
288b8c02 9641Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9642{
9643 dVAR;
288b8c02 9644 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9645 RXi_GET_DECL(r,ri);
9646 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9647
9648 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9649
f8149455
YO
9650 DEBUG_COMPILE_r({
9651 if (!PL_colorset)
9652 reginitcolors();
9653 {
9654 SV *dsv= sv_newmortal();
3c8556c3 9655 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9656 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9657 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9658 PL_colors[4],PL_colors[5],s);
9659 }
9660 });
7122b237
YO
9661#ifdef RE_TRACK_PATTERN_OFFSETS
9662 if (ri->u.offsets)
9663 Safefree(ri->u.offsets); /* 20010421 MJD */
9664#endif
f8fc2ecf
YO
9665 if (ri->data) {
9666 int n = ri->data->count;
f3548bdc
DM
9667 PAD* new_comppad = NULL;
9668 PAD* old_comppad;
4026c95a 9669 PADOFFSET refcnt;
dfad63ad 9670
c277df42 9671 while (--n >= 0) {
261faec3 9672 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9673 switch (ri->data->what[n]) {
af534a04 9674 case 'a':
c277df42 9675 case 's':
81714fb9 9676 case 'S':
55eed653 9677 case 'u':
ad64d0ec 9678 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9679 break;
653099ff 9680 case 'f':
f8fc2ecf 9681 Safefree(ri->data->data[n]);
653099ff 9682 break;
dfad63ad 9683 case 'p':
502c6561 9684 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9685 break;
c277df42 9686 case 'o':
dfad63ad 9687 if (new_comppad == NULL)
cea2e8a9 9688 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9689 PAD_SAVE_LOCAL(old_comppad,
9690 /* Watch out for global destruction's random ordering. */
c445ea15 9691 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9692 );
b34c0dd4 9693 OP_REFCNT_LOCK;
f8fc2ecf 9694 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9695 OP_REFCNT_UNLOCK;
9696 if (!refcnt)
f8fc2ecf 9697 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9698
f3548bdc 9699 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9700 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9701 new_comppad = NULL;
c277df42
IZ
9702 break;
9703 case 'n':
9e55ce06 9704 break;
07be1b83 9705 case 'T':
be8e71aa
YO
9706 { /* Aho Corasick add-on structure for a trie node.
9707 Used in stclass optimization only */
07be1b83 9708 U32 refcount;
f8fc2ecf 9709 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9710 OP_REFCNT_LOCK;
9711 refcount = --aho->refcount;
9712 OP_REFCNT_UNLOCK;
9713 if ( !refcount ) {
446bd890
NC
9714 PerlMemShared_free(aho->states);
9715 PerlMemShared_free(aho->fail);
446bd890
NC
9716 /* do this last!!!! */
9717 PerlMemShared_free(ri->data->data[n]);
9718 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9719 }
9720 }
9721 break;
a3621e74 9722 case 't':
07be1b83 9723 {
be8e71aa 9724 /* trie structure. */
07be1b83 9725 U32 refcount;
f8fc2ecf 9726 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9727 OP_REFCNT_LOCK;
9728 refcount = --trie->refcount;
9729 OP_REFCNT_UNLOCK;
9730 if ( !refcount ) {
446bd890 9731 PerlMemShared_free(trie->charmap);
446bd890
NC
9732 PerlMemShared_free(trie->states);
9733 PerlMemShared_free(trie->trans);
07be1b83 9734 if (trie->bitmap)
446bd890 9735 PerlMemShared_free(trie->bitmap);
786e8c11 9736 if (trie->jump)
446bd890 9737 PerlMemShared_free(trie->jump);
2e64971a 9738 PerlMemShared_free(trie->wordinfo);
446bd890
NC
9739 /* do this last!!!! */
9740 PerlMemShared_free(ri->data->data[n]);
a3621e74 9741 }
07be1b83
YO
9742 }
9743 break;
c277df42 9744 default:
f8fc2ecf 9745 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9746 }
9747 }
f8fc2ecf
YO
9748 Safefree(ri->data->what);
9749 Safefree(ri->data);
a0d0e21e 9750 }
28d8d7f4 9751
f8fc2ecf 9752 Safefree(ri);
a687059c 9753}
c277df42 9754
a09252eb
NC
9755#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9756#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
9757#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9758
9759/*
32cd70f6 9760 re_dup - duplicate a regexp.
84da74a7 9761
8233f606
DM
9762 This routine is expected to clone a given regexp structure. It is only
9763 compiled under USE_ITHREADS.
32cd70f6 9764
f8149455
YO
9765 After all of the core data stored in struct regexp is duplicated
9766 the regexp_engine.dupe method is used to copy any private data
9767 stored in the *pprivate pointer. This allows extensions to handle
9768 any duplication it needs to do.
9769
9770 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9771*/
a3c0e9ca 9772#if defined(USE_ITHREADS)
f8149455 9773#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9774void
9775Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9776{
84da74a7 9777 dVAR;
a86a1ca7 9778 I32 npar;
288b8c02
NC
9779 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9780 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9781
7918f24d
NC
9782 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9783
84da74a7 9784 npar = r->nparens+1;
f0ab9afb
NC
9785 Newx(ret->offs, npar, regexp_paren_pair);
9786 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9787 if(ret->swap) {
28d8d7f4 9788 /* no need to copy these */
f0ab9afb 9789 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9790 }
84da74a7 9791
6057429f 9792 if (ret->substrs) {
32cd70f6
NC
9793 /* Do it this way to avoid reading from *r after the StructCopy().
9794 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9795 cache, it doesn't matter. */
66b1de87
NC
9796 const bool anchored = r->check_substr
9797 ? r->check_substr == r->anchored_substr
9798 : r->check_utf8 == r->anchored_utf8;
785a26d5 9799 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9800 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9801
32cd70f6
NC
9802 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9803 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9804 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9805 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9806
32cd70f6
NC
9807 /* check_substr and check_utf8, if non-NULL, point to either their
9808 anchored or float namesakes, and don't hold a second reference. */
9809
9810 if (ret->check_substr) {
9811 if (anchored) {
9812 assert(r->check_utf8 == r->anchored_utf8);
9813 ret->check_substr = ret->anchored_substr;
9814 ret->check_utf8 = ret->anchored_utf8;
9815 } else {
9816 assert(r->check_substr == r->float_substr);
9817 assert(r->check_utf8 == r->float_utf8);
9818 ret->check_substr = ret->float_substr;
9819 ret->check_utf8 = ret->float_utf8;
9820 }
66b1de87
NC
9821 } else if (ret->check_utf8) {
9822 if (anchored) {
9823 ret->check_utf8 = ret->anchored_utf8;
9824 } else {
9825 ret->check_utf8 = ret->float_utf8;
9826 }
32cd70f6 9827 }
6057429f 9828 }
f8149455 9829
5daac39c 9830 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9831
6057429f 9832 if (ret->pprivate)
288b8c02 9833 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9834
288b8c02 9835 if (RX_MATCH_COPIED(dstr))
6057429f 9836 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9837 else
9838 ret->subbeg = NULL;
9839#ifdef PERL_OLD_COPY_ON_WRITE
9840 ret->saved_copy = NULL;
9841#endif
6057429f 9842
c2123ae3
NC
9843 if (ret->mother_re) {
9844 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9845 /* Our storage points directly to our mother regexp, but that's
9846 1: a buffer in a different thread
9847 2: something we no longer hold a reference on
9848 so we need to copy it locally. */
9849 /* Note we need to sue SvCUR() on our mother_re, because it, in
9850 turn, may well be pointing to its own mother_re. */
9851 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9852 SvCUR(ret->mother_re)+1));
9853 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9854 }
9855 ret->mother_re = NULL;
9856 }
6057429f 9857 ret->gofs = 0;
f8149455
YO
9858}
9859#endif /* PERL_IN_XSUB_RE */
9860
9861/*
9862 regdupe_internal()
9863
9864 This is the internal complement to regdupe() which is used to copy
9865 the structure pointed to by the *pprivate pointer in the regexp.
9866 This is the core version of the extension overridable cloning hook.
9867 The regexp structure being duplicated will be copied by perl prior
9868 to this and will be provided as the regexp *r argument, however
9869 with the /old/ structures pprivate pointer value. Thus this routine
9870 may override any copying normally done by perl.
9871
9872 It returns a pointer to the new regexp_internal structure.
9873*/
9874
9875void *
288b8c02 9876Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9877{
9878 dVAR;
288b8c02 9879 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9880 regexp_internal *reti;
9881 int len, npar;
9882 RXi_GET_DECL(r,ri);
7918f24d
NC
9883
9884 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9885
9886 npar = r->nparens+1;
7122b237 9887 len = ProgLen(ri);
f8149455 9888
45cf4570 9889 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9890 Copy(ri->program, reti->program, len+1, regnode);
9891
f8149455 9892
f8fc2ecf 9893 reti->regstclass = NULL;
bcdf7404 9894
f8fc2ecf 9895 if (ri->data) {
84da74a7 9896 struct reg_data *d;
f8fc2ecf 9897 const int count = ri->data->count;
84da74a7
YO
9898 int i;
9899
9900 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9901 char, struct reg_data);
9902 Newx(d->what, count, U8);
9903
9904 d->count = count;
9905 for (i = 0; i < count; i++) {
f8fc2ecf 9906 d->what[i] = ri->data->what[i];
84da74a7 9907 switch (d->what[i]) {
af534a04 9908 /* legal options are one of: sSfpontTua
84da74a7 9909 see also regcomp.h and pregfree() */
af534a04 9910 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 9911 case 's':
81714fb9 9912 case 'S':
0536c0a7 9913 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9914 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9915 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9916 break;
84da74a7
YO
9917 case 'f':
9918 /* This is cheating. */
9919 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9920 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9921 struct regnode_charclass_class);
f8fc2ecf 9922 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9923 break;
9924 case 'o':
bbe252da
YO
9925 /* Compiled op trees are readonly and in shared memory,
9926 and can thus be shared without duplication. */
84da74a7 9927 OP_REFCNT_LOCK;
f8fc2ecf 9928 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9929 OP_REFCNT_UNLOCK;
9930 break;
23eab42c
NC
9931 case 'T':
9932 /* Trie stclasses are readonly and can thus be shared
9933 * without duplication. We free the stclass in pregfree
9934 * when the corresponding reg_ac_data struct is freed.
9935 */
9936 reti->regstclass= ri->regstclass;
9937 /* Fall through */
84da74a7 9938 case 't':
84da74a7 9939 OP_REFCNT_LOCK;
0536c0a7 9940 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9941 OP_REFCNT_UNLOCK;
0536c0a7
NC
9942 /* Fall through */
9943 case 'n':
9944 d->data[i] = ri->data->data[i];
84da74a7 9945 break;
84da74a7 9946 default:
f8fc2ecf 9947 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9948 }
9949 }
9950
f8fc2ecf 9951 reti->data = d;
84da74a7
YO
9952 }
9953 else
f8fc2ecf 9954 reti->data = NULL;
84da74a7 9955
cde0cee5
YO
9956 reti->name_list_idx = ri->name_list_idx;
9957
7122b237
YO
9958#ifdef RE_TRACK_PATTERN_OFFSETS
9959 if (ri->u.offsets) {
9960 Newx(reti->u.offsets, 2*len+1, U32);
9961 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9962 }
9963#else
9964 SetProgLen(reti,len);
9965#endif
9966
f8149455 9967 return (void*)reti;
84da74a7 9968}
f8149455
YO
9969
9970#endif /* USE_ITHREADS */
84da74a7 9971
f8149455 9972#ifndef PERL_IN_XSUB_RE
bcdf7404 9973
c277df42
IZ
9974/*
9975 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9976 */
9977regnode *
864dbfa3 9978Perl_regnext(pTHX_ register regnode *p)
c277df42 9979{
97aff369 9980 dVAR;
c277df42
IZ
9981 register I32 offset;
9982
f8fc2ecf 9983 if (!p)
c277df42
IZ
9984 return(NULL);
9985
35db910f
KW
9986 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
9987 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
9988 }
9989
c277df42
IZ
9990 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9991 if (offset == 0)
9992 return(NULL);
9993
c277df42 9994 return(p+offset);
c277df42 9995}
76234dfb 9996#endif
c277df42 9997
01f988be 9998STATIC void
cea2e8a9 9999S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10000{
10001 va_list args;
10002 STRLEN l1 = strlen(pat1);
10003 STRLEN l2 = strlen(pat2);
10004 char buf[512];
06bf62c7 10005 SV *msv;
73d840c0 10006 const char *message;
c277df42 10007
7918f24d
NC
10008 PERL_ARGS_ASSERT_RE_CROAK2;
10009
c277df42
IZ
10010 if (l1 > 510)
10011 l1 = 510;
10012 if (l1 + l2 > 510)
10013 l2 = 510 - l1;
10014 Copy(pat1, buf, l1 , char);
10015 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10016 buf[l1 + l2] = '\n';
10017 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10018#ifdef I_STDARG
10019 /* ANSI variant takes additional second argument */
c277df42 10020 va_start(args, pat2);
8736538c
AS
10021#else
10022 va_start(args);
10023#endif
5a844595 10024 msv = vmess(buf, &args);
c277df42 10025 va_end(args);
cfd0369c 10026 message = SvPV_const(msv,l1);
c277df42
IZ
10027 if (l1 > 512)
10028 l1 = 512;
10029 Copy(message, buf, l1 , char);
197cf9b9 10030 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10031 Perl_croak(aTHX_ "%s", buf);
c277df42 10032}
a0ed51b3
LW
10033
10034/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10035
76234dfb 10036#ifndef PERL_IN_XSUB_RE
a0ed51b3 10037void
864dbfa3 10038Perl_save_re_context(pTHX)
b81d288d 10039{
97aff369 10040 dVAR;
1ade1aa1
NC
10041
10042 struct re_save_state *state;
10043
10044 SAVEVPTR(PL_curcop);
10045 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10046
10047 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10048 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10049 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10050
46ab3289 10051 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10052
a0ed51b3 10053 PL_reg_start_tmp = 0;
a0ed51b3 10054 PL_reg_start_tmpl = 0;
c445ea15 10055 PL_reg_oldsaved = NULL;
a5db57d6 10056 PL_reg_oldsavedlen = 0;
a5db57d6 10057 PL_reg_maxiter = 0;
a5db57d6 10058 PL_reg_leftiter = 0;
c445ea15 10059 PL_reg_poscache = NULL;
a5db57d6 10060 PL_reg_poscache_size = 0;
1ade1aa1
NC
10061#ifdef PERL_OLD_COPY_ON_WRITE
10062 PL_nrs = NULL;
10063#endif
ada6e8a9 10064
c445ea15
AL
10065 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10066 if (PL_curpm) {
10067 const REGEXP * const rx = PM_GETRE(PL_curpm);
10068 if (rx) {
1df70142 10069 U32 i;
07bc277f 10070 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10071 char digits[TYPE_CHARS(long)];
d9fad198 10072 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10073 GV *const *const gvp
10074 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10075
b37c2d43
AL
10076 if (gvp) {
10077 GV * const gv = *gvp;
10078 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10079 save_scalar(gv);
49f27e4b 10080 }
ada6e8a9
AMS
10081 }
10082 }
10083 }
a0ed51b3 10084}
76234dfb 10085#endif
51371543 10086
51371543 10087static void
acfe0abc 10088clear_re(pTHX_ void *r)
51371543 10089{
97aff369 10090 dVAR;
84679df5 10091 ReREFCNT_dec((REGEXP *)r);
51371543 10092}
ffbc6a93 10093
a28509cc
AL
10094#ifdef DEBUGGING
10095
10096STATIC void
10097S_put_byte(pTHX_ SV *sv, int c)
10098{
7918f24d
NC
10099 PERL_ARGS_ASSERT_PUT_BYTE;
10100
7fddd944
NC
10101 /* Our definition of isPRINT() ignores locales, so only bytes that are
10102 not part of UTF-8 are considered printable. I assume that the same
10103 holds for UTF-EBCDIC.
10104 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10105 which Wikipedia says:
10106
10107 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10108 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10109 identical, to the ASCII delete (DEL) or rubout control character.
10110 ) So the old condition can be simplified to !isPRINT(c) */
10111 if (!isPRINT(c))
a28509cc 10112 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 10113 else {
88c9ea1e 10114 const char string = c;
5e7aa789
NC
10115 if (c == '-' || c == ']' || c == '\\' || c == '^')
10116 sv_catpvs(sv, "\\");
10117 sv_catpvn(sv, &string, 1);
10118 }
a28509cc
AL
10119}
10120
786e8c11 10121
3dab1dad
YO
10122#define CLEAR_OPTSTART \
10123 if (optstart) STMT_START { \
70685ca0 10124 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10125 optstart=NULL; \
10126 } STMT_END
10127
786e8c11 10128#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10129
b5a2f8d8
NC
10130STATIC const regnode *
10131S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10132 const regnode *last, const regnode *plast,
10133 SV* sv, I32 indent, U32 depth)
a28509cc 10134{
97aff369 10135 dVAR;
786e8c11 10136 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10137 register const regnode *next;
3dab1dad 10138 const regnode *optstart= NULL;
1f1031fe 10139
f8fc2ecf 10140 RXi_GET_DECL(r,ri);
3dab1dad 10141 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10142
10143 PERL_ARGS_ASSERT_DUMPUNTIL;
10144
786e8c11
YO
10145#ifdef DEBUG_DUMPUNTIL
10146 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10147 last ? last-start : 0,plast ? plast-start : 0);
10148#endif
10149
10150 if (plast && plast < last)
10151 last= plast;
10152
10153 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10154 /* While that wasn't END last time... */
a28509cc
AL
10155 NODE_ALIGN(node);
10156 op = OP(node);
de734bd5 10157 if (op == CLOSE || op == WHILEM)
786e8c11 10158 indent--;
b5a2f8d8 10159 next = regnext((regnode *)node);
1f1031fe 10160
a28509cc 10161 /* Where, what. */
8e11feef 10162 if (OP(node) == OPTIMIZED) {
e68ec53f 10163 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10164 optstart = node;
3dab1dad 10165 else
8e11feef 10166 goto after_print;
3dab1dad
YO
10167 } else
10168 CLEAR_OPTSTART;
1f1031fe 10169
32fc9b6a 10170 regprop(r, sv, node);
a28509cc 10171 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10172 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10173
10174 if (OP(node) != OPTIMIZED) {
10175 if (next == NULL) /* Next ptr. */
10176 PerlIO_printf(Perl_debug_log, " (0)");
10177 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10178 PerlIO_printf(Perl_debug_log, " (FAIL)");
10179 else
10180 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10181 (void)PerlIO_putc(Perl_debug_log, '\n');
10182 }
10183
a28509cc
AL
10184 after_print:
10185 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10186 assert(next);
10187 {
10188 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10189 ? regnext((regnode *)next)
10190 : next);
be8e71aa
YO
10191 if (last && nnode > last)
10192 nnode = last;
786e8c11 10193 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10194 }
a28509cc
AL
10195 }
10196 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10197 assert(next);
786e8c11 10198 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10199 }
10200 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10201 const regnode *this_trie = node;
1de06328 10202 const char op = OP(node);
647f639f 10203 const U32 n = ARG(node);
1de06328 10204 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10205 (reg_ac_data *)ri->data->data[n] :
1de06328 10206 NULL;
3251b653
NC
10207 const reg_trie_data * const trie =
10208 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10209#ifdef DEBUGGING
502c6561 10210 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10211#endif
786e8c11 10212 const regnode *nextbranch= NULL;
a28509cc 10213 I32 word_idx;
76f68e9b 10214 sv_setpvs(sv, "");
786e8c11 10215 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10216 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10217
10218 PerlIO_printf(Perl_debug_log, "%*s%s ",
10219 (int)(2*(indent+3)), "",
10220 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10221 PL_colors[0], PL_colors[1],
10222 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10223 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10224 PERL_PV_PRETTY_LTGT
786e8c11
YO
10225 )
10226 : "???"
10227 );
10228 if (trie->jump) {
40d049e4 10229 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10230 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10231 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10232 if (dist) {
10233 if (!nextbranch)
24b23f37 10234 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10235 DUMPUNTIL(this_trie + dist, nextbranch);
10236 }
786e8c11
YO
10237 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10238 nextbranch= regnext((regnode *)nextbranch);
10239 } else {
10240 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10241 }
786e8c11
YO
10242 }
10243 if (last && next > last)
10244 node= last;
10245 else
10246 node= next;
a28509cc 10247 }
786e8c11
YO
10248 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10249 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10250 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10251 }
10252 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10253 assert(next);
786e8c11 10254 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10255 }
10256 else if ( op == PLUS || op == STAR) {
786e8c11 10257 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10258 }
10259 else if (op == ANYOF) {
10260 /* arglen 1 + class block */
10261 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10262 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10263 node = NEXTOPER(node);
10264 }
10265 else if (PL_regkind[(U8)op] == EXACT) {
10266 /* Literal string, where present. */
10267 node += NODE_SZ_STR(node) - 1;
10268 node = NEXTOPER(node);
10269 }
10270 else {
10271 node = NEXTOPER(node);
10272 node += regarglen[(U8)op];
10273 }
10274 if (op == CURLYX || op == OPEN)
786e8c11 10275 indent++;
a28509cc 10276 }
3dab1dad 10277 CLEAR_OPTSTART;
786e8c11 10278#ifdef DEBUG_DUMPUNTIL
70685ca0 10279 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10280#endif
1de06328 10281 return node;
a28509cc
AL
10282}
10283
10284#endif /* DEBUGGING */
10285
241d1a3b
NC
10286/*
10287 * Local variables:
10288 * c-indentation-style: bsd
10289 * c-basic-offset: 4
10290 * indent-tabs-mode: t
10291 * End:
10292 *
37442d52
RGS
10293 * ex: set ts=8 sts=4 sw=4 noet:
10294 */