This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge B::HV::MAX and B::HV::KES into the common accessor.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear for at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 character must match before the string we are searching.
260 Likewise when combined with minlenp and the length of the string
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. Ifset to I32 max it indicates that the
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
7122b237 609#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* Can match anything (initialization) */
716STATIC int
5f66b61c 717S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
718{
719 int value;
720
7918f24d
NC
721 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
722
aaa51d5e 723 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
724 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
725 return 1;
1aa99e6b
IH
726 if (!(cl->flags & ANYOF_UNICODE_ALL))
727 return 0;
10edeb5d 728 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 729 return 0;
653099ff
GS
730 return 1;
731}
732
733/* Can match anything (initialization) */
734STATIC void
097eb12c 735S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 736{
7918f24d
NC
737 PERL_ARGS_ASSERT_CL_INIT;
738
8ecf7187 739 Zero(cl, 1, struct regnode_charclass_class);
653099ff 740 cl->type = ANYOF;
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742}
743
744STATIC void
097eb12c 745S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT_ZERO;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752 if (LOC)
753 cl->flags |= ANYOF_LOCALE;
754}
755
756/* 'And' a given class with another one. Can create false positives */
757/* We assume that cl is not inverted */
758STATIC void
5f66b61c 759S_cl_and(struct regnode_charclass_class *cl,
a28509cc 760 const struct regnode_charclass_class *and_with)
653099ff 761{
7918f24d 762 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
763
764 assert(and_with->type == ANYOF);
653099ff
GS
765 if (!(and_with->flags & ANYOF_CLASS)
766 && !(cl->flags & ANYOF_CLASS)
767 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
768 && !(and_with->flags & ANYOF_FOLD)
769 && !(cl->flags & ANYOF_FOLD)) {
770 int i;
771
772 if (and_with->flags & ANYOF_INVERT)
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] &= ~and_with->bitmap[i];
775 else
776 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
777 cl->bitmap[i] &= and_with->bitmap[i];
778 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
779 if (!(and_with->flags & ANYOF_EOS))
780 cl->flags &= ~ANYOF_EOS;
1aa99e6b 781
14ebb1a2
JH
782 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
783 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
784 cl->flags &= ~ANYOF_UNICODE_ALL;
785 cl->flags |= ANYOF_UNICODE;
786 ARG_SET(cl, ARG(and_with));
787 }
14ebb1a2
JH
788 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
789 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 790 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
791 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
792 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 793 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
794}
795
796/* 'OR' a given class with another one. Can create false positives */
797/* We assume that cl is not inverted */
798STATIC void
097eb12c 799S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 800{
7918f24d
NC
801 PERL_ARGS_ASSERT_CL_OR;
802
653099ff
GS
803 if (or_with->flags & ANYOF_INVERT) {
804 /* We do not use
805 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
806 * <= (B1 | !B2) | (CL1 | !CL2)
807 * which is wasteful if CL2 is small, but we ignore CL2:
808 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
809 * XXXX Can we handle case-fold? Unclear:
810 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
811 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
812 */
813 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(or_with->flags & ANYOF_FOLD)
815 && !(cl->flags & ANYOF_FOLD) ) {
816 int i;
817
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] |= ~or_with->bitmap[i];
820 } /* XXXX: logic is complicated otherwise */
821 else {
830247a4 822 cl_anything(pRExC_state, cl);
653099ff
GS
823 }
824 } else {
825 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
826 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 827 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
828 || (cl->flags & ANYOF_FOLD)) ) {
829 int i;
830
831 /* OR char bitmap and class bitmap separately */
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] |= or_with->bitmap[i];
834 if (or_with->flags & ANYOF_CLASS) {
835 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
836 cl->classflags[i] |= or_with->classflags[i];
837 cl->flags |= ANYOF_CLASS;
838 }
839 }
840 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 841 cl_anything(pRExC_state, cl);
653099ff
GS
842 }
843 }
844 if (or_with->flags & ANYOF_EOS)
845 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
846
847 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
848 ARG(cl) != ARG(or_with)) {
849 cl->flags |= ANYOF_UNICODE_ALL;
850 cl->flags &= ~ANYOF_UNICODE;
851 }
852 if (or_with->flags & ANYOF_UNICODE_ALL) {
853 cl->flags |= ANYOF_UNICODE_ALL;
854 cl->flags &= ~ANYOF_UNICODE;
855 }
653099ff
GS
856}
857
a3621e74
YO
858#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
859#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
860#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
861#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
862
3dab1dad
YO
863
864#ifdef DEBUGGING
07be1b83 865/*
2b8b4781
NC
866 dump_trie(trie,widecharmap,revcharmap)
867 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
868 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
869
870 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
871 The _interim_ variants are used for debugging the interim
872 tables that are used to generate the final compressed
873 representation which is what dump_trie expects.
874
3dab1dad
YO
875 Part of the reason for their existance is to provide a form
876 of documentation as to how the different representations function.
07be1b83
YO
877
878*/
3dab1dad
YO
879
880/*
3dab1dad
YO
881 Dumps the final compressed table form of the trie to Perl_debug_log.
882 Used for debugging make_trie().
883*/
b9a59e08 884
3dab1dad 885STATIC void
2b8b4781
NC
886S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
887 AV *revcharmap, U32 depth)
3dab1dad
YO
888{
889 U32 state;
ab3bbdeb 890 SV *sv=sv_newmortal();
55eed653 891 int colwidth= widecharmap ? 6 : 4;
2e64971a 892 U16 word;
3dab1dad
YO
893 GET_RE_DEBUG_FLAGS_DECL;
894
7918f24d 895 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 896
3dab1dad
YO
897 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
898 (int)depth * 2 + 2,"",
899 "Match","Base","Ofs" );
900
901 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 902 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 903 if ( tmp ) {
ab3bbdeb
YO
904 PerlIO_printf( Perl_debug_log, "%*s",
905 colwidth,
ddc5bc0f 906 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
907 PL_colors[0], PL_colors[1],
908 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
909 PERL_PV_ESCAPE_FIRSTCHAR
910 )
911 );
3dab1dad
YO
912 }
913 }
914 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
915 (int)depth * 2 + 2,"");
916
917 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
919 PerlIO_printf( Perl_debug_log, "\n");
920
1e2e3d02 921 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 922 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
923
924 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
925
926 if ( trie->states[ state ].wordnum ) {
927 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
928 } else {
929 PerlIO_printf( Perl_debug_log, "%6s", "" );
930 }
931
932 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
933
934 if ( base ) {
935 U32 ofs = 0;
936
937 while( ( base + ofs < trie->uniquecharcount ) ||
938 ( base + ofs - trie->uniquecharcount < trie->lasttrans
939 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
940 ofs++;
941
942 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
943
944 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
945 if ( ( base + ofs >= trie->uniquecharcount ) &&
946 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
947 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
948 {
ab3bbdeb
YO
949 PerlIO_printf( Perl_debug_log, "%*"UVXf,
950 colwidth,
3dab1dad
YO
951 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
952 } else {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
954 }
955 }
956
957 PerlIO_printf( Perl_debug_log, "]");
958
959 }
960 PerlIO_printf( Perl_debug_log, "\n" );
961 }
2e64971a
DM
962 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
963 for (word=1; word <= trie->wordcount; word++) {
964 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
965 (int)word, (int)(trie->wordinfo[word].prev),
966 (int)(trie->wordinfo[word].len));
967 }
968 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
969}
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in list form.
972 List tries normally only are used for construction when the number of
973 possible chars (trie->uniquecharcount) is very high.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
ab3bbdeb 982 SV *sv=sv_newmortal();
55eed653 983 int colwidth= widecharmap ? 6 : 4;
3dab1dad 984 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
985
986 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
987
3dab1dad 988 /* print out the table precompression. */
ab3bbdeb
YO
989 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
990 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
991 "------:-----+-----------------\n" );
3dab1dad
YO
992
993 for( state=1 ; state < next_alloc ; state ++ ) {
994 U16 charid;
995
ab3bbdeb 996 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
997 (int)depth * 2 + 2,"", (UV)state );
998 if ( ! trie->states[ state ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, "%5s| ","");
1000 } else {
1001 PerlIO_printf( Perl_debug_log, "W%4x| ",
1002 trie->states[ state ].wordnum
1003 );
1004 }
1005 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1006 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1007 if ( tmp ) {
1008 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1009 colwidth,
ddc5bc0f 1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1014 ) ,
1e2e3d02
YO
1015 TRIE_LIST_ITEM(state,charid).forid,
1016 (UV)TRIE_LIST_ITEM(state,charid).newstate
1017 );
1018 if (!(charid % 10))
664e119d
RGS
1019 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1020 (int)((depth * 2) + 14), "");
1e2e3d02 1021 }
ab3bbdeb
YO
1022 }
1023 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1024 }
1025}
1026
1027/*
3dab1dad
YO
1028 Dumps a fully constructed but uncompressed trie in table form.
1029 This is the normal DFA style state transition table, with a few
1030 twists to facilitate compression later.
1031 Used for debugging make_trie().
1032*/
1033STATIC void
55eed653 1034S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1035 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1036 U32 depth)
3dab1dad
YO
1037{
1038 U32 state;
1039 U16 charid;
ab3bbdeb 1040 SV *sv=sv_newmortal();
55eed653 1041 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1042 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1043
1044 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1045
1046 /*
1047 print out the table precompression so that we can do a visual check
1048 that they are identical.
1049 */
1050
1051 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1052
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1054 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1055 if ( tmp ) {
ab3bbdeb
YO
1056 PerlIO_printf( Perl_debug_log, "%*s",
1057 colwidth,
ddc5bc0f 1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1062 )
1063 );
3dab1dad
YO
1064 }
1065 }
1066
1067 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1068
1069 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1071 }
1072
1073 PerlIO_printf( Perl_debug_log, "\n" );
1074
1075 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1076
1077 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1078 (int)depth * 2 + 2,"",
1079 (UV)TRIE_NODENUM( state ) );
1080
1081 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1082 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1083 if (v)
1084 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1085 else
1086 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1087 }
1088 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1090 } else {
1091 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1092 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1093 }
1094 }
07be1b83 1095}
3dab1dad
YO
1096
1097#endif
1098
2e64971a 1099
786e8c11
YO
1100/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1101 startbranch: the first branch in the whole branch sequence
1102 first : start branch of sequence of branch-exact nodes.
1103 May be the same as startbranch
1104 last : Thing following the last branch.
1105 May be the same as tail.
1106 tail : item following the branch sequence
1107 count : words in the sequence
1108 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1109 depth : indent depth
3dab1dad 1110
786e8c11 1111Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1112
786e8c11
YO
1113A trie is an N'ary tree where the branches are determined by digital
1114decomposition of the key. IE, at the root node you look up the 1st character and
1115follow that branch repeat until you find the end of the branches. Nodes can be
1116marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1117
786e8c11 1118 /he|she|his|hers/
72f13be8 1119
786e8c11
YO
1120would convert into the following structure. Numbers represent states, letters
1121following numbers represent valid transitions on the letter from that state, if
1122the number is in square brackets it represents an accepting state, otherwise it
1123will be in parenthesis.
07be1b83 1124
786e8c11
YO
1125 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1126 | |
1127 | (2)
1128 | |
1129 (1) +-i->(6)-+-s->[7]
1130 |
1131 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1132
786e8c11
YO
1133 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1134
1135This shows that when matching against the string 'hers' we will begin at state 1
1136read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1137then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1138is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1139single traverse. We store a mapping from accepting to state to which word was
1140matched, and then when we have multiple possibilities we try to complete the
1141rest of the regex in the order in which they occured in the alternation.
1142
1143The only prior NFA like behaviour that would be changed by the TRIE support is
1144the silent ignoring of duplicate alternations which are of the form:
1145
1146 / (DUPE|DUPE) X? (?{ ... }) Y /x
1147
1148Thus EVAL blocks follwing a trie may be called a different number of times with
1149and without the optimisation. With the optimisations dupes will be silently
1150ignored. This inconsistant behaviour of EVAL type nodes is well established as
1151the following demonstrates:
1152
1153 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1154
1155which prints out 'word' three times, but
1156
1157 'words'=~/(word|word|word)(?{ print $1 })S/
1158
1159which doesnt print it out at all. This is due to other optimisations kicking in.
1160
1161Example of what happens on a structural level:
1162
1163The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1164
1165 1: CURLYM[1] {1,32767}(18)
1166 5: BRANCH(8)
1167 6: EXACT <ac>(16)
1168 8: BRANCH(11)
1169 9: EXACT <ad>(16)
1170 11: BRANCH(14)
1171 12: EXACT <ab>(16)
1172 16: SUCCEED(0)
1173 17: NOTHING(18)
1174 18: END(0)
1175
1176This would be optimizable with startbranch=5, first=5, last=16, tail=16
1177and should turn into:
1178
1179 1: CURLYM[1] {1,32767}(18)
1180 5: TRIE(16)
1181 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1182 <ac>
1183 <ad>
1184 <ab>
1185 16: SUCCEED(0)
1186 17: NOTHING(18)
1187 18: END(0)
1188
1189Cases where tail != last would be like /(?foo|bar)baz/:
1190
1191 1: BRANCH(4)
1192 2: EXACT <foo>(8)
1193 4: BRANCH(7)
1194 5: EXACT <bar>(8)
1195 7: TAIL(8)
1196 8: EXACT <baz>(10)
1197 10: END(0)
1198
1199which would be optimizable with startbranch=1, first=1, last=7, tail=8
1200and would end up looking like:
1201
1202 1: TRIE(8)
1203 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1204 <foo>
1205 <bar>
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210 d = uvuni_to_utf8_flags(d, uv, 0);
1211
1212is the recommended Unicode-aware way of saying
1213
1214 *(d++) = uv;
1215*/
1216
1e2e3d02 1217#define TRIE_STORE_REVCHAR \
786e8c11 1218 STMT_START { \
73031816
NC
1219 if (UTF) { \
1220 SV *zlopp = newSV(2); \
88c9ea1e
CB
1221 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1222 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1223 SvCUR_set(zlopp, kapow - flrbbbbb); \
1224 SvPOK_on(zlopp); \
1225 SvUTF8_on(zlopp); \
1226 av_push(revcharmap, zlopp); \
1227 } else { \
6bdeddd2 1228 char ooooff = (char)uvc; \
73031816
NC
1229 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1230 } \
1231 } STMT_END
786e8c11
YO
1232
1233#define TRIE_READ_CHAR STMT_START { \
1234 wordlen++; \
1235 if ( UTF ) { \
1236 if ( folder ) { \
1237 if ( foldlen > 0 ) { \
1238 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1239 foldlen -= len; \
1240 scan += len; \
1241 len = 0; \
1242 } else { \
1243 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1244 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1245 foldlen -= UNISKIP( uvc ); \
1246 scan = foldbuf + UNISKIP( uvc ); \
1247 } \
1248 } else { \
1249 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1250 } \
1251 } else { \
1252 uvc = (U32)*uc; \
1253 len = 1; \
1254 } \
1255} STMT_END
1256
1257
1258
1259#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1260 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1261 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1262 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1263 } \
1264 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1265 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1266 TRIE_LIST_CUR( state )++; \
1267} STMT_END
07be1b83 1268
786e8c11
YO
1269#define TRIE_LIST_NEW(state) STMT_START { \
1270 Newxz( trie->states[ state ].trans.list, \
1271 4, reg_trie_trans_le ); \
1272 TRIE_LIST_CUR( state ) = 1; \
1273 TRIE_LIST_LEN( state ) = 4; \
1274} STMT_END
07be1b83 1275
786e8c11
YO
1276#define TRIE_HANDLE_WORD(state) STMT_START { \
1277 U16 dupe= trie->states[ state ].wordnum; \
1278 regnode * const noper_next = regnext( noper ); \
1279 \
786e8c11
YO
1280 DEBUG_r({ \
1281 /* store the word for dumping */ \
1282 SV* tmp; \
1283 if (OP(noper) != NOTHING) \
740cce10 1284 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1285 else \
740cce10 1286 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1287 av_push( trie_words, tmp ); \
786e8c11
YO
1288 }); \
1289 \
1290 curword++; \
2e64971a
DM
1291 trie->wordinfo[curword].prev = 0; \
1292 trie->wordinfo[curword].len = wordlen; \
1293 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1294 \
1295 if ( noper_next < tail ) { \
1296 if (!trie->jump) \
c944940b 1297 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1298 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1299 if (!jumper) \
1300 jumper = noper_next; \
1301 if (!nextbranch) \
1302 nextbranch= regnext(cur); \
1303 } \
1304 \
1305 if ( dupe ) { \
2e64971a
DM
1306 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1307 /* chain, so that when the bits of chain are later */\
1308 /* linked together, the dups appear in the chain */\
1309 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1310 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1311 } else { \
1312 /* we haven't inserted this word yet. */ \
1313 trie->states[ state ].wordnum = curword; \
1314 } \
1315} STMT_END
07be1b83 1316
3dab1dad 1317
786e8c11
YO
1318#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1319 ( ( base + charid >= ucharcount \
1320 && base + charid < ubound \
1321 && state == trie->trans[ base - ucharcount + charid ].check \
1322 && trie->trans[ base - ucharcount + charid ].next ) \
1323 ? trie->trans[ base - ucharcount + charid ].next \
1324 : ( state==1 ? special : 0 ) \
1325 )
3dab1dad 1326
786e8c11
YO
1327#define MADE_TRIE 1
1328#define MADE_JUMP_TRIE 2
1329#define MADE_EXACT_TRIE 4
3dab1dad 1330
a3621e74 1331STATIC I32
786e8c11 1332S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1333{
27da23d5 1334 dVAR;
a3621e74
YO
1335 /* first pass, loop through and scan words */
1336 reg_trie_data *trie;
55eed653 1337 HV *widecharmap = NULL;
2b8b4781 1338 AV *revcharmap = newAV();
a3621e74 1339 regnode *cur;
9f7f3913 1340 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1341 STRLEN len = 0;
1342 UV uvc = 0;
1343 U16 curword = 0;
1344 U32 next_alloc = 0;
786e8c11
YO
1345 regnode *jumper = NULL;
1346 regnode *nextbranch = NULL;
7f69552c 1347 regnode *convert = NULL;
2e64971a 1348 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1349 /* we just use folder as a flag in utf8 */
e1ec3a88 1350 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1351 ? PL_fold
1352 : ( flags == EXACTFL
1353 ? PL_fold_locale
1354 : NULL
1355 )
1356 );
1357
2b8b4781
NC
1358#ifdef DEBUGGING
1359 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1360 AV *trie_words = NULL;
1361 /* along with revcharmap, this only used during construction but both are
1362 * useful during debugging so we store them in the struct when debugging.
8e11feef 1363 */
2b8b4781
NC
1364#else
1365 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1366 STRLEN trie_charcount=0;
3dab1dad 1367#endif
2b8b4781 1368 SV *re_trie_maxbuff;
a3621e74 1369 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1370
1371 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1372#ifndef DEBUGGING
1373 PERL_UNUSED_ARG(depth);
1374#endif
a3621e74 1375
c944940b 1376 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1377 trie->refcount = 1;
3dab1dad 1378 trie->startstate = 1;
786e8c11 1379 trie->wordcount = word_count;
f8fc2ecf 1380 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1381 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1382 if (!(UTF && folder))
c944940b 1383 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1384 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1385 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1386
a3621e74 1387 DEBUG_r({
2b8b4781 1388 trie_words = newAV();
a3621e74 1389 });
a3621e74 1390
0111c4fd 1391 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1392 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1393 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1394 }
3dab1dad
YO
1395 DEBUG_OPTIMISE_r({
1396 PerlIO_printf( Perl_debug_log,
786e8c11 1397 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1398 (int)depth * 2 + 2, "",
1399 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1400 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1401 (int)depth);
3dab1dad 1402 });
7f69552c
YO
1403
1404 /* Find the node we are going to overwrite */
1405 if ( first == startbranch && OP( last ) != BRANCH ) {
1406 /* whole branch chain */
1407 convert = first;
1408 } else {
1409 /* branch sub-chain */
1410 convert = NEXTOPER( first );
1411 }
1412
a3621e74
YO
1413 /* -- First loop and Setup --
1414
1415 We first traverse the branches and scan each word to determine if it
1416 contains widechars, and how many unique chars there are, this is
1417 important as we have to build a table with at least as many columns as we
1418 have unique chars.
1419
1420 We use an array of integers to represent the character codes 0..255
38a44b82 1421 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1422 native representation of the character value as the key and IV's for the
1423 coded index.
1424
1425 *TODO* If we keep track of how many times each character is used we can
1426 remap the columns so that the table compression later on is more
1427 efficient in terms of memory by ensuring most common value is in the
1428 middle and the least common are on the outside. IMO this would be better
1429 than a most to least common mapping as theres a decent chance the most
1430 common letter will share a node with the least common, meaning the node
1431 will not be compressable. With a middle is most common approach the worst
1432 case is when we have the least common nodes twice.
1433
1434 */
1435
a3621e74 1436 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1437 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1438 const U8 *uc = (U8*)STRING( noper );
a28509cc 1439 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1440 STRLEN foldlen = 0;
1441 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1442 const U8 *scan = (U8*)NULL;
07be1b83 1443 U32 wordlen = 0; /* required init */
02daf0ab
YO
1444 STRLEN chars = 0;
1445 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1446
3dab1dad
YO
1447 if (OP(noper) == NOTHING) {
1448 trie->minlen= 0;
1449 continue;
1450 }
02daf0ab
YO
1451 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1452 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1453 regardless of encoding */
1454
a3621e74 1455 for ( ; uc < e ; uc += len ) {
3dab1dad 1456 TRIE_CHARCOUNT(trie)++;
a3621e74 1457 TRIE_READ_CHAR;
3dab1dad 1458 chars++;
a3621e74
YO
1459 if ( uvc < 256 ) {
1460 if ( !trie->charmap[ uvc ] ) {
1461 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1462 if ( folder )
1463 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1464 TRIE_STORE_REVCHAR;
a3621e74 1465 }
02daf0ab
YO
1466 if ( set_bit ) {
1467 /* store the codepoint in the bitmap, and if its ascii
1468 also store its folded equivelent. */
1469 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1470
1471 /* store the folded codepoint */
1472 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1473
1474 if ( !UTF ) {
1475 /* store first byte of utf8 representation of
1476 codepoints in the 127 < uvc < 256 range */
1477 if (127 < uvc && uvc < 192) {
1478 TRIE_BITMAP_SET(trie,194);
1479 } else if (191 < uvc ) {
1480 TRIE_BITMAP_SET(trie,195);
1481 /* && uvc < 256 -- we know uvc is < 256 already */
1482 }
1483 }
02daf0ab
YO
1484 set_bit = 0; /* We've done our bit :-) */
1485 }
a3621e74
YO
1486 } else {
1487 SV** svpp;
55eed653
NC
1488 if ( !widecharmap )
1489 widecharmap = newHV();
a3621e74 1490
55eed653 1491 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1492
1493 if ( !svpp )
e4584336 1494 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1495
1496 if ( !SvTRUE( *svpp ) ) {
1497 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1498 TRIE_STORE_REVCHAR;
a3621e74
YO
1499 }
1500 }
1501 }
3dab1dad
YO
1502 if( cur == first ) {
1503 trie->minlen=chars;
1504 trie->maxlen=chars;
1505 } else if (chars < trie->minlen) {
1506 trie->minlen=chars;
1507 } else if (chars > trie->maxlen) {
1508 trie->maxlen=chars;
1509 }
1510
a3621e74
YO
1511 } /* end first pass */
1512 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1513 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1514 (int)depth * 2 + 2,"",
55eed653 1515 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1516 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1517 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1518 );
a3621e74
YO
1519
1520 /*
1521 We now know what we are dealing with in terms of unique chars and
1522 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1523 representation using a flat table will take. If it's over a reasonable
1524 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1525 conservative but potentially much slower representation using an array
1526 of lists.
1527
1528 At the end we convert both representations into the same compressed
1529 form that will be used in regexec.c for matching with. The latter
1530 is a form that cannot be used to construct with but has memory
1531 properties similar to the list form and access properties similar
1532 to the table form making it both suitable for fast searches and
1533 small enough that its feasable to store for the duration of a program.
1534
1535 See the comment in the code where the compressed table is produced
1536 inplace from the flat tabe representation for an explanation of how
1537 the compression works.
1538
1539 */
1540
1541
2e64971a
DM
1542 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1543 prev_states[1] = 0;
1544
3dab1dad 1545 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1546 /*
1547 Second Pass -- Array Of Lists Representation
1548
1549 Each state will be represented by a list of charid:state records
1550 (reg_trie_trans_le) the first such element holds the CUR and LEN
1551 points of the allocated array. (See defines above).
1552
1553 We build the initial structure using the lists, and then convert
1554 it into the compressed table form which allows faster lookups
1555 (but cant be modified once converted).
a3621e74
YO
1556 */
1557
a3621e74
YO
1558 STRLEN transcount = 1;
1559
1e2e3d02
YO
1560 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1561 "%*sCompiling trie using list compiler\n",
1562 (int)depth * 2 + 2, ""));
446bd890 1563
c944940b
JH
1564 trie->states = (reg_trie_state *)
1565 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1566 sizeof(reg_trie_state) );
a3621e74
YO
1567 TRIE_LIST_NEW(1);
1568 next_alloc = 2;
1569
1570 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1571
c445ea15
AL
1572 regnode * const noper = NEXTOPER( cur );
1573 U8 *uc = (U8*)STRING( noper );
1574 const U8 * const e = uc + STR_LEN( noper );
1575 U32 state = 1; /* required init */
1576 U16 charid = 0; /* sanity init */
1577 U8 *scan = (U8*)NULL; /* sanity init */
1578 STRLEN foldlen = 0; /* required init */
07be1b83 1579 U32 wordlen = 0; /* required init */
c445ea15
AL
1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1581
3dab1dad 1582 if (OP(noper) != NOTHING) {
786e8c11 1583 for ( ; uc < e ; uc += len ) {
c445ea15 1584
786e8c11 1585 TRIE_READ_CHAR;
c445ea15 1586
786e8c11
YO
1587 if ( uvc < 256 ) {
1588 charid = trie->charmap[ uvc ];
c445ea15 1589 } else {
55eed653 1590 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1591 if ( !svpp ) {
1592 charid = 0;
1593 } else {
1594 charid=(U16)SvIV( *svpp );
1595 }
c445ea15 1596 }
786e8c11
YO
1597 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1598 if ( charid ) {
a3621e74 1599
786e8c11
YO
1600 U16 check;
1601 U32 newstate = 0;
a3621e74 1602
786e8c11
YO
1603 charid--;
1604 if ( !trie->states[ state ].trans.list ) {
1605 TRIE_LIST_NEW( state );
c445ea15 1606 }
786e8c11
YO
1607 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1608 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1609 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1610 break;
1611 }
1612 }
1613 if ( ! newstate ) {
1614 newstate = next_alloc++;
2e64971a 1615 prev_states[newstate] = state;
786e8c11
YO
1616 TRIE_LIST_PUSH( state, charid, newstate );
1617 transcount++;
1618 }
1619 state = newstate;
1620 } else {
1621 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1622 }
a28509cc 1623 }
c445ea15 1624 }
3dab1dad 1625 TRIE_HANDLE_WORD(state);
a3621e74
YO
1626
1627 } /* end second pass */
1628
1e2e3d02
YO
1629 /* next alloc is the NEXT state to be allocated */
1630 trie->statecount = next_alloc;
c944940b
JH
1631 trie->states = (reg_trie_state *)
1632 PerlMemShared_realloc( trie->states,
1633 next_alloc
1634 * sizeof(reg_trie_state) );
a3621e74 1635
3dab1dad 1636 /* and now dump it out before we compress it */
2b8b4781
NC
1637 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1638 revcharmap, next_alloc,
1639 depth+1)
1e2e3d02 1640 );
a3621e74 1641
c944940b
JH
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1644 {
1645 U32 state;
a3621e74
YO
1646 U32 tp = 0;
1647 U32 zp = 0;
1648
1649
1650 for( state=1 ; state < next_alloc ; state ++ ) {
1651 U32 base=0;
1652
1653 /*
1654 DEBUG_TRIE_COMPILE_MORE_r(
1655 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1656 );
1657 */
1658
1659 if (trie->states[state].trans.list) {
1660 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1661 U16 maxid=minid;
a28509cc 1662 U16 idx;
a3621e74
YO
1663
1664 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1665 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1666 if ( forid < minid ) {
1667 minid=forid;
1668 } else if ( forid > maxid ) {
1669 maxid=forid;
1670 }
a3621e74
YO
1671 }
1672 if ( transcount < tp + maxid - minid + 1) {
1673 transcount *= 2;
c944940b
JH
1674 trie->trans = (reg_trie_trans *)
1675 PerlMemShared_realloc( trie->trans,
446bd890
NC
1676 transcount
1677 * sizeof(reg_trie_trans) );
a3621e74
YO
1678 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1679 }
1680 base = trie->uniquecharcount + tp - minid;
1681 if ( maxid == minid ) {
1682 U32 set = 0;
1683 for ( ; zp < tp ; zp++ ) {
1684 if ( ! trie->trans[ zp ].next ) {
1685 base = trie->uniquecharcount + zp - minid;
1686 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1687 trie->trans[ zp ].check = state;
1688 set = 1;
1689 break;
1690 }
1691 }
1692 if ( !set ) {
1693 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694 trie->trans[ tp ].check = state;
1695 tp++;
1696 zp = tp;
1697 }
1698 } else {
1699 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1700 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1701 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1702 trie->trans[ tid ].check = state;
1703 }
1704 tp += ( maxid - minid + 1 );
1705 }
1706 Safefree(trie->states[ state ].trans.list);
1707 }
1708 /*
1709 DEBUG_TRIE_COMPILE_MORE_r(
1710 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1711 );
1712 */
1713 trie->states[ state ].trans.base=base;
1714 }
cc601c31 1715 trie->lasttrans = tp + 1;
a3621e74
YO
1716 }
1717 } else {
1718 /*
1719 Second Pass -- Flat Table Representation.
1720
1721 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1722 We know that we will need Charcount+1 trans at most to store the data
1723 (one row per char at worst case) So we preallocate both structures
1724 assuming worst case.
1725
1726 We then construct the trie using only the .next slots of the entry
1727 structs.
1728
1729 We use the .check field of the first entry of the node temporarily to
1730 make compression both faster and easier by keeping track of how many non
1731 zero fields are in the node.
1732
1733 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1734 transition.
1735
1736 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1737 number representing the first entry of the node, and state as a
1738 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1739 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1740 are 2 entrys per node. eg:
1741
1742 A B A B
1743 1. 2 4 1. 3 7
1744 2. 0 3 3. 0 5
1745 3. 0 0 5. 0 0
1746 4. 0 0 7. 0 0
1747
1748 The table is internally in the right hand, idx form. However as we also
1749 have to deal with the states array which is indexed by nodenum we have to
1750 use TRIE_NODENUM() to convert.
1751
1752 */
1e2e3d02
YO
1753 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1754 "%*sCompiling trie using table compiler\n",
1755 (int)depth * 2 + 2, ""));
3dab1dad 1756
c944940b
JH
1757 trie->trans = (reg_trie_trans *)
1758 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1759 * trie->uniquecharcount + 1,
1760 sizeof(reg_trie_trans) );
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
a3621e74
YO
1764 next_alloc = trie->uniquecharcount + 1;
1765
3dab1dad 1766
a3621e74
YO
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768
c445ea15 1769 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1770 const U8 *uc = (U8*)STRING( noper );
1771 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1772
1773 U32 state = 1; /* required init */
1774
1775 U16 charid = 0; /* sanity init */
1776 U32 accept_state = 0; /* sanity init */
1777 U8 *scan = (U8*)NULL; /* sanity init */
1778
1779 STRLEN foldlen = 0; /* required init */
07be1b83 1780 U32 wordlen = 0; /* required init */
a3621e74
YO
1781 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1782
3dab1dad 1783 if ( OP(noper) != NOTHING ) {
786e8c11 1784 for ( ; uc < e ; uc += len ) {
a3621e74 1785
786e8c11 1786 TRIE_READ_CHAR;
a3621e74 1787
786e8c11
YO
1788 if ( uvc < 256 ) {
1789 charid = trie->charmap[ uvc ];
1790 } else {
55eed653 1791 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1792 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1793 }
786e8c11
YO
1794 if ( charid ) {
1795 charid--;
1796 if ( !trie->trans[ state + charid ].next ) {
1797 trie->trans[ state + charid ].next = next_alloc;
1798 trie->trans[ state ].check++;
2e64971a
DM
1799 prev_states[TRIE_NODENUM(next_alloc)]
1800 = TRIE_NODENUM(state);
786e8c11
YO
1801 next_alloc += trie->uniquecharcount;
1802 }
1803 state = trie->trans[ state + charid ].next;
1804 } else {
1805 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1806 }
1807 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1808 }
a3621e74 1809 }
3dab1dad
YO
1810 accept_state = TRIE_NODENUM( state );
1811 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1812
1813 } /* end second pass */
1814
3dab1dad 1815 /* and now dump it out before we compress it */
2b8b4781
NC
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1817 revcharmap,
1818 next_alloc, depth+1));
a3621e74 1819
a3621e74
YO
1820 {
1821 /*
1822 * Inplace compress the table.*
1823
1824 For sparse data sets the table constructed by the trie algorithm will
1825 be mostly 0/FAIL transitions or to put it another way mostly empty.
1826 (Note that leaf nodes will not contain any transitions.)
1827
1828 This algorithm compresses the tables by eliminating most such
1829 transitions, at the cost of a modest bit of extra work during lookup:
1830
1831 - Each states[] entry contains a .base field which indicates the
1832 index in the state[] array wheres its transition data is stored.
1833
1834 - If .base is 0 there are no valid transitions from that node.
1835
1836 - If .base is nonzero then charid is added to it to find an entry in
1837 the trans array.
1838
1839 -If trans[states[state].base+charid].check!=state then the
1840 transition is taken to be a 0/Fail transition. Thus if there are fail
1841 transitions at the front of the node then the .base offset will point
1842 somewhere inside the previous nodes data (or maybe even into a node
1843 even earlier), but the .check field determines if the transition is
1844 valid.
1845
786e8c11 1846 XXX - wrong maybe?
a3621e74
YO
1847 The following process inplace converts the table to the compressed
1848 table: We first do not compress the root node 1,and mark its all its
1849 .check pointers as 1 and set its .base pointer as 1 as well. This
1850 allows to do a DFA construction from the compressed table later, and
1851 ensures that any .base pointers we calculate later are greater than
1852 0.
1853
1854 - We set 'pos' to indicate the first entry of the second node.
1855
1856 - We then iterate over the columns of the node, finding the first and
1857 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1858 and set the .check pointers accordingly, and advance pos
1859 appropriately and repreat for the next node. Note that when we copy
1860 the next pointers we have to convert them from the original
1861 NODEIDX form to NODENUM form as the former is not valid post
1862 compression.
1863
1864 - If a node has no transitions used we mark its base as 0 and do not
1865 advance the pos pointer.
1866
1867 - If a node only has one transition we use a second pointer into the
1868 structure to fill in allocated fail transitions from other states.
1869 This pointer is independent of the main pointer and scans forward
1870 looking for null transitions that are allocated to a state. When it
1871 finds one it writes the single transition into the "hole". If the
786e8c11 1872 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1873
1874 - Once compressed we can Renew/realloc the structures to release the
1875 excess space.
1876
1877 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1878 specifically Fig 3.47 and the associated pseudocode.
1879
1880 demq
1881 */
a3b680e6 1882 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1883 U32 state, charid;
a3621e74 1884 U32 pos = 0, zp=0;
1e2e3d02 1885 trie->statecount = laststate;
a3621e74
YO
1886
1887 for ( state = 1 ; state < laststate ; state++ ) {
1888 U8 flag = 0;
a28509cc
AL
1889 const U32 stateidx = TRIE_NODEIDX( state );
1890 const U32 o_used = trie->trans[ stateidx ].check;
1891 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1892 trie->trans[ stateidx ].check = 0;
1893
1894 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1895 if ( flag || trie->trans[ stateidx + charid ].next ) {
1896 if ( trie->trans[ stateidx + charid ].next ) {
1897 if (o_used == 1) {
1898 for ( ; zp < pos ; zp++ ) {
1899 if ( ! trie->trans[ zp ].next ) {
1900 break;
1901 }
1902 }
1903 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1904 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905 trie->trans[ zp ].check = state;
1906 if ( ++zp > pos ) pos = zp;
1907 break;
1908 }
1909 used--;
1910 }
1911 if ( !flag ) {
1912 flag = 1;
1913 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1914 }
1915 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ pos ].check = state;
1917 pos++;
1918 }
1919 }
1920 }
cc601c31 1921 trie->lasttrans = pos + 1;
c944940b
JH
1922 trie->states = (reg_trie_state *)
1923 PerlMemShared_realloc( trie->states, laststate
1924 * sizeof(reg_trie_state) );
a3621e74 1925 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1926 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1927 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1928 (int)depth * 2 + 2,"",
1929 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1930 (IV)next_alloc,
1931 (IV)pos,
a3621e74
YO
1932 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1933 );
1934
1935 } /* end table compress */
1936 }
1e2e3d02
YO
1937 DEBUG_TRIE_COMPILE_MORE_r(
1938 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1939 (int)depth * 2 + 2, "",
1940 (UV)trie->statecount,
1941 (UV)trie->lasttrans)
1942 );
cc601c31 1943 /* resize the trans array to remove unused space */
c944940b
JH
1944 trie->trans = (reg_trie_trans *)
1945 PerlMemShared_realloc( trie->trans, trie->lasttrans
1946 * sizeof(reg_trie_trans) );
a3621e74 1947
3dab1dad 1948 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1949 U8 nodetype =(U8)(flags & 0xFF);
1950 char *str=NULL;
786e8c11 1951
07be1b83 1952#ifdef DEBUGGING
e62cc96a 1953 regnode *optimize = NULL;
7122b237
YO
1954#ifdef RE_TRACK_PATTERN_OFFSETS
1955
b57a0404
JH
1956 U32 mjd_offset = 0;
1957 U32 mjd_nodelen = 0;
7122b237
YO
1958#endif /* RE_TRACK_PATTERN_OFFSETS */
1959#endif /* DEBUGGING */
a3621e74 1960 /*
3dab1dad
YO
1961 This means we convert either the first branch or the first Exact,
1962 depending on whether the thing following (in 'last') is a branch
1963 or not and whther first is the startbranch (ie is it a sub part of
1964 the alternation or is it the whole thing.)
1965 Assuming its a sub part we conver the EXACT otherwise we convert
1966 the whole branch sequence, including the first.
a3621e74 1967 */
3dab1dad 1968 /* Find the node we are going to overwrite */
7f69552c 1969 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1970 /* branch sub-chain */
3dab1dad 1971 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1972#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1973 DEBUG_r({
1974 mjd_offset= Node_Offset((convert));
1975 mjd_nodelen= Node_Length((convert));
1976 });
7122b237 1977#endif
7f69552c 1978 /* whole branch chain */
7122b237
YO
1979 }
1980#ifdef RE_TRACK_PATTERN_OFFSETS
1981 else {
7f69552c
YO
1982 DEBUG_r({
1983 const regnode *nop = NEXTOPER( convert );
1984 mjd_offset= Node_Offset((nop));
1985 mjd_nodelen= Node_Length((nop));
1986 });
07be1b83
YO
1987 }
1988 DEBUG_OPTIMISE_r(
1989 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1990 (int)depth * 2 + 2, "",
786e8c11 1991 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1992 );
7122b237 1993#endif
3dab1dad
YO
1994 /* But first we check to see if there is a common prefix we can
1995 split out as an EXACT and put in front of the TRIE node. */
1996 trie->startstate= 1;
55eed653 1997 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1998 U32 state;
1e2e3d02 1999 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2000 U32 ofs = 0;
8e11feef
RGS
2001 I32 idx = -1;
2002 U32 count = 0;
2003 const U32 base = trie->states[ state ].trans.base;
a3621e74 2004
3dab1dad 2005 if ( trie->states[state].wordnum )
8e11feef 2006 count = 1;
a3621e74 2007
8e11feef 2008 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2009 if ( ( base + ofs >= trie->uniquecharcount ) &&
2010 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2011 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2012 {
3dab1dad 2013 if ( ++count > 1 ) {
2b8b4781 2014 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2015 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2016 if ( state == 1 ) break;
3dab1dad
YO
2017 if ( count == 2 ) {
2018 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2019 DEBUG_OPTIMISE_r(
8e11feef
RGS
2020 PerlIO_printf(Perl_debug_log,
2021 "%*sNew Start State=%"UVuf" Class: [",
2022 (int)depth * 2 + 2, "",
786e8c11 2023 (UV)state));
be8e71aa 2024 if (idx >= 0) {
2b8b4781 2025 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2026 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2027
3dab1dad 2028 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2029 if ( folder )
2030 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2031 DEBUG_OPTIMISE_r(
f1f66076 2032 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2033 );
8e11feef
RGS
2034 }
2035 }
2036 TRIE_BITMAP_SET(trie,*ch);
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2040 }
2041 idx = ofs;
2042 }
3dab1dad
YO
2043 }
2044 if ( count == 1 ) {
2b8b4781 2045 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2046 STRLEN len;
2047 char *ch = SvPV( *tmp, len );
de734bd5
A
2048 DEBUG_OPTIMISE_r({
2049 SV *sv=sv_newmortal();
8e11feef
RGS
2050 PerlIO_printf( Perl_debug_log,
2051 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2052 (int)depth * 2 + 2, "",
de734bd5
A
2053 (UV)state, (UV)idx,
2054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2055 PL_colors[0], PL_colors[1],
2056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2057 PERL_PV_ESCAPE_FIRSTCHAR
2058 )
2059 );
2060 });
3dab1dad
YO
2061 if ( state==1 ) {
2062 OP( convert ) = nodetype;
2063 str=STRING(convert);
2064 STR_LEN(convert)=0;
2065 }
c490c714
YO
2066 STR_LEN(convert) += len;
2067 while (len--)
de734bd5 2068 *str++ = *ch++;
8e11feef 2069 } else {
f9049ba1 2070#ifdef DEBUGGING
8e11feef
RGS
2071 if (state>1)
2072 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2073#endif
8e11feef
RGS
2074 break;
2075 }
2076 }
2e64971a 2077 trie->prefixlen = (state-1);
3dab1dad 2078 if (str) {
8e11feef 2079 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2080 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2081 trie->startstate = state;
07be1b83
YO
2082 trie->minlen -= (state - 1);
2083 trie->maxlen -= (state - 1);
33809eae
JH
2084#ifdef DEBUGGING
2085 /* At least the UNICOS C compiler choked on this
2086 * being argument to DEBUG_r(), so let's just have
2087 * it right here. */
2088 if (
2089#ifdef PERL_EXT_RE_BUILD
2090 1
2091#else
2092 DEBUG_r_TEST
2093#endif
2094 ) {
2095 regnode *fix = convert;
2096 U32 word = trie->wordcount;
2097 mjd_nodelen++;
2098 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2099 while( ++fix < n ) {
2100 Set_Node_Offset_Length(fix, 0, 0);
2101 }
2102 while (word--) {
2103 SV ** const tmp = av_fetch( trie_words, word, 0 );
2104 if (tmp) {
2105 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2106 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2107 else
2108 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2109 }
2110 }
2111 }
2112#endif
8e11feef
RGS
2113 if (trie->maxlen) {
2114 convert = n;
2115 } else {
3dab1dad 2116 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2117 DEBUG_r(optimize= n);
3dab1dad
YO
2118 }
2119 }
2120 }
a5ca303d
YO
2121 if (!jumper)
2122 jumper = last;
3dab1dad 2123 if ( trie->maxlen ) {
8e11feef
RGS
2124 NEXT_OFF( convert ) = (U16)(tail - convert);
2125 ARG_SET( convert, data_slot );
786e8c11
YO
2126 /* Store the offset to the first unabsorbed branch in
2127 jump[0], which is otherwise unused by the jump logic.
2128 We use this when dumping a trie and during optimisation. */
2129 if (trie->jump)
7f69552c 2130 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2131
786e8c11
YO
2132 /* XXXX */
2133 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2134 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2135 {
2136 OP( convert ) = TRIEC;
2137 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2138 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2139 trie->bitmap= NULL;
2140 } else
2141 OP( convert ) = TRIE;
a3621e74 2142
3dab1dad
YO
2143 /* store the type in the flags */
2144 convert->flags = nodetype;
a5ca303d
YO
2145 DEBUG_r({
2146 optimize = convert
2147 + NODE_STEP_REGNODE
2148 + regarglen[ OP( convert ) ];
2149 });
2150 /* XXX We really should free up the resource in trie now,
2151 as we won't use them - (which resources?) dmq */
3dab1dad 2152 }
a3621e74 2153 /* needed for dumping*/
e62cc96a 2154 DEBUG_r(if (optimize) {
07be1b83 2155 regnode *opt = convert;
bcdf7404 2156
e62cc96a 2157 while ( ++opt < optimize) {
07be1b83
YO
2158 Set_Node_Offset_Length(opt,0,0);
2159 }
786e8c11
YO
2160 /*
2161 Try to clean up some of the debris left after the
2162 optimisation.
a3621e74 2163 */
786e8c11 2164 while( optimize < jumper ) {
07be1b83 2165 mjd_nodelen += Node_Length((optimize));
a3621e74 2166 OP( optimize ) = OPTIMIZED;
07be1b83 2167 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2168 optimize++;
2169 }
07be1b83 2170 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2171 });
2172 } /* end node insert */
2e64971a
DM
2173
2174 /* Finish populating the prev field of the wordinfo array. Walk back
2175 * from each accept state until we find another accept state, and if
2176 * so, point the first word's .prev field at the second word. If the
2177 * second already has a .prev field set, stop now. This will be the
2178 * case either if we've already processed that word's accept state,
2179 * or that that state had multiple words, and the overspill words
2180 * were already linked up earlier.
2181 */
2182 {
2183 U16 word;
2184 U32 state;
2185 U16 prev;
2186
2187 for (word=1; word <= trie->wordcount; word++) {
2188 prev = 0;
2189 if (trie->wordinfo[word].prev)
2190 continue;
2191 state = trie->wordinfo[word].accept;
2192 while (state) {
2193 state = prev_states[state];
2194 if (!state)
2195 break;
2196 prev = trie->states[state].wordnum;
2197 if (prev)
2198 break;
2199 }
2200 trie->wordinfo[word].prev = prev;
2201 }
2202 Safefree(prev_states);
2203 }
2204
2205
2206 /* and now dump out the compressed format */
2207 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2208
55eed653 2209 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2210#ifdef DEBUGGING
2211 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2212 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2213#else
2214 SvREFCNT_dec(revcharmap);
07be1b83 2215#endif
786e8c11
YO
2216 return trie->jump
2217 ? MADE_JUMP_TRIE
2218 : trie->startstate>1
2219 ? MADE_EXACT_TRIE
2220 : MADE_TRIE;
2221}
2222
2223STATIC void
2224S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2225{
2226/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2227
2228 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2229 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2230 ISBN 0-201-10088-6
2231
2232 We find the fail state for each state in the trie, this state is the longest proper
2233 suffix of the current states 'word' that is also a proper prefix of another word in our
2234 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2235 the DFA not to have to restart after its tried and failed a word at a given point, it
2236 simply continues as though it had been matching the other word in the first place.
2237 Consider
2238 'abcdgu'=~/abcdefg|cdgu/
2239 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2240 fail, which would bring use to the state representing 'd' in the second word where we would
2241 try 'g' and succeed, prodceding to match 'cdgu'.
2242 */
2243 /* add a fail transition */
3251b653
NC
2244 const U32 trie_offset = ARG(source);
2245 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2246 U32 *q;
2247 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2248 const U32 numstates = trie->statecount;
786e8c11
YO
2249 const U32 ubound = trie->lasttrans + ucharcount;
2250 U32 q_read = 0;
2251 U32 q_write = 0;
2252 U32 charid;
2253 U32 base = trie->states[ 1 ].trans.base;
2254 U32 *fail;
2255 reg_ac_data *aho;
2256 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2257 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2258
2259 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2260#ifndef DEBUGGING
2261 PERL_UNUSED_ARG(depth);
2262#endif
2263
2264
2265 ARG_SET( stclass, data_slot );
c944940b 2266 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2267 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2268 aho->trie=trie_offset;
446bd890
NC
2269 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2270 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2271 Newxz( q, numstates, U32);
c944940b 2272 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2273 aho->refcount = 1;
2274 fail = aho->fail;
2275 /* initialize fail[0..1] to be 1 so that we always have
2276 a valid final fail state */
2277 fail[ 0 ] = fail[ 1 ] = 1;
2278
2279 for ( charid = 0; charid < ucharcount ; charid++ ) {
2280 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2281 if ( newstate ) {
2282 q[ q_write ] = newstate;
2283 /* set to point at the root */
2284 fail[ q[ q_write++ ] ]=1;
2285 }
2286 }
2287 while ( q_read < q_write) {
2288 const U32 cur = q[ q_read++ % numstates ];
2289 base = trie->states[ cur ].trans.base;
2290
2291 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2292 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2293 if (ch_state) {
2294 U32 fail_state = cur;
2295 U32 fail_base;
2296 do {
2297 fail_state = fail[ fail_state ];
2298 fail_base = aho->states[ fail_state ].trans.base;
2299 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2300
2301 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2302 fail[ ch_state ] = fail_state;
2303 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2304 {
2305 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2306 }
2307 q[ q_write++ % numstates] = ch_state;
2308 }
2309 }
2310 }
2311 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2312 when we fail in state 1, this allows us to use the
2313 charclass scan to find a valid start char. This is based on the principle
2314 that theres a good chance the string being searched contains lots of stuff
2315 that cant be a start char.
2316 */
2317 fail[ 0 ] = fail[ 1 ] = 0;
2318 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2319 PerlIO_printf(Perl_debug_log,
2320 "%*sStclass Failtable (%"UVuf" states): 0",
2321 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2322 );
786e8c11
YO
2323 for( q_read=1; q_read<numstates; q_read++ ) {
2324 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2325 }
2326 PerlIO_printf(Perl_debug_log, "\n");
2327 });
2328 Safefree(q);
2329 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2330}
2331
786e8c11 2332
a3621e74 2333/*
5d1c421c
JH
2334 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2335 * These need to be revisited when a newer toolchain becomes available.
2336 */
2337#if defined(__sparc64__) && defined(__GNUC__)
2338# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2339# undef SPARC64_GCC_WORKAROUND
2340# define SPARC64_GCC_WORKAROUND 1
2341# endif
2342#endif
2343
07be1b83 2344#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2345 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2346 SV * const mysv=sv_newmortal(); \
2347 regnode *Next = regnext(scan); \
2348 regprop(RExC_rx, mysv, scan); \
7f69552c 2349 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2350 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2351 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2352 }});
07be1b83 2353
1de06328
YO
2354
2355
2356
2357
07be1b83
YO
2358#define JOIN_EXACT(scan,min,flags) \
2359 if (PL_regkind[OP(scan)] == EXACT) \
2360 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2361
be8e71aa 2362STATIC U32
07be1b83
YO
2363S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2364 /* Merge several consecutive EXACTish nodes into one. */
2365 regnode *n = regnext(scan);
2366 U32 stringok = 1;
2367 regnode *next = scan + NODE_SZ_STR(scan);
2368 U32 merged = 0;
2369 U32 stopnow = 0;
2370#ifdef DEBUGGING
2371 regnode *stop = scan;
72f13be8 2372 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2373#else
d47053eb
RGS
2374 PERL_UNUSED_ARG(depth);
2375#endif
7918f24d
NC
2376
2377 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2378#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2379 PERL_UNUSED_ARG(flags);
2380 PERL_UNUSED_ARG(val);
07be1b83 2381#endif
07be1b83
YO
2382 DEBUG_PEEP("join",scan,depth);
2383
2384 /* Skip NOTHING, merge EXACT*. */
2385 while (n &&
2386 ( PL_regkind[OP(n)] == NOTHING ||
2387 (stringok && (OP(n) == OP(scan))))
2388 && NEXT_OFF(n)
2389 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2390
2391 if (OP(n) == TAIL || n > next)
2392 stringok = 0;
2393 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2394 DEBUG_PEEP("skip:",n,depth);
2395 NEXT_OFF(scan) += NEXT_OFF(n);
2396 next = n + NODE_STEP_REGNODE;
2397#ifdef DEBUGGING
2398 if (stringok)
2399 stop = n;
2400#endif
2401 n = regnext(n);
2402 }
2403 else if (stringok) {
786e8c11 2404 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2405 regnode * const nnext = regnext(n);
2406
2407 DEBUG_PEEP("merg",n,depth);
2408
2409 merged++;
2410 if (oldl + STR_LEN(n) > U8_MAX)
2411 break;
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 STR_LEN(scan) += STR_LEN(n);
2414 next = n + NODE_SZ_STR(n);
2415 /* Now we can overwrite *n : */
2416 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2417#ifdef DEBUGGING
2418 stop = next - 1;
2419#endif
2420 n = nnext;
2421 if (stopnow) break;
2422 }
2423
d47053eb
RGS
2424#ifdef EXPERIMENTAL_INPLACESCAN
2425 if (flags && !NEXT_OFF(n)) {
2426 DEBUG_PEEP("atch", val, depth);
2427 if (reg_off_by_arg[OP(n)]) {
2428 ARG_SET(n, val - n);
2429 }
2430 else {
2431 NEXT_OFF(n) = val - n;
2432 }
2433 stopnow = 1;
2434 }
07be1b83
YO
2435#endif
2436 }
2437
2438 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2439 /*
2440 Two problematic code points in Unicode casefolding of EXACT nodes:
2441
2442 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2443 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2444
2445 which casefold to
2446
2447 Unicode UTF-8
2448
2449 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2450 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2451
2452 This means that in case-insensitive matching (or "loose matching",
2453 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2454 length of the above casefolded versions) can match a target string
2455 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2456 This would rather mess up the minimum length computation.
2457
2458 What we'll do is to look for the tail four bytes, and then peek
2459 at the preceding two bytes to see whether we need to decrease
2460 the minimum length by four (six minus two).
2461
2462 Thanks to the design of UTF-8, there cannot be false matches:
2463 A sequence of valid UTF-8 bytes cannot be a subsequence of
2464 another valid sequence of UTF-8 bytes.
2465
2466 */
2467 char * const s0 = STRING(scan), *s, *t;
2468 char * const s1 = s0 + STR_LEN(scan) - 1;
2469 char * const s2 = s1 - 4;
e294cc5d
JH
2470#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2471 const char t0[] = "\xaf\x49\xaf\x42";
2472#else
07be1b83 2473 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2474#endif
07be1b83
YO
2475 const char * const t1 = t0 + 3;
2476
2477 for (s = s0 + 2;
2478 s < s2 && (t = ninstr(s, s1, t0, t1));
2479 s = t + 4) {
e294cc5d
JH
2480#ifdef EBCDIC
2481 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2482 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2483#else
07be1b83
YO
2484 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2485 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2486#endif
07be1b83
YO
2487 *min -= 4;
2488 }
2489 }
2490
2491#ifdef DEBUGGING
2492 /* Allow dumping */
2493 n = scan + NODE_SZ_STR(scan);
2494 while (n <= stop) {
2495 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2496 OP(n) = OPTIMIZED;
2497 NEXT_OFF(n) = 0;
2498 }
2499 n++;
2500 }
2501#endif
2502 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2503 return stopnow;
2504}
2505
653099ff
GS
2506/* REx optimizer. Converts nodes into quickier variants "in place".
2507 Finds fixed substrings. */
2508
a0288114 2509/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2510 to the position after last scanned or to NULL. */
2511
40d049e4
YO
2512#define INIT_AND_WITHP \
2513 assert(!and_withp); \
2514 Newx(and_withp,1,struct regnode_charclass_class); \
2515 SAVEFREEPV(and_withp)
07be1b83 2516
b515a41d
YO
2517/* this is a chain of data about sub patterns we are processing that
2518 need to be handled seperately/specially in study_chunk. Its so
2519 we can simulate recursion without losing state. */
2520struct scan_frame;
2521typedef struct scan_frame {
2522 regnode *last; /* last node to process in this frame */
2523 regnode *next; /* next node to process when last is reached */
2524 struct scan_frame *prev; /*previous frame*/
2525 I32 stop; /* what stopparen do we use */
2526} scan_frame;
2527
304ee84b
YO
2528
2529#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2530
e1d1eefb
YO
2531#define CASE_SYNST_FNC(nAmE) \
2532case nAmE: \
2533 if (flags & SCF_DO_STCLASS_AND) { \
2534 for (value = 0; value < 256; value++) \
2535 if (!is_ ## nAmE ## _cp(value)) \
2536 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2537 } \
2538 else { \
2539 for (value = 0; value < 256; value++) \
2540 if (is_ ## nAmE ## _cp(value)) \
2541 ANYOF_BITMAP_SET(data->start_class, value); \
2542 } \
2543 break; \
2544case N ## nAmE: \
2545 if (flags & SCF_DO_STCLASS_AND) { \
2546 for (value = 0; value < 256; value++) \
2547 if (is_ ## nAmE ## _cp(value)) \
2548 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2549 } \
2550 else { \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_SET(data->start_class, value); \
2554 } \
2555 break
2556
2557
2558
76e3520e 2559STATIC I32
40d049e4 2560S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2561 I32 *minlenp, I32 *deltap,
40d049e4
YO
2562 regnode *last,
2563 scan_data_t *data,
2564 I32 stopparen,
2565 U8* recursed,
2566 struct regnode_charclass_class *and_withp,
2567 U32 flags, U32 depth)
c277df42
IZ
2568 /* scanp: Start here (read-write). */
2569 /* deltap: Write maxlen-minlen here. */
2570 /* last: Stop before this one. */
40d049e4
YO
2571 /* data: string data about the pattern */
2572 /* stopparen: treat close N as END */
2573 /* recursed: which subroutines have we recursed into */
2574 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2575{
97aff369 2576 dVAR;
c277df42
IZ
2577 I32 min = 0, pars = 0, code;
2578 regnode *scan = *scanp, *next;
2579 I32 delta = 0;
2580 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2581 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2582 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2583 scan_data_t data_fake;
a3621e74 2584 SV *re_trie_maxbuff = NULL;
786e8c11 2585 regnode *first_non_open = scan;
e2e6a0f1 2586 I32 stopmin = I32_MAX;
8aa23a47 2587 scan_frame *frame = NULL;
a3621e74 2588 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2589
7918f24d
NC
2590 PERL_ARGS_ASSERT_STUDY_CHUNK;
2591
13a24bad 2592#ifdef DEBUGGING
40d049e4 2593 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2594#endif
40d049e4 2595
786e8c11 2596 if ( depth == 0 ) {
40d049e4 2597 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2598 first_non_open=regnext(first_non_open);
2599 }
2600
b81d288d 2601
8aa23a47
YO
2602 fake_study_recurse:
2603 while ( scan && OP(scan) != END && scan < last ){
2604 /* Peephole optimizer: */
304ee84b 2605 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2606 DEBUG_PEEP("Peep",scan,depth);
2607 JOIN_EXACT(scan,&min,0);
2608
2609 /* Follow the next-chain of the current node and optimize
2610 away all the NOTHINGs from it. */
2611 if (OP(scan) != CURLYX) {
2612 const int max = (reg_off_by_arg[OP(scan)]
2613 ? I32_MAX
2614 /* I32 may be smaller than U16 on CRAYs! */
2615 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2616 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2617 int noff;
2618 regnode *n = scan;
2619
2620 /* Skip NOTHING and LONGJMP. */
2621 while ((n = regnext(n))
2622 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2623 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2624 && off + noff < max)
2625 off += noff;
2626 if (reg_off_by_arg[OP(scan)])
2627 ARG(scan) = off;
2628 else
2629 NEXT_OFF(scan) = off;
2630 }
a3621e74 2631
c277df42 2632
8aa23a47
YO
2633
2634 /* The principal pseudo-switch. Cannot be a switch, since we
2635 look into several different things. */
2636 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2637 || OP(scan) == IFTHEN) {
2638 next = regnext(scan);
2639 code = OP(scan);
2640 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2641
2642 if (OP(next) == code || code == IFTHEN) {
2643 /* NOTE - There is similar code to this block below for handling
2644 TRIE nodes on a re-study. If you change stuff here check there
2645 too. */
2646 I32 max1 = 0, min1 = I32_MAX, num = 0;
2647 struct regnode_charclass_class accum;
2648 regnode * const startbranch=scan;
2649
2650 if (flags & SCF_DO_SUBSTR)
304ee84b 2651 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2652 if (flags & SCF_DO_STCLASS)
2653 cl_init_zero(pRExC_state, &accum);
2654
2655 while (OP(scan) == code) {
2656 I32 deltanext, minnext, f = 0, fake;
2657 struct regnode_charclass_class this_class;
2658
2659 num++;
2660 data_fake.flags = 0;
2661 if (data) {
2662 data_fake.whilem_c = data->whilem_c;
2663 data_fake.last_closep = data->last_closep;
2664 }
2665 else
2666 data_fake.last_closep = &fake;
58e23c8d
YO
2667
2668 data_fake.pos_delta = delta;
8aa23a47
YO
2669 next = regnext(scan);
2670 scan = NEXTOPER(scan);
2671 if (code != BRANCH)
c277df42 2672 scan = NEXTOPER(scan);
8aa23a47
YO
2673 if (flags & SCF_DO_STCLASS) {
2674 cl_init(pRExC_state, &this_class);
2675 data_fake.start_class = &this_class;
2676 f = SCF_DO_STCLASS_AND;
58e23c8d 2677 }
8aa23a47
YO
2678 if (flags & SCF_WHILEM_VISITED_POS)
2679 f |= SCF_WHILEM_VISITED_POS;
2680
2681 /* we suppose the run is continuous, last=next...*/
2682 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2683 next, &data_fake,
2684 stopparen, recursed, NULL, f,depth+1);
2685 if (min1 > minnext)
2686 min1 = minnext;
2687 if (max1 < minnext + deltanext)
2688 max1 = minnext + deltanext;
2689 if (deltanext == I32_MAX)
2690 is_inf = is_inf_internal = 1;
2691 scan = next;
2692 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2693 pars++;
2694 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2695 if ( stopmin > minnext)
2696 stopmin = min + min1;
2697 flags &= ~SCF_DO_SUBSTR;
2698 if (data)
2699 data->flags |= SCF_SEEN_ACCEPT;
2700 }
2701 if (data) {
2702 if (data_fake.flags & SF_HAS_EVAL)
2703 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
3dab1dad 2705 }
8aa23a47
YO
2706 if (flags & SCF_DO_STCLASS)
2707 cl_or(pRExC_state, &accum, &this_class);
2708 }
2709 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2710 min1 = 0;
2711 if (flags & SCF_DO_SUBSTR) {
2712 data->pos_min += min1;
2713 data->pos_delta += max1 - min1;
2714 if (max1 != min1 || is_inf)
2715 data->longest = &(data->longest_float);
2716 }
2717 min += min1;
2718 delta += max1 - min1;
2719 if (flags & SCF_DO_STCLASS_OR) {
2720 cl_or(pRExC_state, data->start_class, &accum);
2721 if (min1) {
2722 cl_and(data->start_class, and_withp);
2723 flags &= ~SCF_DO_STCLASS;
653099ff 2724 }
8aa23a47
YO
2725 }
2726 else if (flags & SCF_DO_STCLASS_AND) {
2727 if (min1) {
2728 cl_and(data->start_class, &accum);
2729 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2730 }
8aa23a47
YO
2731 else {
2732 /* Switch to OR mode: cache the old value of
2733 * data->start_class */
2734 INIT_AND_WITHP;
2735 StructCopy(data->start_class, and_withp,
2736 struct regnode_charclass_class);
2737 flags &= ~SCF_DO_STCLASS_AND;
2738 StructCopy(&accum, data->start_class,
2739 struct regnode_charclass_class);
2740 flags |= SCF_DO_STCLASS_OR;
2741 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2742 }
8aa23a47 2743 }
a3621e74 2744
8aa23a47
YO
2745 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2746 /* demq.
a3621e74 2747
8aa23a47
YO
2748 Assuming this was/is a branch we are dealing with: 'scan' now
2749 points at the item that follows the branch sequence, whatever
2750 it is. We now start at the beginning of the sequence and look
2751 for subsequences of
a3621e74 2752
8aa23a47
YO
2753 BRANCH->EXACT=>x1
2754 BRANCH->EXACT=>x2
2755 tail
a3621e74 2756
8aa23a47 2757 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2758
8aa23a47
YO
2759 If we can find such a subseqence we need to turn the first
2760 element into a trie and then add the subsequent branch exact
2761 strings to the trie.
a3621e74 2762
8aa23a47 2763 We have two cases
a3621e74 2764
8aa23a47 2765 1. patterns where the whole set of branch can be converted.
a3621e74 2766
8aa23a47 2767 2. patterns where only a subset can be converted.
a3621e74 2768
8aa23a47
YO
2769 In case 1 we can replace the whole set with a single regop
2770 for the trie. In case 2 we need to keep the start and end
2771 branchs so
a3621e74 2772
8aa23a47
YO
2773 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2774 becomes BRANCH TRIE; BRANCH X;
786e8c11 2775
8aa23a47
YO
2776 There is an additional case, that being where there is a
2777 common prefix, which gets split out into an EXACT like node
2778 preceding the TRIE node.
a3621e74 2779
8aa23a47
YO
2780 If x(1..n)==tail then we can do a simple trie, if not we make
2781 a "jump" trie, such that when we match the appropriate word
2782 we "jump" to the appopriate tail node. Essentailly we turn
2783 a nested if into a case structure of sorts.
b515a41d 2784
8aa23a47
YO
2785 */
2786
2787 int made=0;
2788 if (!re_trie_maxbuff) {
2789 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2790 if (!SvIOK(re_trie_maxbuff))
2791 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2792 }
2793 if ( SvIV(re_trie_maxbuff)>=0 ) {
2794 regnode *cur;
2795 regnode *first = (regnode *)NULL;
2796 regnode *last = (regnode *)NULL;
2797 regnode *tail = scan;
2798 U8 optype = 0;
2799 U32 count=0;
a3621e74
YO
2800
2801#ifdef DEBUGGING
8aa23a47 2802 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2803#endif
8aa23a47
YO
2804 /* var tail is used because there may be a TAIL
2805 regop in the way. Ie, the exacts will point to the
2806 thing following the TAIL, but the last branch will
2807 point at the TAIL. So we advance tail. If we
2808 have nested (?:) we may have to move through several
2809 tails.
2810 */
2811
2812 while ( OP( tail ) == TAIL ) {
2813 /* this is the TAIL generated by (?:) */
2814 tail = regnext( tail );
2815 }
a3621e74 2816
8aa23a47
YO
2817
2818 DEBUG_OPTIMISE_r({
2819 regprop(RExC_rx, mysv, tail );
2820 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2821 (int)depth * 2 + 2, "",
2822 "Looking for TRIE'able sequences. Tail node is: ",
2823 SvPV_nolen_const( mysv )
2824 );
2825 });
2826
2827 /*
2828
2829 step through the branches, cur represents each
2830 branch, noper is the first thing to be matched
2831 as part of that branch and noper_next is the
2832 regnext() of that node. if noper is an EXACT
2833 and noper_next is the same as scan (our current
2834 position in the regex) then the EXACT branch is
2835 a possible optimization target. Once we have
2836 two or more consequetive such branches we can
2837 create a trie of the EXACT's contents and stich
2838 it in place. If the sequence represents all of
2839 the branches we eliminate the whole thing and
2840 replace it with a single TRIE. If it is a
2841 subsequence then we need to stitch it in. This
2842 means the first branch has to remain, and needs
2843 to be repointed at the item on the branch chain
2844 following the last branch optimized. This could
2845 be either a BRANCH, in which case the
2846 subsequence is internal, or it could be the
2847 item following the branch sequence in which
2848 case the subsequence is at the end.
2849
2850 */
2851
2852 /* dont use tail as the end marker for this traverse */
2853 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2854 regnode * const noper = NEXTOPER( cur );
b515a41d 2855#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2856 regnode * const noper_next = regnext( noper );
b515a41d
YO
2857#endif
2858
8aa23a47
YO
2859 DEBUG_OPTIMISE_r({
2860 regprop(RExC_rx, mysv, cur);
2861 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2862 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2863
2864 regprop(RExC_rx, mysv, noper);
2865 PerlIO_printf( Perl_debug_log, " -> %s",
2866 SvPV_nolen_const(mysv));
2867
2868 if ( noper_next ) {
2869 regprop(RExC_rx, mysv, noper_next );
2870 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2871 SvPV_nolen_const(mysv));
2872 }
2873 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2874 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2875 });
2876 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2877 : PL_regkind[ OP( noper ) ] == EXACT )
2878 || OP(noper) == NOTHING )
786e8c11 2879#ifdef NOJUMPTRIE
8aa23a47 2880 && noper_next == tail
786e8c11 2881#endif
8aa23a47
YO
2882 && count < U16_MAX)
2883 {
2884 count++;
2885 if ( !first || optype == NOTHING ) {
2886 if (!first) first = cur;
2887 optype = OP( noper );
2888 } else {
2889 last = cur;
2890 }
2891 } else {
a0a388a1 2892/*
0abd0d78
YO
2893 Currently we do not believe that the trie logic can
2894 handle case insensitive matching properly when the
2895 pattern is not unicode (thus forcing unicode semantics).
2896
2897 If/when this is fixed the following define can be swapped
2898 in below to fully enable trie logic.
2899
a0a388a1 2900#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2901
2902*/
2903#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2904
a0a388a1 2905 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2906 make_trie( pRExC_state,
2907 startbranch, first, cur, tail, count,
2908 optype, depth+1 );
2909 }
2910 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2911#ifdef NOJUMPTRIE
8aa23a47 2912 && noper_next == tail
786e8c11 2913#endif
8aa23a47
YO
2914 ){
2915 count = 1;
2916 first = cur;
2917 optype = OP( noper );
2918 } else {
2919 count = 0;
2920 first = NULL;
2921 optype = 0;
2922 }
2923 last = NULL;
2924 }
2925 }
2926 DEBUG_OPTIMISE_r({
2927 regprop(RExC_rx, mysv, cur);
2928 PerlIO_printf( Perl_debug_log,
2929 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2930 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2931
2932 });
a0a388a1
YO
2933
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2935 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2936#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2937 if ( ((made == MADE_EXACT_TRIE &&
2938 startbranch == first)
2939 || ( first_non_open == first )) &&
2940 depth==0 ) {
2941 flags |= SCF_TRIE_RESTUDY;
2942 if ( startbranch == first
2943 && scan == tail )
2944 {
2945 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2946 }
2947 }
3dab1dad 2948#endif
8aa23a47
YO
2949 }
2950 }
2951
2952 } /* do trie */
2953
653099ff 2954 }
8aa23a47
YO
2955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2956 scan = NEXTOPER(NEXTOPER(scan));
2957 } else /* single branch is optimized. */
2958 scan = NEXTOPER(scan);
2959 continue;
2960 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2961 scan_frame *newframe = NULL;
2962 I32 paren;
2963 regnode *start;
2964 regnode *end;
2965
2966 if (OP(scan) != SUSPEND) {
2967 /* set the pointer */
2968 if (OP(scan) == GOSUB) {
2969 paren = ARG(scan);
2970 RExC_recurse[ARG2L(scan)] = scan;
2971 start = RExC_open_parens[paren-1];
2972 end = RExC_close_parens[paren-1];
2973 } else {
2974 paren = 0;
f8fc2ecf 2975 start = RExC_rxi->program + 1;
8aa23a47
YO
2976 end = RExC_opend;
2977 }
2978 if (!recursed) {
2979 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2980 SAVEFREEPV(recursed);
2981 }
2982 if (!PAREN_TEST(recursed,paren+1)) {
2983 PAREN_SET(recursed,paren+1);
2984 Newx(newframe,1,scan_frame);
2985 } else {
2986 if (flags & SCF_DO_SUBSTR) {
304ee84b 2987 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2988 data->longest = &(data->longest_float);
2989 }
2990 is_inf = is_inf_internal = 1;
2991 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2992 cl_anything(pRExC_state, data->start_class);
2993 flags &= ~SCF_DO_STCLASS;
2994 }
2995 } else {
2996 Newx(newframe,1,scan_frame);
2997 paren = stopparen;
2998 start = scan+2;
2999 end = regnext(scan);
3000 }
3001 if (newframe) {
3002 assert(start);
3003 assert(end);
3004 SAVEFREEPV(newframe);
3005 newframe->next = regnext(scan);
3006 newframe->last = last;
3007 newframe->stop = stopparen;
3008 newframe->prev = frame;
3009
3010 frame = newframe;
3011 scan = start;
3012 stopparen = paren;
3013 last = end;
3014
3015 continue;
3016 }
3017 }
3018 else if (OP(scan) == EXACT) {
3019 I32 l = STR_LEN(scan);
3020 UV uc;
3021 if (UTF) {
3022 const U8 * const s = (U8*)STRING(scan);
3023 l = utf8_length(s, s + l);
3024 uc = utf8_to_uvchr(s, NULL);
3025 } else {
3026 uc = *((U8*)STRING(scan));
3027 }
3028 min += l;
3029 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3030 /* The code below prefers earlier match for fixed
3031 offset, later match for variable offset. */
3032 if (data->last_end == -1) { /* Update the start info. */
3033 data->last_start_min = data->pos_min;
3034 data->last_start_max = is_inf
3035 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3036 }
8aa23a47
YO
3037 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3038 if (UTF)
3039 SvUTF8_on(data->last_found);
3040 {
3041 SV * const sv = data->last_found;
3042 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3043 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3044 if (mg && mg->mg_len >= 0)
3045 mg->mg_len += utf8_length((U8*)STRING(scan),
3046 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3047 }
8aa23a47
YO
3048 data->last_end = data->pos_min + l;
3049 data->pos_min += l; /* As in the first entry. */
3050 data->flags &= ~SF_BEFORE_EOL;
3051 }
3052 if (flags & SCF_DO_STCLASS_AND) {
3053 /* Check whether it is compatible with what we know already! */
3054 int compat = 1;
3055
3056 if (uc >= 0x100 ||
3057 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3058 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3059 && (!(data->start_class->flags & ANYOF_FOLD)
3060 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3061 )
3062 compat = 0;
3063 ANYOF_CLASS_ZERO(data->start_class);
3064 ANYOF_BITMAP_ZERO(data->start_class);
3065 if (compat)
3066 ANYOF_BITMAP_SET(data->start_class, uc);
3067 data->start_class->flags &= ~ANYOF_EOS;
3068 if (uc < 0x100)
3069 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3070 }
3071 else if (flags & SCF_DO_STCLASS_OR) {
3072 /* false positive possible if the class is case-folded */
3073 if (uc < 0x100)
3074 ANYOF_BITMAP_SET(data->start_class, uc);
3075 else
3076 data->start_class->flags |= ANYOF_UNICODE_ALL;
3077 data->start_class->flags &= ~ANYOF_EOS;
3078 cl_and(data->start_class, and_withp);
3079 }
3080 flags &= ~SCF_DO_STCLASS;
3081 }
3082 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3083 I32 l = STR_LEN(scan);
3084 UV uc = *((U8*)STRING(scan));
3085
3086 /* Search for fixed substrings supports EXACT only. */
3087 if (flags & SCF_DO_SUBSTR) {
3088 assert(data);
304ee84b 3089 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3090 }
3091 if (UTF) {
3092 const U8 * const s = (U8 *)STRING(scan);
3093 l = utf8_length(s, s + l);
3094 uc = utf8_to_uvchr(s, NULL);
3095 }
3096 min += l;
3097 if (flags & SCF_DO_SUBSTR)
3098 data->pos_min += l;
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 /* Check whether it is compatible with what we know already! */
3101 int compat = 1;
3102
3103 if (uc >= 0x100 ||
3104 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3105 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3106 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3107 compat = 0;
3108 ANYOF_CLASS_ZERO(data->start_class);
3109 ANYOF_BITMAP_ZERO(data->start_class);
3110 if (compat) {
3111 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3112 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3113 data->start_class->flags |= ANYOF_FOLD;
3114 if (OP(scan) == EXACTFL)
3115 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3116 }
8aa23a47
YO
3117 }
3118 else if (flags & SCF_DO_STCLASS_OR) {
3119 if (data->start_class->flags & ANYOF_FOLD) {
3120 /* false positive possible if the class is case-folded.
3121 Assume that the locale settings are the same... */
1aa99e6b
IH
3122 if (uc < 0x100)
3123 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3124 data->start_class->flags &= ~ANYOF_EOS;
3125 }
8aa23a47 3126 cl_and(data->start_class, and_withp);
653099ff 3127 }
8aa23a47
YO
3128 flags &= ~SCF_DO_STCLASS;
3129 }
e52fc539 3130 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3131 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3132 I32 f = flags, pos_before = 0;
3133 regnode * const oscan = scan;
3134 struct regnode_charclass_class this_class;
3135 struct regnode_charclass_class *oclass = NULL;
3136 I32 next_is_eval = 0;
3137
3138 switch (PL_regkind[OP(scan)]) {
3139 case WHILEM: /* End of (?:...)* . */
3140 scan = NEXTOPER(scan);
3141 goto finish;
3142 case PLUS:
3143 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3144 next = NEXTOPER(scan);
3145 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3146 mincount = 1;
3147 maxcount = REG_INFTY;
3148 next = regnext(scan);
3149 scan = NEXTOPER(scan);
3150 goto do_curly;
3151 }
3152 }
3153 if (flags & SCF_DO_SUBSTR)
3154 data->pos_min++;
3155 min++;
3156 /* Fall through. */
3157 case STAR:
3158 if (flags & SCF_DO_STCLASS) {
3159 mincount = 0;
3160 maxcount = REG_INFTY;
3161 next = regnext(scan);
3162 scan = NEXTOPER(scan);
3163 goto do_curly;
3164 }
3165 is_inf = is_inf_internal = 1;
3166 scan = regnext(scan);
c277df42 3167 if (flags & SCF_DO_SUBSTR) {
304ee84b 3168 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3169 data->longest = &(data->longest_float);
c277df42 3170 }
8aa23a47
YO
3171 goto optimize_curly_tail;
3172 case CURLY:
3173 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3174 && (scan->flags == stopparen))
3175 {
3176 mincount = 1;
3177 maxcount = 1;
3178 } else {
3179 mincount = ARG1(scan);
3180 maxcount = ARG2(scan);
653099ff 3181 }
8aa23a47
YO
3182 next = regnext(scan);
3183 if (OP(scan) == CURLYX) {
3184 I32 lp = (data ? *(data->last_closep) : 0);
3185 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3186 }
8aa23a47
YO
3187 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3188 next_is_eval = (OP(scan) == EVAL);
3189 do_curly:
3190 if (flags & SCF_DO_SUBSTR) {
304ee84b 3191 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3192 pos_before = data->pos_min;
b45f050a 3193 }
8aa23a47
YO
3194 if (data) {
3195 fl = data->flags;
3196 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3197 if (is_inf)
3198 data->flags |= SF_IS_INF;
3199 }
3200 if (flags & SCF_DO_STCLASS) {
3201 cl_init(pRExC_state, &this_class);
3202 oclass = data->start_class;
3203 data->start_class = &this_class;
3204 f |= SCF_DO_STCLASS_AND;
3205 f &= ~SCF_DO_STCLASS_OR;
3206 }
3207 /* These are the cases when once a subexpression
3208 fails at a particular position, it cannot succeed
3209 even after backtracking at the enclosing scope.
b9a59e08 3210
8aa23a47
YO
3211 XXXX what if minimal match and we are at the
3212 initial run of {n,m}? */
3213 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3214 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3215
8aa23a47
YO
3216 /* This will finish on WHILEM, setting scan, or on NULL: */
3217 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3218 last, data, stopparen, recursed, NULL,
3219 (mincount == 0
3220 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3221
8aa23a47
YO
3222 if (flags & SCF_DO_STCLASS)
3223 data->start_class = oclass;
3224 if (mincount == 0 || minnext == 0) {
3225 if (flags & SCF_DO_STCLASS_OR) {
3226 cl_or(pRExC_state, data->start_class, &this_class);
3227 }
3228 else if (flags & SCF_DO_STCLASS_AND) {
3229 /* Switch to OR mode: cache the old value of
3230 * data->start_class */
3231 INIT_AND_WITHP;
3232 StructCopy(data->start_class, and_withp,
3233 struct regnode_charclass_class);
3234 flags &= ~SCF_DO_STCLASS_AND;
3235 StructCopy(&this_class, data->start_class,
3236 struct regnode_charclass_class);
3237 flags |= SCF_DO_STCLASS_OR;
3238 data->start_class->flags |= ANYOF_EOS;
3239 }
3240 } else { /* Non-zero len */
3241 if (flags & SCF_DO_STCLASS_OR) {
3242 cl_or(pRExC_state, data->start_class, &this_class);
3243 cl_and(data->start_class, and_withp);
3244 }
3245 else if (flags & SCF_DO_STCLASS_AND)
3246 cl_and(data->start_class, &this_class);
3247 flags &= ~SCF_DO_STCLASS;
3248 }
3249 if (!scan) /* It was not CURLYX, but CURLY. */
3250 scan = next;
3251 if ( /* ? quantifier ok, except for (?{ ... }) */
3252 (next_is_eval || !(mincount == 0 && maxcount == 1))
3253 && (minnext == 0) && (deltanext == 0)
3254 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3255 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3256 {
668c081a
NC
3257 ckWARNreg(RExC_parse,
3258 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3259 }
3260
3261 min += minnext * mincount;
3262 is_inf_internal |= ((maxcount == REG_INFTY
3263 && (minnext + deltanext) > 0)
3264 || deltanext == I32_MAX);
3265 is_inf |= is_inf_internal;
3266 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3267
3268 /* Try powerful optimization CURLYX => CURLYN. */
3269 if ( OP(oscan) == CURLYX && data
3270 && data->flags & SF_IN_PAR
3271 && !(data->flags & SF_HAS_EVAL)
3272 && !deltanext && minnext == 1 ) {
3273 /* Try to optimize to CURLYN. */
3274 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3275 regnode * const nxt1 = nxt;
497b47a8 3276#ifdef DEBUGGING
8aa23a47 3277 regnode *nxt2;
497b47a8 3278#endif
c277df42 3279
8aa23a47
YO
3280 /* Skip open. */
3281 nxt = regnext(nxt);
e52fc539 3282 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3283 && !(PL_regkind[OP(nxt)] == EXACT
3284 && STR_LEN(nxt) == 1))
3285 goto nogo;
497b47a8 3286#ifdef DEBUGGING
8aa23a47 3287 nxt2 = nxt;
497b47a8 3288#endif
8aa23a47
YO
3289 nxt = regnext(nxt);
3290 if (OP(nxt) != CLOSE)
3291 goto nogo;
3292 if (RExC_open_parens) {
3293 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3294 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3295 }
3296 /* Now we know that nxt2 is the only contents: */
3297 oscan->flags = (U8)ARG(nxt);
3298 OP(oscan) = CURLYN;
3299 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3300
c277df42 3301#ifdef DEBUGGING
8aa23a47 3302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3303 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3304 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3305 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3306 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3307 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3308#endif
8aa23a47
YO
3309 }
3310 nogo:
3311
3312 /* Try optimization CURLYX => CURLYM. */
3313 if ( OP(oscan) == CURLYX && data
3314 && !(data->flags & SF_HAS_PAR)
3315 && !(data->flags & SF_HAS_EVAL)
3316 && !deltanext /* atom is fixed width */
3317 && minnext != 0 /* CURLYM can't handle zero width */
3318 ) {
3319 /* XXXX How to optimize if data == 0? */
3320 /* Optimize to a simpler form. */
3321 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3322 regnode *nxt2;
3323
3324 OP(oscan) = CURLYM;
3325 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3326 && (OP(nxt2) != WHILEM))
3327 nxt = nxt2;
3328 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3329 /* Need to optimize away parenths. */
b3c0965f 3330 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3331 /* Set the parenth number. */
3332 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3333
8aa23a47
YO
3334 oscan->flags = (U8)ARG(nxt);
3335 if (RExC_open_parens) {
3336 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3337 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3338 }
8aa23a47
YO
3339 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3340 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3341
c277df42 3342#ifdef DEBUGGING
8aa23a47
YO
3343 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3344 OP(nxt + 1) = OPTIMIZED; /* was count. */
3345 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3346 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3347#endif
c277df42 3348#if 0
8aa23a47
YO
3349 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3350 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3351 if (nnxt == nxt) {
3352 if (reg_off_by_arg[OP(nxt1)])
3353 ARG_SET(nxt1, nxt2 - nxt1);
3354 else if (nxt2 - nxt1 < U16_MAX)
3355 NEXT_OFF(nxt1) = nxt2 - nxt1;
3356 else
3357 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3358 }
8aa23a47 3359 nxt1 = nnxt;
c277df42 3360 }
5d1c421c 3361#endif
8aa23a47
YO
3362 /* Optimize again: */
3363 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3364 NULL, stopparen, recursed, NULL, 0,depth+1);
3365 }
3366 else
3367 oscan->flags = 0;
3368 }
3369 else if ((OP(oscan) == CURLYX)
3370 && (flags & SCF_WHILEM_VISITED_POS)
3371 /* See the comment on a similar expression above.
3372 However, this time it not a subexpression
3373 we care about, but the expression itself. */
3374 && (maxcount == REG_INFTY)
3375 && data && ++data->whilem_c < 16) {
3376 /* This stays as CURLYX, we can put the count/of pair. */
3377 /* Find WHILEM (as in regexec.c) */
3378 regnode *nxt = oscan + NEXT_OFF(oscan);
3379
3380 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3381 nxt += ARG(nxt);
3382 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3383 | (RExC_whilem_seen << 4)); /* On WHILEM */
3384 }
3385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3386 pars++;
3387 if (flags & SCF_DO_SUBSTR) {
3388 SV *last_str = NULL;
3389 int counted = mincount != 0;
a0ed51b3 3390
8aa23a47
YO
3391 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3392#if defined(SPARC64_GCC_WORKAROUND)
3393 I32 b = 0;
3394 STRLEN l = 0;
3395 const char *s = NULL;
3396 I32 old = 0;
b515a41d 3397
8aa23a47
YO
3398 if (pos_before >= data->last_start_min)
3399 b = pos_before;
3400 else
3401 b = data->last_start_min;
b515a41d 3402
8aa23a47
YO
3403 l = 0;
3404 s = SvPV_const(data->last_found, l);
3405 old = b - data->last_start_min;
3406
3407#else
3408 I32 b = pos_before >= data->last_start_min
3409 ? pos_before : data->last_start_min;
3410 STRLEN l;
3411 const char * const s = SvPV_const(data->last_found, l);
3412 I32 old = b - data->last_start_min;
3413#endif
3414
3415 if (UTF)
3416 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3417 l -= old;
3418 /* Get the added string: */
740cce10 3419 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3420 if (deltanext == 0 && pos_before == b) {
3421 /* What was added is a constant string */
3422 if (mincount > 1) {
3423 SvGROW(last_str, (mincount * l) + 1);
3424 repeatcpy(SvPVX(last_str) + l,
3425 SvPVX_const(last_str), l, mincount - 1);
3426 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427 /* Add additional parts. */
3428 SvCUR_set(data->last_found,
3429 SvCUR(data->last_found) - l);
3430 sv_catsv(data->last_found, last_str);
3431 {
3432 SV * sv = data->last_found;
3433 MAGIC *mg =
3434 SvUTF8(sv) && SvMAGICAL(sv) ?
3435 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436 if (mg && mg->mg_len >= 0)
bd94e887 3437 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3438 }
8aa23a47 3439 data->last_end += l * (mincount - 1);
b515a41d 3440 }
8aa23a47
YO
3441 } else {
3442 /* start offset must point into the last copy */
3443 data->last_start_min += minnext * (mincount - 1);
3444 data->last_start_max += is_inf ? I32_MAX
3445 : (maxcount - 1) * (minnext + data->pos_delta);
3446 }
c277df42 3447 }
8aa23a47
YO
3448 /* It is counted once already... */
3449 data->pos_min += minnext * (mincount - counted);
3450 data->pos_delta += - counted * deltanext +
3451 (minnext + deltanext) * maxcount - minnext * mincount;
3452 if (mincount != maxcount) {
3453 /* Cannot extend fixed substrings found inside
3454 the group. */
304ee84b 3455 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3456 if (mincount && last_str) {
3457 SV * const sv = data->last_found;
3458 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3460
3461 if (mg)
3462 mg->mg_len = -1;
3463 sv_setsv(sv, last_str);
3464 data->last_end = data->pos_min;
3465 data->last_start_min =
3466 data->pos_min - CHR_SVLEN(last_str);
3467 data->last_start_max = is_inf
3468 ? I32_MAX
3469 : data->pos_min + data->pos_delta
3470 - CHR_SVLEN(last_str);
3471 }
3472 data->longest = &(data->longest_float);
3473 }
3474 SvREFCNT_dec(last_str);
c277df42 3475 }
8aa23a47
YO
3476 if (data && (fl & SF_HAS_EVAL))
3477 data->flags |= SF_HAS_EVAL;
3478 optimize_curly_tail:
3479 if (OP(oscan) != CURLYX) {
3480 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3481 && NEXT_OFF(next))
3482 NEXT_OFF(oscan) += NEXT_OFF(next);
3483 }
3484 continue;
3485 default: /* REF and CLUMP only? */
3486 if (flags & SCF_DO_SUBSTR) {
304ee84b 3487 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3488 data->longest = &(data->longest_float);
3489 }
3490 is_inf = is_inf_internal = 1;
3491 if (flags & SCF_DO_STCLASS_OR)
3492 cl_anything(pRExC_state, data->start_class);
3493 flags &= ~SCF_DO_STCLASS;
3494 break;
c277df42 3495 }
8aa23a47 3496 }
e1d1eefb
YO
3497 else if (OP(scan) == LNBREAK) {
3498 if (flags & SCF_DO_STCLASS) {
3499 int value = 0;
3500 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501 if (flags & SCF_DO_STCLASS_AND) {
3502 for (value = 0; value < 256; value++)
e64b1bd1 3503 if (!is_VERTWS_cp(value))
b9a59e08
KW
3504 ANYOF_BITMAP_CLEAR(data->start_class, value);
3505 }
3506 else {
e1d1eefb 3507 for (value = 0; value < 256; value++)
e64b1bd1 3508 if (is_VERTWS_cp(value))
b9a59e08
KW
3509 ANYOF_BITMAP_SET(data->start_class, value);
3510 }
e1d1eefb
YO
3511 if (flags & SCF_DO_STCLASS_OR)
3512 cl_and(data->start_class, and_withp);
3513 flags &= ~SCF_DO_STCLASS;
3514 }
3515 min += 1;
f9a79580 3516 delta += 1;
e1d1eefb
YO
3517 if (flags & SCF_DO_SUBSTR) {
3518 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3519 data->pos_min += 1;
f9a79580 3520 data->pos_delta += 1;
e1d1eefb
YO
3521 data->longest = &(data->longest_float);
3522 }
e1d1eefb 3523 }
f9a79580
RGS
3524 else if (OP(scan) == FOLDCHAR) {
3525 int d = ARG(scan)==0xDF ? 1 : 2;
3526 flags &= ~SCF_DO_STCLASS;
3527 min += 1;
3528 delta += d;
3529 if (flags & SCF_DO_SUBSTR) {
3530 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3531 data->pos_min += 1;
3532 data->pos_delta += d;
3533 data->longest = &(data->longest_float);
3534 }
3535 }
e52fc539 3536 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3537 int value = 0;
653099ff 3538
8aa23a47 3539 if (flags & SCF_DO_SUBSTR) {
304ee84b 3540 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3541 data->pos_min++;
3542 }
3543 min++;
3544 if (flags & SCF_DO_STCLASS) {
3545 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3546
8aa23a47
YO
3547 /* Some of the logic below assumes that switching
3548 locale on will only add false positives. */
3549 switch (PL_regkind[OP(scan)]) {
3550 case SANY:
3551 default:
3552 do_default:
3553 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3554 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3555 cl_anything(pRExC_state, data->start_class);
3556 break;
3557 case REG_ANY:
3558 if (OP(scan) == SANY)
3559 goto do_default;
3560 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3561 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3562 || (data->start_class->flags & ANYOF_CLASS));
3563 cl_anything(pRExC_state, data->start_class);
653099ff 3564 }
8aa23a47
YO
3565 if (flags & SCF_DO_STCLASS_AND || !value)
3566 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3567 break;
3568 case ANYOF:
3569 if (flags & SCF_DO_STCLASS_AND)
3570 cl_and(data->start_class,
3571 (struct regnode_charclass_class*)scan);
653099ff 3572 else
8aa23a47
YO
3573 cl_or(pRExC_state, data->start_class,
3574 (struct regnode_charclass_class*)scan);
3575 break;
3576 case ALNUM:
3577 if (flags & SCF_DO_STCLASS_AND) {
3578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3580 if (FLAGS(scan) & USE_UNI) {
3581 for (value = 0; value < 256; value++) {
3582 if (!isWORDCHAR_L1(value)) {
3583 ANYOF_BITMAP_CLEAR(data->start_class, value);
3584 }
3585 }
3586 } else {
3587 for (value = 0; value < 256; value++) {
3588 if (!isALNUM(value)) {
3589 ANYOF_BITMAP_CLEAR(data->start_class, value);
3590 }
3591 }
3592 }
8aa23a47 3593 }
653099ff 3594 }
8aa23a47
YO
3595 else {
3596 if (data->start_class->flags & ANYOF_LOCALE)
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3598 else if (FLAGS(scan) & USE_UNI) {
3599 for (value = 0; value < 256; value++) {
3600 if (isWORDCHAR_L1(value)) {
3601 ANYOF_BITMAP_SET(data->start_class, value);
3602 }
3603 }
3604 } else {
3605 for (value = 0; value < 256; value++) {
3606 if (isALNUM(value)) {
3607 ANYOF_BITMAP_SET(data->start_class, value);
3608 }
3609 }
3610 }
8aa23a47
YO
3611 }
3612 break;
3613 case ALNUML:
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE)
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3617 }
3618 else {
3619 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3620 data->start_class->flags |= ANYOF_LOCALE;
3621 }
3622 break;
3623 case NALNUM:
3624 if (flags & SCF_DO_STCLASS_AND) {
3625 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3626 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3627 if (FLAGS(scan) & USE_UNI) {
3628 for (value = 0; value < 256; value++) {
3629 if (isWORDCHAR_L1(value)) {
3630 ANYOF_BITMAP_CLEAR(data->start_class, value);
3631 }
3632 }
3633 } else {
3634 for (value = 0; value < 256; value++) {
3635 if (isALNUM(value)) {
3636 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 }
3638 }
3639 }
653099ff
GS
3640 }
3641 }
8aa23a47
YO
3642 else {
3643 if (data->start_class->flags & ANYOF_LOCALE)
3644 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3645 else {
3646 for (value = 0; value < 256; value++)
3647 if (!isALNUM(value))
b9a59e08 3648 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3649 }
653099ff 3650 }
8aa23a47
YO
3651 break;
3652 case NALNUML:
3653 if (flags & SCF_DO_STCLASS_AND) {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3656 }
8aa23a47
YO
3657 else {
3658 data->start_class->flags |= ANYOF_LOCALE;
3659 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3660 }
3661 break;
3662 case SPACE:
3663 if (flags & SCF_DO_STCLASS_AND) {
3664 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3665 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3666 if (FLAGS(scan) & USE_UNI) {
3667 for (value = 0; value < 256; value++) {
3668 if (!isSPACE_L1(value)) {
3669 ANYOF_BITMAP_CLEAR(data->start_class, value);
3670 }
3671 }
3672 } else {
3673 for (value = 0; value < 256; value++) {
3674 if (!isSPACE(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3676 }
3677 }
3678 }
653099ff
GS
3679 }
3680 }
8aa23a47 3681 else {
a12cf05f 3682 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3683 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3684 }
3685 else if (FLAGS(scan) & USE_UNI) {
3686 for (value = 0; value < 256; value++) {
3687 if (isSPACE_L1(value)) {
3688 ANYOF_BITMAP_SET(data->start_class, value);
3689 }
3690 }
3691 } else {
3692 for (value = 0; value < 256; value++) {
3693 if (isSPACE(value)) {
3694 ANYOF_BITMAP_SET(data->start_class, value);
3695 }
3696 }
8aa23a47 3697 }
653099ff 3698 }
8aa23a47
YO
3699 break;
3700 case SPACEL:
3701 if (flags & SCF_DO_STCLASS_AND) {
3702 if (data->start_class->flags & ANYOF_LOCALE)
3703 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3704 }
3705 else {
3706 data->start_class->flags |= ANYOF_LOCALE;
3707 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3708 }
3709 break;
3710 case NSPACE:
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3714 if (FLAGS(scan) & USE_UNI) {
3715 for (value = 0; value < 256; value++) {
3716 if (isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3718 }
3719 }
3720 } else {
3721 for (value = 0; value < 256; value++) {
3722 if (isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3724 }
3725 }
3726 }
653099ff 3727 }
8aa23a47
YO
3728 }
3729 else {
3730 if (data->start_class->flags & ANYOF_LOCALE)
3731 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3732 else if (FLAGS(scan) & USE_UNI) {
3733 for (value = 0; value < 256; value++) {
3734 if (!isSPACE_L1(value)) {
3735 ANYOF_BITMAP_SET(data->start_class, value);
3736 }
3737 }
3738 }
3739 else {
3740 for (value = 0; value < 256; value++) {
3741 if (!isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3743 }
3744 }
3745 }
653099ff 3746 }
8aa23a47
YO
3747 break;
3748 case NSPACEL:
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (data->start_class->flags & ANYOF_LOCALE) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3752 for (value = 0; value < 256; value++)
3753 if (!isSPACE(value))
3754 ANYOF_BITMAP_CLEAR(data->start_class, value);
3755 }
653099ff 3756 }
8aa23a47
YO
3757 else {
3758 data->start_class->flags |= ANYOF_LOCALE;
3759 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3760 }
3761 break;
3762 case DIGIT:
3763 if (flags & SCF_DO_STCLASS_AND) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3765 for (value = 0; value < 256; value++)
3766 if (!isDIGIT(value))
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3768 }
3769 else {
3770 if (data->start_class->flags & ANYOF_LOCALE)
3771 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3772 else {
3773 for (value = 0; value < 256; value++)
3774 if (isDIGIT(value))
b9a59e08 3775 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3776 }
3777 }
3778 break;
3779 case NDIGIT:
3780 if (flags & SCF_DO_STCLASS_AND) {
3781 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3782 for (value = 0; value < 256; value++)
3783 if (isDIGIT(value))
3784 ANYOF_BITMAP_CLEAR(data->start_class, value);
3785 }
3786 else {
3787 if (data->start_class->flags & ANYOF_LOCALE)
3788 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3789 else {
3790 for (value = 0; value < 256; value++)
3791 if (!isDIGIT(value))
b9a59e08 3792 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3793 }
3794 }
8aa23a47 3795 break;
e1d1eefb
YO
3796 CASE_SYNST_FNC(VERTWS);
3797 CASE_SYNST_FNC(HORIZWS);
3798
8aa23a47
YO
3799 }
3800 if (flags & SCF_DO_STCLASS_OR)
3801 cl_and(data->start_class, and_withp);
3802 flags &= ~SCF_DO_STCLASS;
3803 }
3804 }
3805 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3806 data->flags |= (OP(scan) == MEOL
3807 ? SF_BEFORE_MEOL
3808 : SF_BEFORE_SEOL);
3809 }
3810 else if ( PL_regkind[OP(scan)] == BRANCHJ
3811 /* Lookbehind, or need to calculate parens/evals/stclass: */
3812 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3813 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3814 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3815 || OP(scan) == UNLESSM )
3816 {
3817 /* Negative Lookahead/lookbehind
3818 In this case we can't do fixed string optimisation.
3819 */
1de06328 3820
8aa23a47
YO
3821 I32 deltanext, minnext, fake = 0;
3822 regnode *nscan;
3823 struct regnode_charclass_class intrnl;
3824 int f = 0;
1de06328 3825
8aa23a47
YO
3826 data_fake.flags = 0;
3827 if (data) {
3828 data_fake.whilem_c = data->whilem_c;
3829 data_fake.last_closep = data->last_closep;
c277df42 3830 }
8aa23a47
YO
3831 else
3832 data_fake.last_closep = &fake;
58e23c8d 3833 data_fake.pos_delta = delta;
8aa23a47
YO
3834 if ( flags & SCF_DO_STCLASS && !scan->flags
3835 && OP(scan) == IFMATCH ) { /* Lookahead */
3836 cl_init(pRExC_state, &intrnl);
3837 data_fake.start_class = &intrnl;
3838 f |= SCF_DO_STCLASS_AND;
3839 }
3840 if (flags & SCF_WHILEM_VISITED_POS)
3841 f |= SCF_WHILEM_VISITED_POS;
3842 next = regnext(scan);
3843 nscan = NEXTOPER(NEXTOPER(scan));
3844 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3845 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3846 if (scan->flags) {
3847 if (deltanext) {
58e23c8d 3848 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3849 }
3850 else if (minnext > (I32)U8_MAX) {
58e23c8d 3851 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3852 }
3853 scan->flags = (U8)minnext;
3854 }
3855 if (data) {
3856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3857 pars++;
3858 if (data_fake.flags & SF_HAS_EVAL)
3859 data->flags |= SF_HAS_EVAL;
3860 data->whilem_c = data_fake.whilem_c;
3861 }
3862 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3863 if (flags & SCF_DO_STCLASS_OR) {
3864 /* OR before, AND after: ideally we would recurse with
3865 * data_fake to get the AND applied by study of the
3866 * remainder of the pattern, and then derecurse;
3867 * *** HACK *** for now just treat as "no information".
3868 * See [perl #56690].
3869 */
3870 cl_init(pRExC_state, data->start_class);
3871 } else {
3872 /* AND before and after: combine and continue */
3873 const int was = (data->start_class->flags & ANYOF_EOS);
3874
3875 cl_and(data->start_class, &intrnl);
3876 if (was)
3877 data->start_class->flags |= ANYOF_EOS;
3878 }
8aa23a47 3879 }
cb434fcc 3880 }
8aa23a47
YO
3881#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3882 else {
3883 /* Positive Lookahead/lookbehind
3884 In this case we can do fixed string optimisation,
3885 but we must be careful about it. Note in the case of
3886 lookbehind the positions will be offset by the minimum
3887 length of the pattern, something we won't know about
3888 until after the recurse.
3889 */
3890 I32 deltanext, fake = 0;
3891 regnode *nscan;
3892 struct regnode_charclass_class intrnl;
3893 int f = 0;
3894 /* We use SAVEFREEPV so that when the full compile
3895 is finished perl will clean up the allocated
3896 minlens when its all done. This was we don't
3897 have to worry about freeing them when we know
3898 they wont be used, which would be a pain.
3899 */
3900 I32 *minnextp;
3901 Newx( minnextp, 1, I32 );
3902 SAVEFREEPV(minnextp);
3903
3904 if (data) {
3905 StructCopy(data, &data_fake, scan_data_t);
3906 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3907 f |= SCF_DO_SUBSTR;
3908 if (scan->flags)
304ee84b 3909 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3910 data_fake.last_found=newSVsv(data->last_found);
3911 }
3912 }
3913 else
3914 data_fake.last_closep = &fake;
3915 data_fake.flags = 0;
58e23c8d 3916 data_fake.pos_delta = delta;
8aa23a47
YO
3917 if (is_inf)
3918 data_fake.flags |= SF_IS_INF;
3919 if ( flags & SCF_DO_STCLASS && !scan->flags
3920 && OP(scan) == IFMATCH ) { /* Lookahead */
3921 cl_init(pRExC_state, &intrnl);
3922 data_fake.start_class = &intrnl;
3923 f |= SCF_DO_STCLASS_AND;
3924 }
3925 if (flags & SCF_WHILEM_VISITED_POS)
3926 f |= SCF_WHILEM_VISITED_POS;
3927 next = regnext(scan);
3928 nscan = NEXTOPER(NEXTOPER(scan));
3929
3930 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3931 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3932 if (scan->flags) {
3933 if (deltanext) {
58e23c8d 3934 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3935 }
3936 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3937 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3938 }
3939 scan->flags = (U8)*minnextp;
3940 }
3941
3942 *minnextp += min;
3943
3944 if (f & SCF_DO_STCLASS_AND) {
3945 const int was = (data->start_class->flags & ANYOF_EOS);
3946
3947 cl_and(data->start_class, &intrnl);
3948 if (was)
3949 data->start_class->flags |= ANYOF_EOS;
3950 }
3951 if (data) {
3952 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3953 pars++;
3954 if (data_fake.flags & SF_HAS_EVAL)
3955 data->flags |= SF_HAS_EVAL;
3956 data->whilem_c = data_fake.whilem_c;
3957 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3958 if (RExC_rx->minlen<*minnextp)
3959 RExC_rx->minlen=*minnextp;
304ee84b 3960 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3961 SvREFCNT_dec(data_fake.last_found);
3962
3963 if ( data_fake.minlen_fixed != minlenp )
3964 {
3965 data->offset_fixed= data_fake.offset_fixed;
3966 data->minlen_fixed= data_fake.minlen_fixed;
3967 data->lookbehind_fixed+= scan->flags;
3968 }
3969 if ( data_fake.minlen_float != minlenp )
3970 {
3971 data->minlen_float= data_fake.minlen_float;
3972 data->offset_float_min=data_fake.offset_float_min;
3973 data->offset_float_max=data_fake.offset_float_max;
3974 data->lookbehind_float+= scan->flags;
3975 }
3976 }
3977 }
3978
3979
40d049e4 3980 }
8aa23a47
YO
3981#endif
3982 }
3983 else if (OP(scan) == OPEN) {
3984 if (stopparen != (I32)ARG(scan))
3985 pars++;
3986 }
3987 else if (OP(scan) == CLOSE) {
3988 if (stopparen == (I32)ARG(scan)) {
3989 break;
3990 }
3991 if ((I32)ARG(scan) == is_par) {
3992 next = regnext(scan);
b515a41d 3993
8aa23a47
YO
3994 if ( next && (OP(next) != WHILEM) && next < last)
3995 is_par = 0; /* Disable optimization */
40d049e4 3996 }
8aa23a47
YO
3997 if (data)
3998 *(data->last_closep) = ARG(scan);
3999 }
4000 else if (OP(scan) == EVAL) {
c277df42
IZ
4001 if (data)
4002 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4003 }
4004 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4005 if (flags & SCF_DO_SUBSTR) {
304ee84b 4006 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4007 flags &= ~SCF_DO_SUBSTR;
40d049e4 4008 }
8aa23a47
YO
4009 if (data && OP(scan)==ACCEPT) {
4010 data->flags |= SCF_SEEN_ACCEPT;
4011 if (stopmin > min)
4012 stopmin = min;
e2e6a0f1 4013 }
8aa23a47
YO
4014 }
4015 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4016 {
0f5d15d6 4017 if (flags & SCF_DO_SUBSTR) {
304ee84b 4018 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4019 data->longest = &(data->longest_float);
4020 }
4021 is_inf = is_inf_internal = 1;
653099ff 4022 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4023 cl_anything(pRExC_state, data->start_class);
96776eda 4024 flags &= ~SCF_DO_STCLASS;
8aa23a47 4025 }
58e23c8d 4026 else if (OP(scan) == GPOS) {
bbe252da 4027 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4028 !(delta || is_inf || (data && data->pos_delta)))
4029 {
bbe252da
YO
4030 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4031 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4032 if (RExC_rx->gofs < (U32)min)
4033 RExC_rx->gofs = min;
4034 } else {
bbe252da 4035 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4036 RExC_rx->gofs = 0;
4037 }
4038 }
786e8c11 4039#ifdef TRIE_STUDY_OPT
40d049e4 4040#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4041 else if (PL_regkind[OP(scan)] == TRIE) {
4042 /* NOTE - There is similar code to this block above for handling
4043 BRANCH nodes on the initial study. If you change stuff here
4044 check there too. */
4045 regnode *trie_node= scan;
4046 regnode *tail= regnext(scan);
f8fc2ecf 4047 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4048 I32 max1 = 0, min1 = I32_MAX;
4049 struct regnode_charclass_class accum;
4050
4051 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4052 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4053 if (flags & SCF_DO_STCLASS)
4054 cl_init_zero(pRExC_state, &accum);
4055
4056 if (!trie->jump) {
4057 min1= trie->minlen;
4058 max1= trie->maxlen;
4059 } else {
4060 const regnode *nextbranch= NULL;
4061 U32 word;
4062
4063 for ( word=1 ; word <= trie->wordcount ; word++)
4064 {
4065 I32 deltanext=0, minnext=0, f = 0, fake;
4066 struct regnode_charclass_class this_class;
4067
4068 data_fake.flags = 0;
4069 if (data) {
4070 data_fake.whilem_c = data->whilem_c;
4071 data_fake.last_closep = data->last_closep;
4072 }
4073 else
4074 data_fake.last_closep = &fake;
58e23c8d 4075 data_fake.pos_delta = delta;
8aa23a47
YO
4076 if (flags & SCF_DO_STCLASS) {
4077 cl_init(pRExC_state, &this_class);
4078 data_fake.start_class = &this_class;
4079 f = SCF_DO_STCLASS_AND;
4080 }
4081 if (flags & SCF_WHILEM_VISITED_POS)
4082 f |= SCF_WHILEM_VISITED_POS;
4083
4084 if (trie->jump[word]) {
4085 if (!nextbranch)
4086 nextbranch = trie_node + trie->jump[0];
4087 scan= trie_node + trie->jump[word];
4088 /* We go from the jump point to the branch that follows
4089 it. Note this means we need the vestigal unused branches
4090 even though they arent otherwise used.
4091 */
4092 minnext = study_chunk(pRExC_state, &scan, minlenp,
4093 &deltanext, (regnode *)nextbranch, &data_fake,
4094 stopparen, recursed, NULL, f,depth+1);
4095 }
4096 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4097 nextbranch= regnext((regnode*)nextbranch);
4098
4099 if (min1 > (I32)(minnext + trie->minlen))
4100 min1 = minnext + trie->minlen;
4101 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4102 max1 = minnext + deltanext + trie->maxlen;
4103 if (deltanext == I32_MAX)
4104 is_inf = is_inf_internal = 1;
4105
4106 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4107 pars++;
4108 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4109 if ( stopmin > min + min1)
4110 stopmin = min + min1;
4111 flags &= ~SCF_DO_SUBSTR;
4112 if (data)
4113 data->flags |= SCF_SEEN_ACCEPT;
4114 }
4115 if (data) {
4116 if (data_fake.flags & SF_HAS_EVAL)
4117 data->flags |= SF_HAS_EVAL;
4118 data->whilem_c = data_fake.whilem_c;
4119 }
4120 if (flags & SCF_DO_STCLASS)
4121 cl_or(pRExC_state, &accum, &this_class);
4122 }
4123 }
4124 if (flags & SCF_DO_SUBSTR) {
4125 data->pos_min += min1;
4126 data->pos_delta += max1 - min1;
4127 if (max1 != min1 || is_inf)
4128 data->longest = &(data->longest_float);
4129 }
4130 min += min1;
4131 delta += max1 - min1;
4132 if (flags & SCF_DO_STCLASS_OR) {
4133 cl_or(pRExC_state, data->start_class, &accum);
4134 if (min1) {
4135 cl_and(data->start_class, and_withp);
4136 flags &= ~SCF_DO_STCLASS;
4137 }
4138 }
4139 else if (flags & SCF_DO_STCLASS_AND) {
4140 if (min1) {
4141 cl_and(data->start_class, &accum);
4142 flags &= ~SCF_DO_STCLASS;
4143 }
4144 else {
4145 /* Switch to OR mode: cache the old value of
4146 * data->start_class */
4147 INIT_AND_WITHP;
4148 StructCopy(data->start_class, and_withp,
4149 struct regnode_charclass_class);
4150 flags &= ~SCF_DO_STCLASS_AND;
4151 StructCopy(&accum, data->start_class,
4152 struct regnode_charclass_class);
4153 flags |= SCF_DO_STCLASS_OR;
4154 data->start_class->flags |= ANYOF_EOS;
4155 }
4156 }
4157 scan= tail;
4158 continue;
4159 }
786e8c11 4160#else
8aa23a47 4161 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4162 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4163 U8*bang=NULL;
4164
4165 min += trie->minlen;
4166 delta += (trie->maxlen - trie->minlen);
4167 flags &= ~SCF_DO_STCLASS; /* xxx */
4168 if (flags & SCF_DO_SUBSTR) {
304ee84b 4169 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4170 data->pos_min += trie->minlen;
4171 data->pos_delta += (trie->maxlen - trie->minlen);
4172 if (trie->maxlen != trie->minlen)
4173 data->longest = &(data->longest_float);
4174 }
4175 if (trie->jump) /* no more substrings -- for now /grr*/
4176 flags &= ~SCF_DO_SUBSTR;
b515a41d 4177 }
8aa23a47
YO
4178#endif /* old or new */
4179#endif /* TRIE_STUDY_OPT */
e1d1eefb 4180
8aa23a47
YO
4181 /* Else: zero-length, ignore. */
4182 scan = regnext(scan);
4183 }
4184 if (frame) {
4185 last = frame->last;
4186 scan = frame->next;
4187 stopparen = frame->stop;
4188 frame = frame->prev;
4189 goto fake_study_recurse;
c277df42
IZ
4190 }
4191
4192 finish:
8aa23a47 4193 assert(!frame);
304ee84b 4194 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4195
c277df42 4196 *scanp = scan;
aca2d497 4197 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4198 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4199 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4200 if (is_par > (I32)U8_MAX)
c277df42
IZ
4201 is_par = 0;
4202 if (is_par && pars==1 && data) {
4203 data->flags |= SF_IN_PAR;
4204 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4205 }
4206 else if (pars && data) {
c277df42
IZ
4207 data->flags |= SF_HAS_PAR;
4208 data->flags &= ~SF_IN_PAR;
4209 }
653099ff 4210 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4211 cl_and(data->start_class, and_withp);
786e8c11
YO
4212 if (flags & SCF_TRIE_RESTUDY)
4213 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4214
304ee84b 4215 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4216
e2e6a0f1 4217 return min < stopmin ? min : stopmin;
c277df42
IZ
4218}
4219
2eccd3b2
NC
4220STATIC U32
4221S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4222{
4a4e7719
NC
4223 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4224
7918f24d
NC
4225 PERL_ARGS_ASSERT_ADD_DATA;
4226
4a4e7719
NC
4227 Renewc(RExC_rxi->data,
4228 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4229 char, struct reg_data);
4230 if(count)
f8fc2ecf 4231 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4232 else
f8fc2ecf 4233 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4234 RExC_rxi->data->count = count + n;
4235 Copy(s, RExC_rxi->data->what + count, n, U8);
4236 return count;
c277df42
IZ
4237}
4238
f8149455 4239/*XXX: todo make this not included in a non debugging perl */
76234dfb 4240#ifndef PERL_IN_XSUB_RE
d88dccdf 4241void
864dbfa3 4242Perl_reginitcolors(pTHX)
d88dccdf 4243{
97aff369 4244 dVAR;
1df70142 4245 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4246 if (s) {
1df70142
AL
4247 char *t = savepv(s);
4248 int i = 0;
4249 PL_colors[0] = t;
d88dccdf 4250 while (++i < 6) {
1df70142
AL
4251 t = strchr(t, '\t');
4252 if (t) {
4253 *t = '\0';
4254 PL_colors[i] = ++t;
d88dccdf
IZ
4255 }
4256 else
1df70142 4257 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4258 }
4259 } else {
1df70142 4260 int i = 0;
b81d288d 4261 while (i < 6)
06b5626a 4262 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4263 }
4264 PL_colorset = 1;
4265}
76234dfb 4266#endif
8615cb43 4267
07be1b83 4268
786e8c11
YO
4269#ifdef TRIE_STUDY_OPT
4270#define CHECK_RESTUDY_GOTO \
4271 if ( \
4272 (data.flags & SCF_TRIE_RESTUDY) \
4273 && ! restudied++ \
4274 ) goto reStudy
4275#else
4276#define CHECK_RESTUDY_GOTO
4277#endif
f9f4320a 4278
a687059c 4279/*
e50aee73 4280 - pregcomp - compile a regular expression into internal code
a687059c
LW
4281 *
4282 * We can't allocate space until we know how big the compiled form will be,
4283 * but we can't compile it (and thus know how big it is) until we've got a
4284 * place to put the code. So we cheat: we compile it twice, once with code
4285 * generation turned off and size counting turned on, and once "for real".
4286 * This also means that we don't allocate space until we are sure that the
4287 * thing really will compile successfully, and we never have to move the
4288 * code and thus invalidate pointers into it. (Note that it has to be in
4289 * one piece because free() must be able to free it all.) [NB: not true in perl]
4290 *
4291 * Beware that the optimization-preparation code in here knows about some
4292 * of the structure of the compiled regexp. [I'll say.]
4293 */
b9b4dddf
YO
4294
4295
4296
f9f4320a 4297#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4298#define RE_ENGINE_PTR &PL_core_reg_engine
4299#else
f9f4320a
YO
4300extern const struct regexp_engine my_reg_engine;
4301#define RE_ENGINE_PTR &my_reg_engine
4302#endif
6d5c990f
RGS
4303
4304#ifndef PERL_IN_XSUB_RE
3ab4a224 4305REGEXP *
1593ad57 4306Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4307{
97aff369 4308 dVAR;
6d5c990f 4309 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4310
4311 PERL_ARGS_ASSERT_PREGCOMP;
4312
f9f4320a
YO
4313 /* Dispatch a request to compile a regexp to correct
4314 regexp engine. */
f9f4320a
YO
4315 if (table) {
4316 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4317 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4318 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4319 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4320 DEBUG_COMPILE_r({
8d8756e7 4321 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4322 SvIV(*ptr));
4323 });
3ab4a224 4324 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4325 }
b9b4dddf 4326 }
3ab4a224 4327 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4328}
6d5c990f 4329#endif
2a5d9b1d 4330
3ab4a224 4331REGEXP *
1593ad57 4332Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4333{
4334 dVAR;
288b8c02
NC
4335 REGEXP *rx;
4336 struct regexp *r;
f8fc2ecf 4337 register regexp_internal *ri;
3ab4a224 4338 STRLEN plen;
5d51ce98
KW
4339 char *exp;
4340 char* xend;
c277df42 4341 regnode *scan;
a0d0e21e 4342 I32 flags;
a0d0e21e
LW
4343 I32 minlen = 0;
4344 I32 sawplus = 0;
4345 I32 sawopen = 0;
bbd61b5f
KW
4346 U8 jump_ret = 0;
4347 dJMPENV;
2c2d71f5 4348 scan_data_t data;
830247a4 4349 RExC_state_t RExC_state;
be8e71aa 4350 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4351#ifdef TRIE_STUDY_OPT
5d51ce98 4352 int restudied;
07be1b83
YO
4353 RExC_state_t copyRExC_state;
4354#endif
2a5d9b1d 4355 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4356
4357 PERL_ARGS_ASSERT_RE_COMPILE;
4358
6d5c990f 4359 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4360
b9ad30b4 4361 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4362
02daf0ab 4363
bbd61b5f
KW
4364 /* Longjmp back to here if have to switch in midstream to utf8 */
4365 if (! RExC_orig_utf8) {
4366 JMPENV_PUSH(jump_ret);
4367 }
4368
5d51ce98
KW
4369 if (jump_ret == 0) { /* First time through */
4370 exp = SvPV(pattern, plen);
4371 xend = exp + plen;
4372
4373 DEBUG_COMPILE_r({
4374 SV *dsv= sv_newmortal();
4375 RE_PV_QUOTED_DECL(s, RExC_utf8,
4376 dsv, exp, plen, 60);
4377 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4378 PL_colors[4],PL_colors[5],s);
4379 });
4380 }
4381 else { /* longjumped back */
bbd61b5f
KW
4382 STRLEN len = plen;
4383
5d51ce98
KW
4384 /* If the cause for the longjmp was other than changing to utf8, pop
4385 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4386 if (jump_ret != UTF8_LONGJMP) {
4387 JMPENV_POP;
4388 JMPENV_JUMP(jump_ret);
4389 }
4390
595598ee
KW
4391 GET_RE_DEBUG_FLAGS;
4392
bbd61b5f
KW
4393 /* It's possible to write a regexp in ascii that represents Unicode
4394 codepoints outside of the byte range, such as via \x{100}. If we
4395 detect such a sequence we have to convert the entire pattern to utf8
4396 and then recompile, as our sizing calculation will have been based
4397 on 1 byte == 1 character, but we will need to use utf8 to encode
4398 at least some part of the pattern, and therefore must convert the whole
4399 thing.
4400 -- dmq */
4401 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4402 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4403 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4404 xend = exp + len;
4405 RExC_orig_utf8 = RExC_utf8 = 1;
4406 SAVEFREEPV(exp);
4407 }
4408
5d51ce98
KW
4409#ifdef TRIE_STUDY_OPT
4410 restudied = 0;
4411#endif
4412
02daf0ab 4413 RExC_precomp = exp;
c737faaf 4414 RExC_flags = pm_flags;
830247a4 4415 RExC_sawback = 0;
bbce6d69 4416
830247a4
IZ
4417 RExC_seen = 0;
4418 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4419 RExC_seen_evals = 0;
4420 RExC_extralen = 0;
c277df42 4421
bbce6d69 4422 /* First pass: determine size, legality. */
830247a4 4423 RExC_parse = exp;
fac92740 4424 RExC_start = exp;
830247a4
IZ
4425 RExC_end = xend;
4426 RExC_naughty = 0;
4427 RExC_npar = 1;
e2e6a0f1 4428 RExC_nestroot = 0;
830247a4
IZ
4429 RExC_size = 0L;
4430 RExC_emit = &PL_regdummy;
4431 RExC_whilem_seen = 0;
40d049e4
YO
4432 RExC_open_parens = NULL;
4433 RExC_close_parens = NULL;
4434 RExC_opend = NULL;
81714fb9 4435 RExC_paren_names = NULL;
1f1031fe
YO
4436#ifdef DEBUGGING
4437 RExC_paren_name_list = NULL;
4438#endif
40d049e4
YO
4439 RExC_recurse = NULL;
4440 RExC_recurse_count = 0;
81714fb9 4441
85ddcde9
JH
4442#if 0 /* REGC() is (currently) a NOP at the first pass.
4443 * Clever compilers notice this and complain. --jhi */
830247a4 4444 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4445#endif
3dab1dad
YO
4446 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4447 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4448 RExC_precomp = NULL;
a0d0e21e
LW
4449 return(NULL);
4450 }
bbd61b5f
KW
4451
4452 /* Here, finished first pass. Get rid of our setjmp, which we added for
4453 * efficiency only if the passed-in string wasn't in utf8, as shown by
4454 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4455 * 1 here even though the original string wasn't utf8, but in this case
4456 * there will have been a long jump */
4457 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4458 JMPENV_POP;
02daf0ab 4459 }
07be1b83 4460 DEBUG_PARSE_r({
81714fb9
YO
4461 PerlIO_printf(Perl_debug_log,
4462 "Required size %"IVdf" nodes\n"
4463 "Starting second pass (creation)\n",
4464 (IV)RExC_size);
07be1b83
YO
4465 RExC_lastnum=0;
4466 RExC_lastparse=NULL;
4467 });
c277df42
IZ
4468 /* Small enough for pointer-storage convention?
4469 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4470 if (RExC_size >= 0x10000L && RExC_extralen)
4471 RExC_size += RExC_extralen;
c277df42 4472 else
830247a4
IZ
4473 RExC_extralen = 0;
4474 if (RExC_whilem_seen > 15)
4475 RExC_whilem_seen = 15;
a0d0e21e 4476
f9f4320a
YO
4477 /* Allocate space and zero-initialize. Note, the two step process
4478 of zeroing when in debug mode, thus anything assigned has to
4479 happen after that */
d2f13c59 4480 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4481 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4482 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4483 char, regexp_internal);
4484 if ( r == NULL || ri == NULL )
b45f050a 4485 FAIL("Regexp out of space");
0f79a09d
GS
4486#ifdef DEBUGGING
4487 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4488 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4489#else
f8fc2ecf
YO
4490 /* bulk initialize base fields with 0. */
4491 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4492#endif
58e23c8d
YO
4493
4494 /* non-zero initialization begins here */
f8fc2ecf 4495 RXi_SET( r, ri );
f9f4320a 4496 r->engine= RE_ENGINE_PTR;
c737faaf 4497 r->extflags = pm_flags;
bcdf7404 4498 {
f7819f85 4499 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4500 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4501
4502 /* The caret is output if there are any defaults: if not all the STD
4503 * flags are set, or if no character set specifier is needed */
4504 bool has_default =
4505 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4506 || ! has_charset);
bcdf7404 4507 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4508 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4509 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4510 const char *fptr = STD_PAT_MODS; /*"msix"*/
4511 char *p;
fb85c044 4512 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4513 * on. If more precision is desired, we could do a population count of
4514 * the flags set. This could be done with a small lookup table, or by
4515 * shifting, masking and adding, or even, when available, assembly
4516 * language for a machine-language population count.
4517 * We never output a minus, as all those are defaults, so are
4518 * covered by the caret */
fb85c044 4519 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4520 + has_default /* If needs a caret */
4521 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4522 + (sizeof(STD_PAT_MODS) - 1)
4523 + (sizeof("(?:)") - 1);
4524
c5ea2ffa 4525 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4526 SvPOK_on(rx);
8f6ae13c 4527 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4528 *p++='('; *p++='?';
9de15fec
KW
4529
4530 /* If a default, cover it using the caret */
c5ea2ffa 4531 if (has_default) {
85508812 4532 *p++= DEFAULT_PAT_MOD;
fb85c044 4533 }
c5ea2ffa
KW
4534 if (has_charset) {
4535 if (r->extflags & RXf_PMf_LOCALE) {
4536 *p++ = LOCALE_PAT_MOD;
4537 } else {
4538 *p++ = UNICODE_PAT_MOD;
4539 }
9de15fec 4540 }
f7819f85
A
4541 if (has_p)
4542 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4543 {
bcdf7404 4544 char ch;
bcdf7404
YO
4545 while((ch = *fptr++)) {
4546 if(reganch & 1)
4547 *p++ = ch;
bcdf7404
YO
4548 reganch >>= 1;
4549 }
bcdf7404
YO
4550 }
4551
28d8d7f4 4552 *p++ = ':';
bb661a58 4553 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4554 assert ((RX_WRAPPED(rx) - p) < 16);
4555 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4556 p += plen;
bcdf7404 4557 if (has_runon)
28d8d7f4
YO
4558 *p++ = '\n';
4559 *p++ = ')';
4560 *p = 0;
fb85c044 4561 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4562 }
4563
bbe252da 4564 r->intflags = 0;
830247a4 4565 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4566
6bda09f9 4567 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4568 Newxz(RExC_open_parens, RExC_npar,regnode *);
4569 SAVEFREEPV(RExC_open_parens);
4570 Newxz(RExC_close_parens,RExC_npar,regnode *);
4571 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4572 }
4573
4574 /* Useful during FAIL. */
7122b237
YO
4575#ifdef RE_TRACK_PATTERN_OFFSETS
4576 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4577 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4578 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4579 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4580 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4581#endif
4582 SetProgLen(ri,RExC_size);
288b8c02 4583 RExC_rx_sv = rx;
830247a4 4584 RExC_rx = r;
f8fc2ecf 4585 RExC_rxi = ri;
bbce6d69 4586
4587 /* Second pass: emit code. */
c737faaf 4588 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4589 RExC_parse = exp;
4590 RExC_end = xend;
4591 RExC_naughty = 0;
4592 RExC_npar = 1;
f8fc2ecf
YO
4593 RExC_emit_start = ri->program;
4594 RExC_emit = ri->program;
3b57cd43
YO
4595 RExC_emit_bound = ri->program + RExC_size + 1;
4596
2cd61cdb 4597 /* Store the count of eval-groups for security checks: */
f8149455 4598 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4599 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4600 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4601 ReREFCNT_dec(rx);
a0d0e21e 4602 return(NULL);
80757612 4603 }
07be1b83
YO
4604 /* XXXX To minimize changes to RE engine we always allocate
4605 3-units-long substrs field. */
4606 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4607 if (RExC_recurse_count) {
4608 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4609 SAVEFREEPV(RExC_recurse);
4610 }
a0d0e21e 4611
07be1b83 4612reStudy:
1de06328 4613 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4614 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4615
07be1b83 4616#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4617 if (!restudied) {
4618 StructCopy(&zero_scan_data, &data, scan_data_t);
4619 copyRExC_state = RExC_state;
4620 } else {
5d458dd8 4621 U32 seen=RExC_seen;
07be1b83 4622 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4623
4624 RExC_state = copyRExC_state;
4625 if (seen & REG_TOP_LEVEL_BRANCHES)
4626 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4627 else
4628 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4629 if (data.last_found) {
07be1b83 4630 SvREFCNT_dec(data.longest_fixed);
07be1b83 4631 SvREFCNT_dec(data.longest_float);
07be1b83 4632 SvREFCNT_dec(data.last_found);
1de06328 4633 }
40d049e4 4634 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4635 }
40d049e4
YO
4636#else
4637 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4638#endif
fc8cd66c 4639
a0d0e21e 4640 /* Dig out information for optimizations. */
f7819f85 4641 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4642 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4643
a0ed51b3 4644 if (UTF)
8f6ae13c 4645 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4646 ri->regstclass = NULL;
830247a4 4647 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4648 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4649 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4650
1de06328
YO
4651 /* testing for BRANCH here tells us whether there is "must appear"
4652 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4653 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4654 I32 fake;
c5254dd6 4655 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4656 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4657 int stclass_flag;
07be1b83 4658 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4659 regnode *first= scan;
4660 regnode *first_next= regnext(first);
4661
639081d6
YO
4662 /*
4663 * Skip introductions and multiplicators >= 1
4664 * so that we can extract the 'meat' of the pattern that must
4665 * match in the large if() sequence following.
4666 * NOTE that EXACT is NOT covered here, as it is normally
4667 * picked up by the optimiser separately.
4668 *
4669 * This is unfortunate as the optimiser isnt handling lookahead
4670 * properly currently.
4671 *
4672 */
a0d0e21e 4673 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4674 /* An OR of *one* alternative - should not happen now. */
5339e136 4675 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4676 /* for now we can't handle lookbehind IFMATCH*/
4677 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4678 (OP(first) == PLUS) ||
4679 (OP(first) == MINMOD) ||
653099ff 4680 /* An {n,m} with n>0 */
5339e136
YO
4681 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4682 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4683 {
639081d6
YO
4684 /*
4685 * the only op that could be a regnode is PLUS, all the rest
4686 * will be regnode_1 or regnode_2.
4687 *
4688 */
a0d0e21e
LW
4689 if (OP(first) == PLUS)
4690 sawplus = 1;
4691 else
3dab1dad 4692 first += regarglen[OP(first)];
639081d6
YO
4693
4694 first = NEXTOPER(first);
5339e136 4695 first_next= regnext(first);
a687059c
LW
4696 }
4697
a0d0e21e
LW
4698 /* Starting-point info. */
4699 again:
786e8c11 4700 DEBUG_PEEP("first:",first,0);
07be1b83 4701 /* Ignore EXACT as we deal with it later. */
3dab1dad 4702 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4703 if (OP(first) == EXACT)
6f207bd3 4704 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4705 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4706 ri->regstclass = first;
b3c9acc1 4707 }
07be1b83 4708#ifdef TRIE_STCLASS
786e8c11 4709 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4710 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4711 {
786e8c11 4712 regnode *trie_op;
07be1b83 4713 /* this can happen only on restudy */
786e8c11 4714 if ( OP(first) == TRIE ) {
c944940b 4715 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4716 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4717 StructCopy(first,trieop,struct regnode_1);
4718 trie_op=(regnode *)trieop;
4719 } else {
c944940b 4720 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4721 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4722 StructCopy(first,trieop,struct regnode_charclass);
4723 trie_op=(regnode *)trieop;
4724 }
1de06328 4725 OP(trie_op)+=2;
786e8c11 4726 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4727 ri->regstclass = trie_op;
07be1b83
YO
4728 }
4729#endif
e52fc539 4730 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4731 ri->regstclass = first;
3dab1dad
YO
4732 else if (PL_regkind[OP(first)] == BOUND ||
4733 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4734 ri->regstclass = first;
3dab1dad 4735 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4736 r->extflags |= (OP(first) == MBOL
4737 ? RXf_ANCH_MBOL
cad2e5aa 4738 : (OP(first) == SBOL
bbe252da
YO
4739 ? RXf_ANCH_SBOL
4740 : RXf_ANCH_BOL));
a0d0e21e 4741 first = NEXTOPER(first);
774d564b 4742 goto again;
4743 }
4744 else if (OP(first) == GPOS) {
bbe252da 4745 r->extflags |= RXf_ANCH_GPOS;
774d564b 4746 first = NEXTOPER(first);
4747 goto again;
a0d0e21e 4748 }
cf2a2b69
YO
4749 else if ((!sawopen || !RExC_sawback) &&
4750 (OP(first) == STAR &&
3dab1dad 4751 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4752 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4753 {
4754 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4755 const int type =
4756 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4757 ? RXf_ANCH_MBOL
4758 : RXf_ANCH_SBOL;
4759 r->extflags |= type;
4760 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4761 first = NEXTOPER(first);
774d564b 4762 goto again;
a0d0e21e 4763 }
b81d288d 4764 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4765 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4766 /* x+ must match at the 1st pos of run of x's */
bbe252da 4767 r->intflags |= PREGf_SKIP;
a0d0e21e 4768
c277df42 4769 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4770#ifdef TRIE_STUDY_OPT
81714fb9 4771 DEBUG_PARSE_r(
be8e71aa
YO
4772 if (!restudied)
4773 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4774 (IV)(first - scan + 1))
4775 );
4776#else
81714fb9 4777 DEBUG_PARSE_r(
be8e71aa
YO
4778 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4779 (IV)(first - scan + 1))
4780 );
4781#endif
4782
4783
a0d0e21e
LW
4784 /*
4785 * If there's something expensive in the r.e., find the
4786 * longest literal string that must appear and make it the
4787 * regmust. Resolve ties in favor of later strings, since
4788 * the regstart check works with the beginning of the r.e.
4789 * and avoiding duplication strengthens checking. Not a
4790 * strong reason, but sufficient in the absence of others.
4791 * [Now we resolve ties in favor of the earlier string if
c277df42 4792 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4793 * earlier string may buy us something the later one won't.]
4794 */
de8c5301 4795
396482e1
GA
4796 data.longest_fixed = newSVpvs("");
4797 data.longest_float = newSVpvs("");
4798 data.last_found = newSVpvs("");
c277df42
IZ
4799 data.longest = &(data.longest_fixed);
4800 first = scan;
f8fc2ecf 4801 if (!ri->regstclass) {
830247a4 4802 cl_init(pRExC_state, &ch_class);
653099ff
GS
4803 data.start_class = &ch_class;
4804 stclass_flag = SCF_DO_STCLASS_AND;
4805 } else /* XXXX Check for BOUND? */
4806 stclass_flag = 0;
cb434fcc 4807 data.last_closep = &last_close;
de8c5301 4808
1de06328 4809 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4810 &data, -1, NULL, NULL,
4811 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4812
07be1b83 4813
786e8c11
YO
4814 CHECK_RESTUDY_GOTO;
4815
4816
830247a4 4817 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4818 && data.last_start_min == 0 && data.last_end > 0
830247a4 4819 && !RExC_seen_zerolen
2bf803e2 4820 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4821 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4822 r->extflags |= RXf_CHECK_ALL;
304ee84b 4823 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4824 SvREFCNT_dec(data.last_found);
4825
1de06328
YO
4826 /* Note that code very similar to this but for anchored string
4827 follows immediately below, changes may need to be made to both.
4828 Be careful.
4829 */
a0ed51b3 4830 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4831 if (longest_float_length
c277df42
IZ
4832 || (data.flags & SF_FL_BEFORE_EOL
4833 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4834 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4835 {
1182767e 4836 I32 t,ml;
cf93c79d 4837
1de06328 4838 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4839 && data.offset_fixed == data.offset_float_min
4840 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4841 goto remove_float; /* As in (a)+. */
4842
1de06328
YO
4843 /* copy the information about the longest float from the reg_scan_data
4844 over to the program. */
33b8afdf
JH
4845 if (SvUTF8(data.longest_float)) {
4846 r->float_utf8 = data.longest_float;
c445ea15 4847 r->float_substr = NULL;
33b8afdf
JH
4848 } else {
4849 r->float_substr = data.longest_float;
c445ea15 4850 r->float_utf8 = NULL;
33b8afdf 4851 }
1de06328
YO
4852 /* float_end_shift is how many chars that must be matched that
4853 follow this item. We calculate it ahead of time as once the
4854 lookbehind offset is added in we lose the ability to correctly
4855 calculate it.*/
4856 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4857 : (I32)longest_float_length;
1de06328
YO
4858 r->float_end_shift = ml - data.offset_float_min
4859 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4860 + data.lookbehind_float;
4861 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4862 r->float_max_offset = data.offset_float_max;
1182767e 4863 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4864 r->float_max_offset -= data.lookbehind_float;
4865
cf93c79d
IZ
4866 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4867 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4868 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4869 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4870 }
4871 else {
aca2d497 4872 remove_float:
c445ea15 4873 r->float_substr = r->float_utf8 = NULL;
c277df42 4874 SvREFCNT_dec(data.longest_float);
c5254dd6 4875 longest_float_length = 0;
a0d0e21e 4876 }
c277df42 4877
1de06328
YO
4878 /* Note that code very similar to this but for floating string
4879 is immediately above, changes may need to be made to both.
4880 Be careful.
4881 */
a0ed51b3 4882 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4883 if (longest_fixed_length
c277df42
IZ
4884 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4885 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4886 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4887 {
1182767e 4888 I32 t,ml;
cf93c79d 4889
1de06328
YO
4890 /* copy the information about the longest fixed
4891 from the reg_scan_data over to the program. */
33b8afdf
JH
4892 if (SvUTF8(data.longest_fixed)) {
4893 r->anchored_utf8 = data.longest_fixed;
c445ea15 4894 r->anchored_substr = NULL;
33b8afdf
JH
4895 } else {
4896 r->anchored_substr = data.longest_fixed;
c445ea15 4897 r->anchored_utf8 = NULL;
33b8afdf 4898 }
1de06328
YO
4899 /* fixed_end_shift is how many chars that must be matched that
4900 follow this item. We calculate it ahead of time as once the
4901 lookbehind offset is added in we lose the ability to correctly
4902 calculate it.*/
4903 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4904 : (I32)longest_fixed_length;
1de06328
YO
4905 r->anchored_end_shift = ml - data.offset_fixed
4906 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4907 + data.lookbehind_fixed;
4908 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4909
cf93c79d
IZ
4910 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4911 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4912 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4913 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4914 }
4915 else {
c445ea15 4916 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4917 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4918 longest_fixed_length = 0;
a0d0e21e 4919 }
f8fc2ecf
YO
4920 if (ri->regstclass
4921 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4922 ri->regstclass = NULL;
33b8afdf
JH
4923 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4924 && stclass_flag
653099ff 4925 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4926 && !cl_is_anything(data.start_class))
4927 {
2eccd3b2 4928 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4929
f8fc2ecf 4930 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4931 struct regnode_charclass_class);
4932 StructCopy(data.start_class,
f8fc2ecf 4933 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4934 struct regnode_charclass_class);
f8fc2ecf 4935 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4936 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4937 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4938 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4939 PerlIO_printf(Perl_debug_log,
a0288114 4940 "synthetic stclass \"%s\".\n",
3f7c398e 4941 SvPVX_const(sv));});
653099ff 4942 }
c277df42
IZ
4943
4944 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4945 if (longest_fixed_length > longest_float_length) {
1de06328 4946 r->check_end_shift = r->anchored_end_shift;
c277df42 4947 r->check_substr = r->anchored_substr;
33b8afdf 4948 r->check_utf8 = r->anchored_utf8;
c277df42 4949 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4950 if (r->extflags & RXf_ANCH_SINGLE)
4951 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4952 }
4953 else {
1de06328 4954 r->check_end_shift = r->float_end_shift;
c277df42 4955 r->check_substr = r->float_substr;
33b8afdf 4956 r->check_utf8 = r->float_utf8;
1de06328
YO
4957 r->check_offset_min = r->float_min_offset;
4958 r->check_offset_max = r->float_max_offset;
a0d0e21e 4959 }
30382c73
IZ
4960 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4961 This should be changed ASAP! */
bbe252da
YO
4962 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4963 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4964 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4965 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4966 }
1de06328
YO
4967 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4968 if ( (STRLEN)minlen < longest_float_length )
4969 minlen= longest_float_length;
4970 if ( (STRLEN)minlen < longest_fixed_length )
4971 minlen= longest_fixed_length;
4972 */
a0ed51b3
LW
4973 }
4974 else {
c277df42
IZ
4975 /* Several toplevels. Best we can is to set minlen. */
4976 I32 fake;
653099ff 4977 struct regnode_charclass_class ch_class;
cb434fcc 4978 I32 last_close = 0;
c277df42 4979
5d458dd8 4980 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4981
f8fc2ecf 4982 scan = ri->program + 1;
830247a4 4983 cl_init(pRExC_state, &ch_class);
653099ff 4984 data.start_class = &ch_class;
cb434fcc 4985 data.last_closep = &last_close;
07be1b83 4986
de8c5301 4987
1de06328 4988 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4989 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4990
786e8c11 4991 CHECK_RESTUDY_GOTO;
07be1b83 4992
33b8afdf 4993 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4994 = r->float_substr = r->float_utf8 = NULL;
653099ff 4995 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4996 && !cl_is_anything(data.start_class))
4997 {
2eccd3b2 4998 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4999
f8fc2ecf 5000 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5001 struct regnode_charclass_class);
5002 StructCopy(data.start_class,
f8fc2ecf 5003 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5004 struct regnode_charclass_class);
f8fc2ecf 5005 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5006 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5007 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5008 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5009 PerlIO_printf(Perl_debug_log,
a0288114 5010 "synthetic stclass \"%s\".\n",
3f7c398e 5011 SvPVX_const(sv));});
653099ff 5012 }
a0d0e21e
LW
5013 }
5014
1de06328
YO
5015 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5016 the "real" pattern. */
cf9788e3
RGS
5017 DEBUG_OPTIMISE_r({
5018 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5019 (IV)minlen, (IV)r->minlen);
cf9788e3 5020 });
de8c5301 5021 r->minlenret = minlen;
1de06328
YO
5022 if (r->minlen < minlen)
5023 r->minlen = minlen;
5024
b81d288d 5025 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5026 r->extflags |= RXf_GPOS_SEEN;
830247a4 5027 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5028 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5029 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5030 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5031 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5032 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5033 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5034 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5035 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5036 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5037 if (RExC_paren_names)
85fbaab2 5038 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5039 else
5daac39c 5040 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5041
7bd1e614 5042#ifdef STUPID_PATTERN_CHECKS
5509d87a 5043 if (RX_PRELEN(rx) == 0)
640f820d 5044 r->extflags |= RXf_NULL;
5509d87a 5045 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5046 /* XXX: this should happen BEFORE we compile */
5047 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5048 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5049 r->extflags |= RXf_WHITE;
5509d87a 5050 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5051 r->extflags |= RXf_START_ONLY;
f1b875a0 5052#else
5509d87a 5053 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5054 /* XXX: this should happen BEFORE we compile */
5055 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5056 else {
5057 regnode *first = ri->program + 1;
39aa8307
JH
5058 U8 fop = OP(first);
5059 U8 nop = OP(NEXTOPER(first));
7bd1e614 5060
640f820d
AB
5061 if (PL_regkind[fop] == NOTHING && nop == END)
5062 r->extflags |= RXf_NULL;
5063 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5064 r->extflags |= RXf_START_ONLY;
5065 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5066 r->extflags |= RXf_WHITE;
5067 }
f1b875a0 5068#endif
1f1031fe
YO
5069#ifdef DEBUGGING
5070 if (RExC_paren_names) {
af534a04 5071 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5072 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5073 } else
1f1031fe 5074#endif
cde0cee5 5075 ri->name_list_idx = 0;
1f1031fe 5076
40d049e4
YO
5077 if (RExC_recurse_count) {
5078 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5079 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5080 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5081 }
5082 }
f0ab9afb 5083 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5084 /* assume we don't need to swap parens around before we match */
5085
be8e71aa
YO
5086 DEBUG_DUMP_r({
5087 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5088 regdump(r);
5089 });
7122b237
YO
5090#ifdef RE_TRACK_PATTERN_OFFSETS
5091 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5092 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5093 U32 i;
5094 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5095 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5096 for (i = 1; i <= len; i++) {
7122b237 5097 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5098 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5099 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5100 }
5101 PerlIO_printf(Perl_debug_log, "\n");
5102 });
7122b237 5103#endif
288b8c02 5104 return rx;
a687059c
LW
5105}
5106
f9f4320a 5107#undef RE_ENGINE_PTR
3dab1dad 5108
93b32b6d 5109
81714fb9 5110SV*
192b9cd1
AB
5111Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5112 const U32 flags)
5113{
7918f24d
NC
5114 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5115
192b9cd1
AB
5116 PERL_UNUSED_ARG(value);
5117
f1b875a0 5118 if (flags & RXapif_FETCH) {
192b9cd1 5119 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5120 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5121 Perl_croak_no_modify(aTHX);
192b9cd1 5122 return NULL;
f1b875a0 5123 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5124 return reg_named_buff_exists(rx, key, flags)
5125 ? &PL_sv_yes
5126 : &PL_sv_no;
f1b875a0 5127 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5128 return reg_named_buff_all(rx, flags);
f1b875a0 5129 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5130 return reg_named_buff_scalar(rx, flags);
5131 } else {
5132 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5133 return NULL;
5134 }
5135}
5136
5137SV*
5138Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5139 const U32 flags)
5140{
7918f24d 5141 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5142 PERL_UNUSED_ARG(lastkey);
5143
f1b875a0 5144 if (flags & RXapif_FIRSTKEY)
192b9cd1 5145 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5146 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5147 return reg_named_buff_nextkey(rx, flags);
5148 else {
5149 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5150 return NULL;
5151 }
5152}
5153
5154SV*
288b8c02
NC
5155Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5156 const U32 flags)
81714fb9 5157{
44a2ac75
YO
5158 AV *retarray = NULL;
5159 SV *ret;
288b8c02 5160 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5161
5162 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5163
f1b875a0 5164 if (flags & RXapif_ALL)
44a2ac75 5165 retarray=newAV();
93b32b6d 5166
5daac39c
NC
5167 if (rx && RXp_PAREN_NAMES(rx)) {
5168 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5169 if (he_str) {
5170 IV i;
5171 SV* sv_dat=HeVAL(he_str);
5172 I32 *nums=(I32*)SvPVX(sv_dat);
5173 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5174 if ((I32)(rx->nparens) >= nums[i]
5175 && rx->offs[nums[i]].start != -1
5176 && rx->offs[nums[i]].end != -1)
93b32b6d 5177 {
49d7dfbc 5178 ret = newSVpvs("");
288b8c02 5179 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5180 if (!retarray)
5181 return ret;
5182 } else {
5183 ret = newSVsv(&PL_sv_undef);
5184 }
ec83ea38 5185 if (retarray)
93b32b6d 5186 av_push(retarray, ret);
81714fb9 5187 }
93b32b6d 5188 if (retarray)
ad64d0ec 5189 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5190 }
5191 }
5192 return NULL;
5193}
5194
5195bool
288b8c02 5196Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5197 const U32 flags)
5198{
288b8c02 5199 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5200
5201 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5202
5daac39c 5203 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5204 if (flags & RXapif_ALL) {
5daac39c 5205 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5206 } else {
288b8c02 5207 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5208 if (sv) {
5209 SvREFCNT_dec(sv);
192b9cd1
AB
5210 return TRUE;
5211 } else {
5212 return FALSE;
5213 }
5214 }
5215 } else {
5216 return FALSE;
5217 }
5218}
5219
5220SV*
288b8c02 5221Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5222{
288b8c02 5223 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5224
5225 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5226
5daac39c
NC
5227 if ( rx && RXp_PAREN_NAMES(rx) ) {
5228 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5229
288b8c02 5230 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5231 } else {
5232 return FALSE;
5233 }
192b9cd1
AB
5234}
5235
5236SV*
288b8c02 5237Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5238{
288b8c02 5239 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5240 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5241
5242 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5243
5daac39c
NC
5244 if (rx && RXp_PAREN_NAMES(rx)) {
5245 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5246 HE *temphe;
5247 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5248 IV i;
5249 IV parno = 0;
5250 SV* sv_dat = HeVAL(temphe);
5251 I32 *nums = (I32*)SvPVX(sv_dat);
5252 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5253 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5254 rx->offs[nums[i]].start != -1 &&
5255 rx->offs[nums[i]].end != -1)
5256 {
5257 parno = nums[i];
5258 break;
5259 }
5260 }
f1b875a0 5261 if (parno || flags & RXapif_ALL) {
a663657d 5262 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5263 }
81714fb9
YO
5264 }
5265 }
44a2ac75
YO
5266 return NULL;
5267}
5268
192b9cd1 5269SV*
288b8c02 5270Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5271{
5272 SV *ret;
5273 AV *av;
5274 I32 length;
288b8c02 5275 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5276
7918f24d
NC
5277 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5278
5daac39c 5279 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5280 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5281 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5282 } else if (flags & RXapif_ONE) {
288b8c02 5283 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5284 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5285 length = av_len(av);
ec83ea38 5286 SvREFCNT_dec(ret);
192b9cd1
AB
5287 return newSViv(length + 1);
5288 } else {
5289 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5290 return NULL;
5291 }
5292 }
5293 return &PL_sv_undef;
5294}
5295
5296SV*
288b8c02 5297Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5298{
288b8c02 5299 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5300 AV *av = newAV();
5301
7918f24d
NC
5302 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5303
5daac39c
NC
5304 if (rx && RXp_PAREN_NAMES(rx)) {
5305 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5306 HE *temphe;
5307 (void)hv_iterinit(hv);
5308 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5309 IV i;
5310 IV parno = 0;
5311 SV* sv_dat = HeVAL(temphe);
5312 I32 *nums = (I32*)SvPVX(sv_dat);
5313 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5314 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5315 rx->offs[nums[i]].start != -1 &&
5316 rx->offs[nums[i]].end != -1)
5317 {
5318 parno = nums[i];
5319 break;
5320 }
5321 }
f1b875a0 5322 if (parno || flags & RXapif_ALL) {
a663657d 5323 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5324 }
5325 }
5326 }
5327
ad64d0ec 5328 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5329}
5330
49d7dfbc 5331void
288b8c02
NC
5332Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5333 SV * const sv)
44a2ac75 5334{
288b8c02 5335 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5336 char *s = NULL;
a9d504c3 5337 I32 i = 0;
44a2ac75 5338 I32 s1, t1;
7918f24d
NC
5339
5340 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5341
cde0cee5
YO
5342 if (!rx->subbeg) {
5343 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5344 return;
cde0cee5
YO
5345 }
5346 else
f1b875a0 5347 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5348 /* $` */
f0ab9afb 5349 i = rx->offs[0].start;
cde0cee5 5350 s = rx->subbeg;
44a2ac75
YO
5351 }
5352 else
f1b875a0 5353 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5354 /* $' */
f0ab9afb
NC
5355 s = rx->subbeg + rx->offs[0].end;
5356 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5357 }
5358 else
5359 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5360 (s1 = rx->offs[paren].start) != -1 &&
5361 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5362 {
5363 /* $& $1 ... */
5364 i = t1 - s1;
5365 s = rx->subbeg + s1;
cde0cee5
YO
5366 } else {
5367 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5368 return;
cde0cee5
YO
5369 }
5370 assert(rx->sublen >= (s - rx->subbeg) + i );
5371 if (i >= 0) {
5372 const int oldtainted = PL_tainted;
5373 TAINT_NOT;
5374 sv_setpvn(sv, s, i);
5375 PL_tainted = oldtainted;
5376 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5377 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5378 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5379 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5380 {
5381 SvUTF8_on(sv);
5382 }
5383 else
5384 SvUTF8_off(sv);
5385 if (PL_tainting) {
07bc277f 5386 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5387 if (SvTYPE(sv) >= SVt_PVMG) {
5388 MAGIC* const mg = SvMAGIC(sv);
5389 MAGIC* mgt;
5390 PL_tainted = 1;
5391 SvMAGIC_set(sv, mg->mg_moremagic);
5392 SvTAINT(sv);
5393 if ((mgt = SvMAGIC(sv))) {
5394 mg->mg_moremagic = mgt;
5395 SvMAGIC_set(sv, mg);
44a2ac75 5396 }
cde0cee5
YO
5397 } else {
5398 PL_tainted = 1;
5399 SvTAINT(sv);
5400 }
5401 } else
5402 SvTAINTED_off(sv);
44a2ac75 5403 }
81714fb9 5404 } else {
44a2ac75 5405 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5406 return;
81714fb9
YO
5407 }
5408}
93b32b6d 5409
2fdbfb4d
AB
5410void
5411Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5412 SV const * const value)
5413{
7918f24d
NC
5414 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5415
2fdbfb4d
AB
5416 PERL_UNUSED_ARG(rx);
5417 PERL_UNUSED_ARG(paren);
5418 PERL_UNUSED_ARG(value);
5419
5420 if (!PL_localizing)
6ad8f254 5421 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5422}
5423
5424I32
288b8c02 5425Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5426 const I32 paren)
5427{
288b8c02 5428 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5429 I32 i;
5430 I32 s1, t1;
5431
7918f24d
NC
5432 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5433
2fdbfb4d
AB
5434 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5435 switch (paren) {
192b9cd1 5436 /* $` / ${^PREMATCH} */
f1b875a0 5437 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5438 if (rx->offs[0].start != -1) {
5439 i = rx->offs[0].start;
5440 if (i > 0) {
5441 s1 = 0;
5442 t1 = i;
5443 goto getlen;
5444 }
5445 }
5446 return 0;
192b9cd1 5447 /* $' / ${^POSTMATCH} */
f1b875a0 5448 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5449 if (rx->offs[0].end != -1) {
5450 i = rx->sublen - rx->offs[0].end;
5451 if (i > 0) {
5452 s1 = rx->offs[0].end;
5453 t1 = rx->sublen;
5454 goto getlen;
5455 }
5456 }
5457 return 0;
192b9cd1
AB
5458 /* $& / ${^MATCH}, $1, $2, ... */
5459 default:
2fdbfb4d
AB
5460 if (paren <= (I32)rx->nparens &&
5461 (s1 = rx->offs[paren].start) != -1 &&
5462 (t1 = rx->offs[paren].end) != -1)
5463 {
5464 i = t1 - s1;
5465 goto getlen;
5466 } else {
5467 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5468 report_uninit((const SV *)sv);
2fdbfb4d
AB
5469 return 0;
5470 }
5471 }
5472 getlen:
07bc277f 5473 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5474 const char * const s = rx->subbeg + s1;
5475 const U8 *ep;
5476 STRLEN el;
5477
5478 i = t1 - s1;
5479 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5480 i = el;
5481 }
5482 return i;
5483}
5484
fe578d7f 5485SV*
49d7dfbc 5486Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5487{
7918f24d 5488 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5489 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5490 if (0)
5491 return NULL;
5492 else
5493 return newSVpvs("Regexp");
fe578d7f 5494}
0a4db386 5495
894be9b7 5496/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5497 * If flags is REG_RSN_RETURN_NULL returns null.
5498 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5499 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5500 * to the parsed name as looked up in the RExC_paren_names hash.
5501 * If there is an error throws a vFAIL().. type exception.
894be9b7 5502 */
0a4db386
YO
5503
5504#define REG_RSN_RETURN_NULL 0
5505#define REG_RSN_RETURN_NAME 1
5506#define REG_RSN_RETURN_DATA 2
5507
894be9b7 5508STATIC SV*
7918f24d
NC
5509S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5510{
894be9b7 5511 char *name_start = RExC_parse;
1f1031fe 5512
7918f24d
NC
5513 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5514
1f1031fe
YO
5515 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5516 /* skip IDFIRST by using do...while */
5517 if (UTF)
5518 do {
5519 RExC_parse += UTF8SKIP(RExC_parse);
5520 } while (isALNUM_utf8((U8*)RExC_parse));
5521 else
5522 do {
5523 RExC_parse++;
5524 } while (isALNUM(*RExC_parse));
894be9b7 5525 }
1f1031fe 5526
0a4db386 5527 if ( flags ) {
59cd0e26
NC
5528 SV* sv_name
5529 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5530 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5531 if ( flags == REG_RSN_RETURN_NAME)
5532 return sv_name;
5533 else if (flags==REG_RSN_RETURN_DATA) {
5534 HE *he_str = NULL;
5535 SV *sv_dat = NULL;
5536 if ( ! sv_name ) /* should not happen*/
5537 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5538 if (RExC_paren_names)
5539 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5540 if ( he_str )
5541 sv_dat = HeVAL(he_str);
5542 if ( ! sv_dat )
5543 vFAIL("Reference to nonexistent named group");
5544 return sv_dat;
5545 }
5546 else {
5547 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5548 }
5549 /* NOT REACHED */
894be9b7 5550 }
0a4db386 5551 return NULL;
894be9b7
YO
5552}
5553
3dab1dad
YO
5554#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5555 int rem=(int)(RExC_end - RExC_parse); \
5556 int cut; \
5557 int num; \
5558 int iscut=0; \
5559 if (rem>10) { \
5560 rem=10; \
5561 iscut=1; \
5562 } \
5563 cut=10-rem; \
5564 if (RExC_lastparse!=RExC_parse) \
5565 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5566 rem, RExC_parse, \
5567 cut + 4, \
5568 iscut ? "..." : "<" \
5569 ); \
5570 else \
5571 PerlIO_printf(Perl_debug_log,"%16s",""); \
5572 \
5573 if (SIZE_ONLY) \
3b57cd43 5574 num = RExC_size + 1; \
3dab1dad
YO
5575 else \
5576 num=REG_NODE_NUM(RExC_emit); \
5577 if (RExC_lastnum!=num) \
0a4db386 5578 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5579 else \
0a4db386 5580 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5581 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5582 (int)((depth*2)), "", \
3dab1dad
YO
5583 (funcname) \
5584 ); \
5585 RExC_lastnum=num; \
5586 RExC_lastparse=RExC_parse; \
5587})
5588
07be1b83
YO
5589
5590
3dab1dad
YO
5591#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5592 DEBUG_PARSE_MSG((funcname)); \
5593 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5594})
6bda09f9
YO
5595#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5596 DEBUG_PARSE_MSG((funcname)); \
5597 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5598})
a687059c
LW
5599/*
5600 - reg - regular expression, i.e. main body or parenthesized thing
5601 *
5602 * Caller must absorb opening parenthesis.
5603 *
5604 * Combining parenthesis handling with the base level of regular expression
5605 * is a trifle forced, but the need to tie the tails of the branches to what
5606 * follows makes it hard to avoid.
5607 */
07be1b83
YO
5608#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5609#ifdef DEBUGGING
5610#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5611#else
5612#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5613#endif
3dab1dad 5614
76e3520e 5615STATIC regnode *
3dab1dad 5616S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5617 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5618{
27da23d5 5619 dVAR;
c277df42
IZ
5620 register regnode *ret; /* Will be the head of the group. */
5621 register regnode *br;
5622 register regnode *lastbr;
cbbf8932 5623 register regnode *ender = NULL;
a0d0e21e 5624 register I32 parno = 0;
cbbf8932 5625 I32 flags;
f7819f85 5626 U32 oregflags = RExC_flags;
6136c704
AL
5627 bool have_branch = 0;
5628 bool is_open = 0;
594d7033
YO
5629 I32 freeze_paren = 0;
5630 I32 after_freeze = 0;
9d1d55b5
JP
5631
5632 /* for (?g), (?gc), and (?o) warnings; warning
5633 about (?c) will warn about (?g) -- japhy */
5634
6136c704
AL
5635#define WASTED_O 0x01
5636#define WASTED_G 0x02
5637#define WASTED_C 0x04
5638#define WASTED_GC (0x02|0x04)
cbbf8932 5639 I32 wastedflags = 0x00;
9d1d55b5 5640
fac92740 5641 char * parse_start = RExC_parse; /* MJD */
a28509cc 5642 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5643
3dab1dad 5644 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5645
5646 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5647 DEBUG_PARSE("reg ");
5648
821b33a5 5649 *flagp = 0; /* Tentatively. */
a0d0e21e 5650
9d1d55b5 5651
a0d0e21e
LW
5652 /* Make an OPEN node, if parenthesized. */
5653 if (paren) {
e2e6a0f1
YO
5654 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5655 char *start_verb = RExC_parse;
5656 STRLEN verb_len = 0;
5657 char *start_arg = NULL;
5658 unsigned char op = 0;
5659 int argok = 1;
5660 int internal_argval = 0; /* internal_argval is only useful if !argok */
5661 while ( *RExC_parse && *RExC_parse != ')' ) {
5662 if ( *RExC_parse == ':' ) {
5663 start_arg = RExC_parse + 1;
5664 break;
5665 }
5666 RExC_parse++;
5667 }
5668 ++start_verb;
5669 verb_len = RExC_parse - start_verb;
5670 if ( start_arg ) {
5671 RExC_parse++;
5672 while ( *RExC_parse && *RExC_parse != ')' )
5673 RExC_parse++;
5674 if ( *RExC_parse != ')' )
5675 vFAIL("Unterminated verb pattern argument");
5676 if ( RExC_parse == start_arg )
5677 start_arg = NULL;
5678 } else {
5679 if ( *RExC_parse != ')' )
5680 vFAIL("Unterminated verb pattern");
5681 }
5d458dd8 5682
e2e6a0f1
YO
5683 switch ( *start_verb ) {
5684 case 'A': /* (*ACCEPT) */
568a785a 5685 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5686 op = ACCEPT;
5687 internal_argval = RExC_nestroot;
5688 }
5689 break;
5690 case 'C': /* (*COMMIT) */
568a785a 5691 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5692 op = COMMIT;
e2e6a0f1
YO
5693 break;
5694 case 'F': /* (*FAIL) */
568a785a 5695 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5696 op = OPFAIL;
5697 argok = 0;
5698 }
5699 break;
5d458dd8
YO
5700 case ':': /* (*:NAME) */
5701 case 'M': /* (*MARK:NAME) */
568a785a 5702 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5703 op = MARKPOINT;
5d458dd8
YO
5704 argok = -1;
5705 }
5706 break;
5707 case 'P': /* (*PRUNE) */
568a785a 5708 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5709 op = PRUNE;
e2e6a0f1 5710 break;
5d458dd8 5711 case 'S': /* (*SKIP) */
568a785a 5712 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5713 op = SKIP;
5714 break;
5715 case 'T': /* (*THEN) */
5716 /* [19:06] <TimToady> :: is then */
568a785a 5717 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5718 op = CUTGROUP;
5719 RExC_seen |= REG_SEEN_CUTGROUP;
5720 }
e2e6a0f1
YO
5721 break;
5722 }
5723 if ( ! op ) {
5724 RExC_parse++;
5725 vFAIL3("Unknown verb pattern '%.*s'",
5726 verb_len, start_verb);
5727 }
5728 if ( argok ) {
5729 if ( start_arg && internal_argval ) {
5730 vFAIL3("Verb pattern '%.*s' may not have an argument",
5731 verb_len, start_verb);
5732 } else if ( argok < 0 && !start_arg ) {
5733 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5734 verb_len, start_verb);
5735 } else {
5736 ret = reganode(pRExC_state, op, internal_argval);
5737 if ( ! internal_argval && ! SIZE_ONLY ) {
5738 if (start_arg) {
5739 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5740 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5741 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5742 ret->flags = 0;
5743 } else {
5744 ret->flags = 1;
5745 }
5746 }
5747 }
5748 if (!internal_argval)
5749 RExC_seen |= REG_SEEN_VERBARG;
5750 } else if ( start_arg ) {
5751 vFAIL3("Verb pattern '%.*s' may not have an argument",
5752 verb_len, start_verb);
5753 } else {
5754 ret = reg_node(pRExC_state, op);
5755 }
5756 nextchar(pRExC_state);
5757 return ret;
5758 } else
fac92740 5759 if (*RExC_parse == '?') { /* (?...) */
6136c704 5760 bool is_logical = 0;
a28509cc 5761 const char * const seqstart = RExC_parse;
fb85c044 5762 bool has_use_defaults = FALSE;
ca9dfc88 5763
830247a4
IZ
5764 RExC_parse++;
5765 paren = *RExC_parse++;
c277df42 5766 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5767 switch (paren) {
894be9b7 5768
1f1031fe
YO
5769 case 'P': /* (?P...) variants for those used to PCRE/Python */
5770 paren = *RExC_parse++;
5771 if ( paren == '<') /* (?P<...>) named capture */
5772 goto named_capture;
5773 else if (paren == '>') { /* (?P>name) named recursion */
5774 goto named_recursion;
5775 }
5776 else if (paren == '=') { /* (?P=...) named backref */
5777 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5778 you change this make sure you change that */
5779 char* name_start = RExC_parse;
5780 U32 num = 0;
5781 SV *sv_dat = reg_scan_name(pRExC_state,
5782 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5783 if (RExC_parse == name_start || *RExC_parse != ')')
5784 vFAIL2("Sequence %.3s... not terminated",parse_start);
5785
5786 if (!SIZE_ONLY) {
5787 num = add_data( pRExC_state, 1, "S" );
5788 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5789 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5790 }
5791 RExC_sawback = 1;
5792 ret = reganode(pRExC_state,
5793 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5794 num);
5795 *flagp |= HASWIDTH;
5796
5797 Set_Node_Offset(ret, parse_start+1);
5798 Set_Node_Cur_Length(ret); /* MJD */
5799
5800 nextchar(pRExC_state);
5801 return ret;
5802 }
57b84237
YO
5803 RExC_parse++;
5804 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5805 /*NOTREACHED*/
5806 case '<': /* (?<...) */
b81d288d 5807 if (*RExC_parse == '!')
c277df42 5808 paren = ',';
0a4db386 5809 else if (*RExC_parse != '=')
1f1031fe 5810 named_capture:
0a4db386 5811 { /* (?<...>) */
81714fb9 5812 char *name_start;
894be9b7 5813 SV *svname;
81714fb9
YO
5814 paren= '>';
5815 case '\'': /* (?'...') */
5816 name_start= RExC_parse;
0a4db386
YO
5817 svname = reg_scan_name(pRExC_state,
5818 SIZE_ONLY ? /* reverse test from the others */
5819 REG_RSN_RETURN_NAME :
5820 REG_RSN_RETURN_NULL);
57b84237
YO
5821 if (RExC_parse == name_start) {
5822 RExC_parse++;
5823 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5824 /*NOTREACHED*/
5825 }
81714fb9
YO
5826 if (*RExC_parse != paren)
5827 vFAIL2("Sequence (?%c... not terminated",
5828 paren=='>' ? '<' : paren);
5829 if (SIZE_ONLY) {
e62cc96a
YO
5830 HE *he_str;
5831 SV *sv_dat = NULL;
894be9b7
YO
5832 if (!svname) /* shouldnt happen */
5833 Perl_croak(aTHX_
5834 "panic: reg_scan_name returned NULL");
81714fb9
YO
5835 if (!RExC_paren_names) {
5836 RExC_paren_names= newHV();
ad64d0ec 5837 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5838#ifdef DEBUGGING
5839 RExC_paren_name_list= newAV();
ad64d0ec 5840 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5841#endif
81714fb9
YO
5842 }
5843 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5844 if ( he_str )
81714fb9 5845 sv_dat = HeVAL(he_str);
e62cc96a 5846 if ( ! sv_dat ) {
81714fb9 5847 /* croak baby croak */
e62cc96a
YO
5848 Perl_croak(aTHX_
5849 "panic: paren_name hash element allocation failed");
5850 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5851 /* (?|...) can mean we have dupes so scan to check
5852 its already been stored. Maybe a flag indicating
5853 we are inside such a construct would be useful,
5854 but the arrays are likely to be quite small, so
5855 for now we punt -- dmq */
5856 IV count = SvIV(sv_dat);
5857 I32 *pv = (I32*)SvPVX(sv_dat);
5858 IV i;
5859 for ( i = 0 ; i < count ; i++ ) {
5860 if ( pv[i] == RExC_npar ) {
5861 count = 0;
5862 break;
5863 }
5864 }
5865 if ( count ) {
5866 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5867 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5868 pv[count] = RExC_npar;
3a92e6ae 5869 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5870 }
81714fb9
YO
5871 } else {
5872 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5873 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5874 SvIOK_on(sv_dat);
3ec35e0f 5875 SvIV_set(sv_dat, 1);
e62cc96a 5876 }
1f1031fe
YO
5877#ifdef DEBUGGING
5878 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5879 SvREFCNT_dec(svname);
5880#endif
e62cc96a 5881
81714fb9
YO
5882 /*sv_dump(sv_dat);*/
5883 }
5884 nextchar(pRExC_state);
5885 paren = 1;
5886 goto capturing_parens;
5887 }
5888 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5889 RExC_parse++;
fac92740 5890 case '=': /* (?=...) */
89c6a13e 5891 RExC_seen_zerolen++;
5c3fa2e7 5892 break;
fac92740 5893 case '!': /* (?!...) */
830247a4 5894 RExC_seen_zerolen++;
e2e6a0f1
YO
5895 if (*RExC_parse == ')') {
5896 ret=reg_node(pRExC_state, OPFAIL);
5897 nextchar(pRExC_state);
5898 return ret;
5899 }
594d7033
YO
5900 break;
5901 case '|': /* (?|...) */
5902 /* branch reset, behave like a (?:...) except that
5903 buffers in alternations share the same numbers */
5904 paren = ':';
5905 after_freeze = freeze_paren = RExC_npar;
5906 break;
fac92740
MJD
5907 case ':': /* (?:...) */
5908 case '>': /* (?>...) */
a0d0e21e 5909 break;
fac92740
MJD
5910 case '$': /* (?$...) */
5911 case '@': /* (?@...) */
8615cb43 5912 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5913 break;
fac92740 5914 case '#': /* (?#...) */
830247a4
IZ
5915 while (*RExC_parse && *RExC_parse != ')')
5916 RExC_parse++;
5917 if (*RExC_parse != ')')
c277df42 5918 FAIL("Sequence (?#... not terminated");
830247a4 5919 nextchar(pRExC_state);
a0d0e21e
LW
5920 *flagp = TRYAGAIN;
5921 return NULL;
894be9b7
YO
5922 case '0' : /* (?0) */
5923 case 'R' : /* (?R) */
5924 if (*RExC_parse != ')')
6bda09f9 5925 FAIL("Sequence (?R) not terminated");
1a147d38 5926 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5927 *flagp |= POSTPONED;
7f69552c
YO
5928 nextchar(pRExC_state);
5929 return ret;
5930 /*notreached*/
894be9b7
YO
5931 { /* named and numeric backreferences */
5932 I32 num;
894be9b7
YO
5933 case '&': /* (?&NAME) */
5934 parse_start = RExC_parse - 1;
1f1031fe 5935 named_recursion:
894be9b7 5936 {
0a4db386
YO
5937 SV *sv_dat = reg_scan_name(pRExC_state,
5938 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5939 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5940 }
5941 goto gen_recurse_regop;
5942 /* NOT REACHED */
542fa716
YO
5943 case '+':
5944 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5945 RExC_parse++;
5946 vFAIL("Illegal pattern");
5947 }
5948 goto parse_recursion;
5949 /* NOT REACHED*/
5950 case '-': /* (?-1) */
5951 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5952 RExC_parse--; /* rewind to let it be handled later */
5953 goto parse_flags;
5954 }
5955 /*FALLTHROUGH */
6bda09f9
YO
5956 case '1': case '2': case '3': case '4': /* (?1) */
5957 case '5': case '6': case '7': case '8': case '9':
5958 RExC_parse--;
542fa716 5959 parse_recursion:
894be9b7
YO
5960 num = atoi(RExC_parse);
5961 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5962 if (*RExC_parse == '-')
5963 RExC_parse++;
6bda09f9
YO
5964 while (isDIGIT(*RExC_parse))
5965 RExC_parse++;
5966 if (*RExC_parse!=')')
5967 vFAIL("Expecting close bracket");
894be9b7
YO
5968
5969 gen_recurse_regop:
542fa716
YO
5970 if ( paren == '-' ) {
5971 /*
5972 Diagram of capture buffer numbering.
5973 Top line is the normal capture buffer numbers
5974 Botton line is the negative indexing as from
5975 the X (the (?-2))
5976
5977 + 1 2 3 4 5 X 6 7
5978 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5979 - 5 4 3 2 1 X x x
5980
5981 */
5982 num = RExC_npar + num;
5983 if (num < 1) {
5984 RExC_parse++;
5985 vFAIL("Reference to nonexistent group");
5986 }
5987 } else if ( paren == '+' ) {
5988 num = RExC_npar + num - 1;
5989 }
5990
1a147d38 5991 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5992 if (!SIZE_ONLY) {
5993 if (num > (I32)RExC_rx->nparens) {
5994 RExC_parse++;
5995 vFAIL("Reference to nonexistent group");
5996 }
40d049e4 5997 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5998 RExC_emit++;
226de585 5999 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6000 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6001 } else {
6bda09f9 6002 RExC_size++;
6bda09f9 6003 }
0a4db386 6004 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6005 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6006 Set_Node_Offset(ret, parse_start); /* MJD */
6007
a3b492c3 6008 *flagp |= POSTPONED;
6bda09f9
YO
6009 nextchar(pRExC_state);
6010 return ret;
894be9b7
YO
6011 } /* named and numeric backreferences */
6012 /* NOT REACHED */
6013
fac92740 6014 case '?': /* (??...) */
6136c704 6015 is_logical = 1;
57b84237
YO
6016 if (*RExC_parse != '{') {
6017 RExC_parse++;
6018 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6019 /*NOTREACHED*/
6020 }
a3b492c3 6021 *flagp |= POSTPONED;
830247a4 6022 paren = *RExC_parse++;
0f5d15d6 6023 /* FALL THROUGH */
fac92740 6024 case '{': /* (?{...}) */
c277df42 6025 {
2eccd3b2
NC
6026 I32 count = 1;
6027 U32 n = 0;
c277df42 6028 char c;
830247a4 6029 char *s = RExC_parse;
c277df42 6030
830247a4
IZ
6031 RExC_seen_zerolen++;
6032 RExC_seen |= REG_SEEN_EVAL;
6033 while (count && (c = *RExC_parse)) {
6136c704
AL
6034 if (c == '\\') {
6035 if (RExC_parse[1])
6036 RExC_parse++;
6037 }
b81d288d 6038 else if (c == '{')
c277df42 6039 count++;
b81d288d 6040 else if (c == '}')
c277df42 6041 count--;
830247a4 6042 RExC_parse++;
c277df42 6043 }
6136c704 6044 if (*RExC_parse != ')') {
b81d288d 6045 RExC_parse = s;
b45f050a
JF
6046 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6047 }
c277df42 6048 if (!SIZE_ONLY) {
f3548bdc 6049 PAD *pad;
6136c704
AL
6050 OP_4tree *sop, *rop;
6051 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6052
569233ed
SB
6053 ENTER;
6054 Perl_save_re_context(aTHX);
f3548bdc 6055 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
6056 sop->op_private |= OPpREFCOUNTED;
6057 /* re_dup will OpREFCNT_inc */
6058 OpREFCNT_set(sop, 1);
569233ed 6059 LEAVE;
c277df42 6060
830247a4 6061 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6062 RExC_rxi->data->data[n] = (void*)rop;
6063 RExC_rxi->data->data[n+1] = (void*)sop;
6064 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6065 SvREFCNT_dec(sv);
a0ed51b3 6066 }
e24b16f9 6067 else { /* First pass */
830247a4 6068 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6069 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6070 /* No compiled RE interpolated, has runtime
6071 components ===> unsafe. */
6072 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6073 if (PL_tainting && PL_tainted)
cc6b7395 6074 FAIL("Eval-group in insecure regular expression");
54df2634 6075#if PERL_VERSION > 8
923e4eb5 6076 if (IN_PERL_COMPILETIME)
b5c19bd7 6077 PL_cv_has_eval = 1;
54df2634 6078#endif
c277df42 6079 }
b5c19bd7 6080
830247a4 6081 nextchar(pRExC_state);
6136c704 6082 if (is_logical) {
830247a4 6083 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6084 if (!SIZE_ONLY)
6085 ret->flags = 2;
3dab1dad 6086 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6087 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6088 return ret;
6089 }
ccb2c380
MP
6090 ret = reganode(pRExC_state, EVAL, n);
6091 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6092 Set_Node_Offset(ret, parse_start);
6093 return ret;
c277df42 6094 }
fac92740 6095 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6096 {
0a4db386 6097 int is_define= 0;
fac92740 6098 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6099 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6100 || RExC_parse[1] == '<'
830247a4 6101 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6102 I32 flag;
6103
830247a4 6104 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6105 if (!SIZE_ONLY)
6106 ret->flags = 1;
3dab1dad 6107 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6108 goto insert_if;
b81d288d 6109 }
a0ed51b3 6110 }
0a4db386
YO
6111 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6112 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6113 {
6114 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6115 char *name_start= RExC_parse++;
2eccd3b2 6116 U32 num = 0;
0a4db386
YO
6117 SV *sv_dat=reg_scan_name(pRExC_state,
6118 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6119 if (RExC_parse == name_start || *RExC_parse != ch)
6120 vFAIL2("Sequence (?(%c... not terminated",
6121 (ch == '>' ? '<' : ch));
6122 RExC_parse++;
6123 if (!SIZE_ONLY) {
6124 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6125 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6126 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6127 }
6128 ret = reganode(pRExC_state,NGROUPP,num);
6129 goto insert_if_check_paren;
6130 }
6131 else if (RExC_parse[0] == 'D' &&
6132 RExC_parse[1] == 'E' &&
6133 RExC_parse[2] == 'F' &&
6134 RExC_parse[3] == 'I' &&
6135 RExC_parse[4] == 'N' &&
6136 RExC_parse[5] == 'E')
6137 {
6138 ret = reganode(pRExC_state,DEFINEP,0);
6139 RExC_parse +=6 ;
6140 is_define = 1;
6141 goto insert_if_check_paren;
6142 }
6143 else if (RExC_parse[0] == 'R') {
6144 RExC_parse++;
6145 parno = 0;
6146 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6147 parno = atoi(RExC_parse++);
6148 while (isDIGIT(*RExC_parse))
6149 RExC_parse++;
6150 } else if (RExC_parse[0] == '&') {
6151 SV *sv_dat;
6152 RExC_parse++;
6153 sv_dat = reg_scan_name(pRExC_state,
6154 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6155 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6156 }
1a147d38 6157 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6158 goto insert_if_check_paren;
6159 }
830247a4 6160 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6161 /* (?(1)...) */
6136c704 6162 char c;
830247a4 6163 parno = atoi(RExC_parse++);
c277df42 6164
830247a4
IZ
6165 while (isDIGIT(*RExC_parse))
6166 RExC_parse++;
fac92740 6167 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6168
0a4db386 6169 insert_if_check_paren:
830247a4 6170 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6171 vFAIL("Switch condition not recognized");
c277df42 6172 insert_if:
3dab1dad
YO
6173 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6174 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6175 if (br == NULL)
830247a4 6176 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6177 else
3dab1dad 6178 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6179 c = *nextchar(pRExC_state);
d1b80229
IZ
6180 if (flags&HASWIDTH)
6181 *flagp |= HASWIDTH;
c277df42 6182 if (c == '|') {
0a4db386
YO
6183 if (is_define)
6184 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6185 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6186 regbranch(pRExC_state, &flags, 1,depth+1);
6187 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6188 if (flags&HASWIDTH)
6189 *flagp |= HASWIDTH;
830247a4 6190 c = *nextchar(pRExC_state);
a0ed51b3
LW
6191 }
6192 else
c277df42
IZ
6193 lastbr = NULL;
6194 if (c != ')')
8615cb43 6195 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6196 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6197 REGTAIL(pRExC_state, br, ender);
c277df42 6198 if (lastbr) {
3dab1dad
YO
6199 REGTAIL(pRExC_state, lastbr, ender);
6200 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6201 }
6202 else
3dab1dad 6203 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6204 RExC_size++; /* XXX WHY do we need this?!!
6205 For large programs it seems to be required
6206 but I can't figure out why. -- dmq*/
c277df42 6207 return ret;
a0ed51b3
LW
6208 }
6209 else {
830247a4 6210 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6211 }
6212 }
1b1626e4 6213 case 0:
830247a4 6214 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6215 vFAIL("Sequence (? incomplete");
1b1626e4 6216 break;
85508812
KW
6217 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6218 that follow */
fb85c044
KW
6219 has_use_defaults = TRUE;
6220 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9de15fec 6221 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
fb85c044 6222 goto parse_flags;
a0d0e21e 6223 default:
cde0cee5
YO
6224 --RExC_parse;
6225 parse_flags: /* (?i) */
6226 {
6227 U32 posflags = 0, negflags = 0;
6228 U32 *flagsp = &posflags;
9de15fec 6229 bool has_charset_modifier = 0;
cde0cee5
YO
6230
6231 while (*RExC_parse) {
6232 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6233 /* (?g), (?gc) and (?o) are useless here
6234 and must be globally applied -- japhy */
cde0cee5
YO
6235 switch (*RExC_parse) {
6236 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6237 case LOCALE_PAT_MOD:
6238 if (has_charset_modifier || flagsp == &negflags) {
6239 goto fail_modifiers;
6240 }
8cc86590
KW
6241 posflags |= RXf_PMf_LOCALE;
6242 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6243 has_charset_modifier = 1;
6244 break;
6245 case UNICODE_PAT_MOD:
6246 if (has_charset_modifier || flagsp == &negflags) {
6247 goto fail_modifiers;
6248 }
8cc86590
KW
6249 posflags |= RXf_PMf_UNICODE;
6250 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6251 has_charset_modifier = 1;
6252 break;
6253 case DUAL_PAT_MOD:
6254 if (has_use_defaults
6255 || has_charset_modifier
6256 || flagsp == &negflags)
6257 {
6258 goto fail_modifiers;
6259 }
8cc86590 6260 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
9de15fec
KW
6261 has_charset_modifier = 1;
6262 break;
f7819f85
A
6263 case ONCE_PAT_MOD: /* 'o' */
6264 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6265 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6266 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6267 if (! (wastedflags & wflagbit) ) {
6268 wastedflags |= wflagbit;
6269 vWARN5(
6270 RExC_parse + 1,
6271 "Useless (%s%c) - %suse /%c modifier",
6272 flagsp == &negflags ? "?-" : "?",
6273 *RExC_parse,
6274 flagsp == &negflags ? "don't " : "",
6275 *RExC_parse
6276 );
6277 }
6278 }
cde0cee5
YO
6279 break;
6280
f7819f85 6281 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6282 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6283 if (! (wastedflags & WASTED_C) ) {
6284 wastedflags |= WASTED_GC;
9d1d55b5
JP
6285 vWARN3(
6286 RExC_parse + 1,
6287 "Useless (%sc) - %suse /gc modifier",
6288 flagsp == &negflags ? "?-" : "?",
6289 flagsp == &negflags ? "don't " : ""
6290 );
6291 }
6292 }
cde0cee5 6293 break;
f7819f85 6294 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6295 if (flagsp == &negflags) {
668c081a
NC
6296 if (SIZE_ONLY)
6297 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6298 } else {
6299 *flagsp |= RXf_PMf_KEEPCOPY;
6300 }
6301 break;
6302 case '-':
fb85c044
KW
6303 /* A flag is a default iff it is following a minus, so
6304 * if there is a minus, it means will be trying to
6305 * re-specify a default which is an error */
6306 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6307 fail_modifiers:
57b84237
YO
6308 RExC_parse++;
6309 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6310 /*NOTREACHED*/
6311 }
cde0cee5
YO
6312 flagsp = &negflags;
6313 wastedflags = 0; /* reset so (?g-c) warns twice */
6314 break;
6315 case ':':
6316 paren = ':';
6317 /*FALLTHROUGH*/
6318 case ')':
6319 RExC_flags |= posflags;
6320 RExC_flags &= ~negflags;
f7819f85
A
6321 if (paren != ':') {
6322 oregflags |= posflags;
6323 oregflags &= ~negflags;
6324 }
cde0cee5
YO
6325 nextchar(pRExC_state);
6326 if (paren != ':') {
6327 *flagp = TRYAGAIN;
6328 return NULL;
6329 } else {
6330 ret = NULL;
6331 goto parse_rest;
6332 }
6333 /*NOTREACHED*/
6334 default:
cde0cee5
YO
6335 RExC_parse++;
6336 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6337 /*NOTREACHED*/
6338 }
830247a4 6339 ++RExC_parse;
48c036b1 6340 }
cde0cee5 6341 }} /* one for the default block, one for the switch */
a0d0e21e 6342 }
fac92740 6343 else { /* (...) */
81714fb9 6344 capturing_parens:
830247a4
IZ
6345 parno = RExC_npar;
6346 RExC_npar++;
e2e6a0f1 6347
830247a4 6348 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6349 if (!SIZE_ONLY ){
6350 if (!RExC_nestroot)
6351 RExC_nestroot = parno;
c009da3d
YO
6352 if (RExC_seen & REG_SEEN_RECURSE
6353 && !RExC_open_parens[parno-1])
6354 {
e2e6a0f1 6355 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6356 "Setting open paren #%"IVdf" to %d\n",
6357 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6358 RExC_open_parens[parno-1]= ret;
6359 }
6bda09f9 6360 }
fac92740
MJD
6361 Set_Node_Length(ret, 1); /* MJD */
6362 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6363 is_open = 1;
a0d0e21e 6364 }
a0ed51b3 6365 }
fac92740 6366 else /* ! paren */
a0d0e21e 6367 ret = NULL;
cde0cee5
YO
6368
6369 parse_rest:
a0d0e21e 6370 /* Pick up the branches, linking them together. */
fac92740 6371 parse_start = RExC_parse; /* MJD */
3dab1dad 6372 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6373
6374 if (freeze_paren) {
6375 if (RExC_npar > after_freeze)
6376 after_freeze = RExC_npar;
6377 RExC_npar = freeze_paren;
6378 }
6379
fac92740 6380 /* branch_len = (paren != 0); */
2af232bd 6381
a0d0e21e
LW
6382 if (br == NULL)
6383 return(NULL);
830247a4
IZ
6384 if (*RExC_parse == '|') {
6385 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6386 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6387 }
fac92740 6388 else { /* MJD */
6bda09f9 6389 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6390 Set_Node_Length(br, paren != 0);
6391 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6392 }
c277df42
IZ
6393 have_branch = 1;
6394 if (SIZE_ONLY)
830247a4 6395 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6396 }
6397 else if (paren == ':') {
c277df42
IZ
6398 *flagp |= flags&SIMPLE;
6399 }
6136c704 6400 if (is_open) { /* Starts with OPEN. */
3dab1dad 6401 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6402 }
6403 else if (paren != '?') /* Not Conditional */
a0d0e21e 6404 ret = br;
8ae10a67 6405 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6406 lastbr = br;
830247a4
IZ
6407 while (*RExC_parse == '|') {
6408 if (!SIZE_ONLY && RExC_extralen) {
6409 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6410 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6411 }
6412 if (SIZE_ONLY)
830247a4
IZ
6413 RExC_extralen += 2; /* Account for LONGJMP. */
6414 nextchar(pRExC_state);
594d7033
YO
6415 if (freeze_paren) {
6416 if (RExC_npar > after_freeze)
6417 after_freeze = RExC_npar;
6418 RExC_npar = freeze_paren;
6419 }
3dab1dad 6420 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6421
a687059c 6422 if (br == NULL)
a0d0e21e 6423 return(NULL);
3dab1dad 6424 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6425 lastbr = br;
8ae10a67 6426 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6427 }
6428
c277df42
IZ
6429 if (have_branch || paren != ':') {
6430 /* Make a closing node, and hook it on the end. */
6431 switch (paren) {
6432 case ':':
830247a4 6433 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6434 break;
6435 case 1:
830247a4 6436 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6437 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6438 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6439 "Setting close paren #%"IVdf" to %d\n",
6440 (IV)parno, REG_NODE_NUM(ender)));
6441 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6442 if (RExC_nestroot == parno)
6443 RExC_nestroot = 0;
40d049e4 6444 }
fac92740
MJD
6445 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6446 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6447 break;
6448 case '<':
c277df42
IZ
6449 case ',':
6450 case '=':
6451 case '!':
c277df42 6452 *flagp &= ~HASWIDTH;
821b33a5
IZ
6453 /* FALL THROUGH */
6454 case '>':
830247a4 6455 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6456 break;
6457 case 0:
830247a4 6458 ender = reg_node(pRExC_state, END);
40d049e4
YO
6459 if (!SIZE_ONLY) {
6460 assert(!RExC_opend); /* there can only be one! */
6461 RExC_opend = ender;
6462 }
c277df42
IZ
6463 break;
6464 }
eaf3ca90 6465 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6466
9674d46a 6467 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6468 if (depth==1)
6469 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6470
c277df42 6471 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6472 for (br = ret; br; br = regnext(br)) {
6473 const U8 op = PL_regkind[OP(br)];
6474 if (op == BRANCH) {
07be1b83 6475 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6476 }
6477 else if (op == BRANCHJ) {
07be1b83 6478 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6479 }
c277df42
IZ
6480 }
6481 }
a0d0e21e 6482 }
c277df42
IZ
6483
6484 {
e1ec3a88
AL
6485 const char *p;
6486 static const char parens[] = "=!<,>";
c277df42
IZ
6487
6488 if (paren && (p = strchr(parens, paren))) {
eb160463 6489 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6490 int flag = (p - parens) > 1;
6491
6492 if (paren == '>')
6493 node = SUSPEND, flag = 0;
6bda09f9 6494 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6495 Set_Node_Cur_Length(ret);
6496 Set_Node_Offset(ret, parse_start + 1);
c277df42 6497 ret->flags = flag;
07be1b83 6498 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6499 }
a0d0e21e
LW
6500 }
6501
6502 /* Check for proper termination. */
ce3e6498 6503 if (paren) {
e2509266 6504 RExC_flags = oregflags;
830247a4
IZ
6505 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6506 RExC_parse = oregcomp_parse;
380a0633 6507 vFAIL("Unmatched (");
ce3e6498 6508 }
a0ed51b3 6509 }
830247a4
IZ
6510 else if (!paren && RExC_parse < RExC_end) {
6511 if (*RExC_parse == ')') {
6512 RExC_parse++;
380a0633 6513 vFAIL("Unmatched )");
a0ed51b3
LW
6514 }
6515 else
b45f050a 6516 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6517 /* NOTREACHED */
6518 }
594d7033
YO
6519 if (after_freeze)
6520 RExC_npar = after_freeze;
a0d0e21e 6521 return(ret);
a687059c
LW
6522}
6523
6524/*
6525 - regbranch - one alternative of an | operator
6526 *
6527 * Implements the concatenation operator.
6528 */
76e3520e 6529STATIC regnode *
3dab1dad 6530S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6531{
97aff369 6532 dVAR;
c277df42
IZ
6533 register regnode *ret;
6534 register regnode *chain = NULL;
6535 register regnode *latest;
6536 I32 flags = 0, c = 0;
3dab1dad 6537 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6538
6539 PERL_ARGS_ASSERT_REGBRANCH;
6540
3dab1dad 6541 DEBUG_PARSE("brnc");
02daf0ab 6542
b81d288d 6543 if (first)
c277df42
IZ
6544 ret = NULL;
6545 else {
b81d288d 6546 if (!SIZE_ONLY && RExC_extralen)
830247a4 6547 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6548 else {
830247a4 6549 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6550 Set_Node_Length(ret, 1);
6551 }
c277df42
IZ
6552 }
6553
b81d288d 6554 if (!first && SIZE_ONLY)
830247a4 6555 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6556
c277df42 6557 *flagp = WORST; /* Tentatively. */
a0d0e21e 6558
830247a4
IZ
6559 RExC_parse--;
6560 nextchar(pRExC_state);
6561 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6562 flags &= ~TRYAGAIN;
3dab1dad 6563 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6564 if (latest == NULL) {
6565 if (flags & TRYAGAIN)
6566 continue;
6567 return(NULL);
a0ed51b3
LW
6568 }
6569 else if (ret == NULL)
c277df42 6570 ret = latest;
8ae10a67 6571 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6572 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6573 *flagp |= flags&SPSTART;
6574 else {
830247a4 6575 RExC_naughty++;
3dab1dad 6576 REGTAIL(pRExC_state, chain, latest);
a687059c 6577 }
a0d0e21e 6578 chain = latest;
c277df42
IZ
6579 c++;
6580 }
6581 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6582 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6583 if (ret == NULL)
6584 ret = chain;
6585 }
6586 if (c == 1) {
6587 *flagp |= flags&SIMPLE;
a0d0e21e 6588 }
a687059c 6589
d4c19fe8 6590 return ret;
a687059c
LW
6591}
6592
6593/*
6594 - regpiece - something followed by possible [*+?]
6595 *
6596 * Note that the branching code sequences used for ? and the general cases
6597 * of * and + are somewhat optimized: they use the same NOTHING node as
6598 * both the endmarker for their branch list and the body of the last branch.
6599 * It might seem that this node could be dispensed with entirely, but the
6600 * endmarker role is not redundant.
6601 */
76e3520e 6602STATIC regnode *
3dab1dad 6603S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6604{
97aff369 6605 dVAR;
c277df42 6606 register regnode *ret;
a0d0e21e
LW
6607 register char op;
6608 register char *next;
6609 I32 flags;
1df70142 6610 const char * const origparse = RExC_parse;
a0d0e21e 6611 I32 min;
c277df42 6612 I32 max = REG_INFTY;
fac92740 6613 char *parse_start;
10edeb5d 6614 const char *maxpos = NULL;
3dab1dad 6615 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6616
6617 PERL_ARGS_ASSERT_REGPIECE;
6618
3dab1dad 6619 DEBUG_PARSE("piec");
a0d0e21e 6620
3dab1dad 6621 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6622 if (ret == NULL) {
6623 if (flags & TRYAGAIN)
6624 *flagp |= TRYAGAIN;
6625 return(NULL);
6626 }
6627
830247a4 6628 op = *RExC_parse;
a0d0e21e 6629
830247a4 6630 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6631 maxpos = NULL;
fac92740 6632 parse_start = RExC_parse; /* MJD */
830247a4 6633 next = RExC_parse + 1;
a0d0e21e
LW
6634 while (isDIGIT(*next) || *next == ',') {
6635 if (*next == ',') {
6636 if (maxpos)
6637 break;
6638 else
6639 maxpos = next;
a687059c 6640 }
a0d0e21e
LW
6641 next++;
6642 }
6643 if (*next == '}') { /* got one */
6644 if (!maxpos)
6645 maxpos = next;
830247a4
IZ
6646 RExC_parse++;
6647 min = atoi(RExC_parse);
a0d0e21e
LW
6648 if (*maxpos == ',')
6649 maxpos++;
6650 else
830247a4 6651 maxpos = RExC_parse;
a0d0e21e
LW
6652 max = atoi(maxpos);
6653 if (!max && *maxpos != '0')
c277df42
IZ
6654 max = REG_INFTY; /* meaning "infinity" */
6655 else if (max >= REG_INFTY)
8615cb43 6656 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6657 RExC_parse = next;
6658 nextchar(pRExC_state);
a0d0e21e
LW
6659
6660 do_curly:
6661 if ((flags&SIMPLE)) {
830247a4 6662 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6663 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6664 Set_Node_Offset(ret, parse_start+1); /* MJD */
6665 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6666 }
6667 else {
3dab1dad 6668 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6669
6670 w->flags = 0;
3dab1dad 6671 REGTAIL(pRExC_state, ret, w);
830247a4 6672 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6673 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6674 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6675 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6676 }
6bda09f9 6677 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6678 /* MJD hk */
6679 Set_Node_Offset(ret, parse_start+1);
2af232bd 6680 Set_Node_Length(ret,
fac92740 6681 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6682
830247a4 6683 if (!SIZE_ONLY && RExC_extralen)
c277df42 6684 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6685 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6686 if (SIZE_ONLY)
830247a4
IZ
6687 RExC_whilem_seen++, RExC_extralen += 3;
6688 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6689 }
c277df42 6690 ret->flags = 0;
a0d0e21e
LW
6691
6692 if (min > 0)
821b33a5
IZ
6693 *flagp = WORST;
6694 if (max > 0)
6695 *flagp |= HASWIDTH;
8fa23287 6696 if (max < min)
8615cb43 6697 vFAIL("Can't do {n,m} with n > m");
c277df42 6698 if (!SIZE_ONLY) {
eb160463
GS
6699 ARG1_SET(ret, (U16)min);
6700 ARG2_SET(ret, (U16)max);
a687059c 6701 }
a687059c 6702
a0d0e21e 6703 goto nest_check;
a687059c 6704 }
a0d0e21e 6705 }
a687059c 6706
a0d0e21e
LW
6707 if (!ISMULT1(op)) {
6708 *flagp = flags;
a687059c 6709 return(ret);
a0d0e21e 6710 }
bb20fd44 6711
c277df42 6712#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6713
6714 /* if this is reinstated, don't forget to put this back into perldiag:
6715
6716 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6717
6718 (F) The part of the regexp subject to either the * or + quantifier
6719 could match an empty string. The {#} shows in the regular
6720 expression about where the problem was discovered.
6721
6722 */
6723
bb20fd44 6724 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6725 vFAIL("Regexp *+ operand could be empty");
b81d288d 6726#endif
bb20fd44 6727
fac92740 6728 parse_start = RExC_parse;
830247a4 6729 nextchar(pRExC_state);
a0d0e21e 6730
821b33a5 6731 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6732
6733 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6734 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6735 ret->flags = 0;
830247a4 6736 RExC_naughty += 4;
a0d0e21e
LW
6737 }
6738 else if (op == '*') {
6739 min = 0;
6740 goto do_curly;
a0ed51b3
LW
6741 }
6742 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6743 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6744 ret->flags = 0;
830247a4 6745 RExC_naughty += 3;
a0d0e21e
LW
6746 }
6747 else if (op == '+') {
6748 min = 1;
6749 goto do_curly;
a0ed51b3
LW
6750 }
6751 else if (op == '?') {
a0d0e21e
LW
6752 min = 0; max = 1;
6753 goto do_curly;
6754 }
6755 nest_check:
668c081a
NC
6756 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6757 ckWARN3reg(RExC_parse,
6758 "%.*s matches null string many times",
6759 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6760 origparse);
a0d0e21e
LW
6761 }
6762
b9b4dddf 6763 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6764 nextchar(pRExC_state);
6bda09f9 6765 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6766 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6767 }
b9b4dddf
YO
6768#ifndef REG_ALLOW_MINMOD_SUSPEND
6769 else
6770#endif
6771 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6772 regnode *ender;
6773 nextchar(pRExC_state);
6774 ender = reg_node(pRExC_state, SUCCEED);
6775 REGTAIL(pRExC_state, ret, ender);
6776 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6777 ret->flags = 0;
6778 ender = reg_node(pRExC_state, TAIL);
6779 REGTAIL(pRExC_state, ret, ender);
6780 /*ret= ender;*/
6781 }
6782
6783 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6784 RExC_parse++;
b45f050a
JF
6785 vFAIL("Nested quantifiers");
6786 }
a0d0e21e
LW
6787
6788 return(ret);
a687059c
LW
6789}
6790
fc8cd66c
YO
6791
6792/* reg_namedseq(pRExC_state,UVp)
6793
6794 This is expected to be called by a parser routine that has
afefe6bf 6795 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6796 expected to point at the first char following the N at the time
6797 of the call.
ff3f963a
KW
6798
6799 The \N may be inside (indicated by valuep not being NULL) or outside a
6800 character class.
6801
6802 \N may begin either a named sequence, or if outside a character class, mean
6803 to match a non-newline. For non single-quoted regexes, the tokenizer has
6804 attempted to decide which, and in the case of a named sequence converted it
6805 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6806 where c1... are the characters in the sequence. For single-quoted regexes,
6807 the tokenizer passes the \N sequence through unchanged; this code will not
6808 attempt to determine this nor expand those. The net effect is that if the
6809 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6810 signals that this \N occurrence means to match a non-newline.
6811
6812 Only the \N{U+...} form should occur in a character class, for the same
6813 reason that '.' inside a character class means to just match a period: it
6814 just doesn't make sense.
fc8cd66c
YO
6815
6816 If valuep is non-null then it is assumed that we are parsing inside
6817 of a charclass definition and the first codepoint in the resolved
6818 string is returned via *valuep and the routine will return NULL.
6819 In this mode if a multichar string is returned from the charnames
ff3f963a 6820 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6821 sequence will be examined. If the string returned is zero length
6822 then the value of *valuep is undefined and NON-NULL will
6823 be returned to indicate failure. (This will NOT be a valid pointer
6824 to a regnode.)
6825
ff3f963a
KW
6826 If valuep is null then it is assumed that we are parsing normal text and a
6827 new EXACT node is inserted into the program containing the resolved string,
6828 and a pointer to the new node is returned. But if the string is zero length
6829 a NOTHING node is emitted instead.
afefe6bf 6830
fc8cd66c 6831 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6832 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6833 */
6834STATIC regnode *
afefe6bf 6835S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6836{
c3c41406 6837 char * endbrace; /* '}' following the name */
fc8cd66c 6838 regnode *ret = NULL;
ff3f963a
KW
6839#ifdef DEBUGGING
6840 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6841#endif
c3c41406 6842 char* p;
ff3f963a
KW
6843
6844 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6845
6846 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6847
6848 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6849
6850 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6851 * modifier. The other meaning does not */
6852 p = (RExC_flags & RXf_PMf_EXTENDED)
6853 ? regwhite( pRExC_state, RExC_parse )
6854 : RExC_parse;
7918f24d 6855
ff3f963a 6856 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6857 * [^\n]. The former is assumed when it can't be the latter. */
6858 if (*p != '{' || regcurly(p)) {
6859 RExC_parse = p;
ff3f963a 6860 if (valuep) {
afefe6bf 6861 /* no bare \N in a charclass */
ff3f963a
KW
6862 vFAIL("\\N in a character class must be a named character: \\N{...}");
6863 }
afefe6bf
RGS
6864 nextchar(pRExC_state);
6865 ret = reg_node(pRExC_state, REG_ANY);
6866 *flagp |= HASWIDTH|SIMPLE;
6867 RExC_naughty++;
6868 RExC_parse--;
6869 Set_Node_Length(ret, 1); /* MJD */
6870 return ret;
fc8cd66c 6871 }
a4893424 6872
c3c41406
KW
6873 /* Here, we have decided it should be a named sequence */
6874
6875 /* The test above made sure that the next real character is a '{', but
6876 * under the /x modifier, it could be separated by space (or a comment and
6877 * \n) and this is not allowed (for consistency with \x{...} and the
6878 * tokenizer handling of \N{NAME}). */
6879 if (*RExC_parse != '{') {
6880 vFAIL("Missing braces on \\N{}");
6881 }
6882
ff3f963a 6883 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6884
6885 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6886 || ! (endbrace == RExC_parse /* nothing between the {} */
6887 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6888 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6889 {
6890 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6891 vFAIL("\\N{NAME} must be resolved by the lexer");
6892 }
6893
ff3f963a
KW
6894 if (endbrace == RExC_parse) { /* empty: \N{} */
6895 if (! valuep) {
6896 RExC_parse = endbrace + 1;
6897 return reg_node(pRExC_state,NOTHING);
a4893424 6898 }
fc8cd66c 6899
ff3f963a
KW
6900 if (SIZE_ONLY) {
6901 ckWARNreg(RExC_parse,
6902 "Ignoring zero length \\N{} in character class"
6903 );
6904 RExC_parse = endbrace + 1;
6905 }
6906 *valuep = 0;
6907 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6908 }
ff3f963a 6909
62fed28b 6910 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6911 RExC_parse += 2; /* Skip past the 'U+' */
6912
6913 if (valuep) { /* In a bracketed char class */
6914 /* We only pay attention to the first char of
6915 multichar strings being returned. I kinda wonder
6916 if this makes sense as it does change the behaviour
6917 from earlier versions, OTOH that behaviour was broken
6918 as well. XXX Solution is to recharacterize as
6919 [rest-of-class]|multi1|multi2... */
6920
6921 STRLEN length_of_hex;
6922 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6923 | PERL_SCAN_DISALLOW_PREFIX
6924 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6925
37820adc
KW
6926 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6927 if (endchar < endbrace) {
ff3f963a
KW
6928 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6929 }
ff3f963a
KW
6930
6931 length_of_hex = (STRLEN)(endchar - RExC_parse);
6932 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6933
6934 /* The tokenizer should have guaranteed validity, but it's possible to
6935 * bypass it by using single quoting, so check */
c3c41406
KW
6936 if (length_of_hex == 0
6937 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6938 {
6939 RExC_parse += length_of_hex; /* Includes all the valid */
6940 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6941 ? UTF8SKIP(RExC_parse)
6942 : 1;
6943 /* Guard against malformed utf8 */
6944 if (RExC_parse >= endchar) RExC_parse = endchar;
6945 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6946 }
6947
6948 RExC_parse = endbrace + 1;
6949 if (endchar == endbrace) return NULL;
6950
6951 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6952 }
ff3f963a
KW
6953 else { /* Not a char class */
6954 char *s; /* String to put in generated EXACT node */
fda99bee 6955 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
6956 char *endchar; /* Points to '.' or '}' ending cur char in the input
6957 stream */
6958
6959 ret = reg_node(pRExC_state,
6960 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6961 s= STRING(ret);
6962
6963 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6964 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 6965 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
6966 * limit are dropped. The limit could be relaxed should it become
6967 * desirable by reparsing this as (?:\N{NAME}), so could generate
6968 * multiple EXACT nodes, as is done for just regular input. But this
6969 * is primarily a named character, and not intended to be a huge long
6970 * string, so 255 bytes should be good enough */
6971 while (1) {
c3c41406 6972 STRLEN length_of_hex;
ff3f963a
KW
6973 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6974 | PERL_SCAN_DISALLOW_PREFIX
6975 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6976 UV cp; /* Ord of current character */
6977
6978 /* Code points are separated by dots. If none, there is only one
6979 * code point, and is terminated by the brace */
37820adc 6980 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
6981
6982 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
6983 length_of_hex = (STRLEN)(endchar - RExC_parse);
6984 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6985 if ( length_of_hex == 0
6986 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 6987 {
c3c41406
KW
6988 RExC_parse += length_of_hex; /* Includes all the valid */
6989 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6990 ? UTF8SKIP(RExC_parse)
6991 : 1;
6992 /* Guard against malformed utf8 */
6993 if (RExC_parse >= endchar) RExC_parse = endchar;
6994 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6995 }
6996
6997 if (! FOLD) { /* Not folding, just append to the string */
6998 STRLEN unilen;
6999
7000 /* Quit before adding this character if would exceed limit */
7001 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7002
ff3f963a
KW
7003 unilen = reguni(pRExC_state, cp, s);
7004 if (unilen > 0) {
7005 s += unilen;
7006 len += unilen;
7007 }
7008 } else { /* Folding, output the folded equivalent */
7009 STRLEN foldlen,numlen;
7010 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7011 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7012
7013 /* Quit before exceeding size limit */
7014 if (len + foldlen > U8_MAX) break;
7015
7016 for (foldbuf = tmpbuf;
7017 foldlen;
7018 foldlen -= numlen)
7019 {
7020 cp = utf8_to_uvchr(foldbuf, &numlen);
7021 if (numlen > 0) {
7022 const STRLEN unilen = reguni(pRExC_state, cp, s);
7023 s += unilen;
7024 len += unilen;
7025 /* In EBCDIC the numlen and unilen can differ. */
7026 foldbuf += numlen;
7027 if (numlen >= foldlen)
7028 break;
7029 }
7030 else
7031 break; /* "Can't happen." */
7032 }
7033 }
7034
7035 /* Point to the beginning of the next character in the sequence. */
7036 RExC_parse = endchar + 1;
7037
7038 /* Quit if no more characters */
7039 if (RExC_parse >= endbrace) break;
7040 }
7041
7042
7043 if (SIZE_ONLY) {
7044 if (RExC_parse < endbrace) {
7045 ckWARNreg(RExC_parse - 1,
7046 "Using just the first characters returned by \\N{}");
7047 }
7048
7049 RExC_size += STR_SZ(len);
7050 } else {
7051 STR_LEN(ret) = len;
7052 RExC_emit += STR_SZ(len);
7053 }
7054
7055 RExC_parse = endbrace + 1;
7056
7057 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7058 with malformed in t/re/pat_advanced.t */
7059 RExC_parse --;
7060 Set_Node_Cur_Length(ret); /* MJD */
7061 nextchar(pRExC_state);
7062 }
7063
7064 return ret;
fc8cd66c
YO
7065}
7066
7067
9e08bc66
TS
7068/*
7069 * reg_recode
7070 *
7071 * It returns the code point in utf8 for the value in *encp.
7072 * value: a code value in the source encoding
7073 * encp: a pointer to an Encode object
7074 *
7075 * If the result from Encode is not a single character,
7076 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7077 */
7078STATIC UV
7079S_reg_recode(pTHX_ const char value, SV **encp)
7080{
7081 STRLEN numlen = 1;
59cd0e26 7082 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7083 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7084 const STRLEN newlen = SvCUR(sv);
7085 UV uv = UNICODE_REPLACEMENT;
7086
7918f24d
NC
7087 PERL_ARGS_ASSERT_REG_RECODE;
7088
9e08bc66
TS
7089 if (newlen)
7090 uv = SvUTF8(sv)
7091 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7092 : *(U8*)s;
7093
7094 if (!newlen || numlen != newlen) {
7095 uv = UNICODE_REPLACEMENT;
c86f7df5 7096 *encp = NULL;
9e08bc66
TS
7097 }
7098 return uv;
7099}
7100
fc8cd66c 7101
a687059c
LW
7102/*
7103 - regatom - the lowest level
ee9b8eae
YO
7104
7105 Try to identify anything special at the start of the pattern. If there
7106 is, then handle it as required. This may involve generating a single regop,
7107 such as for an assertion; or it may involve recursing, such as to
7108 handle a () structure.
7109
7110 If the string doesn't start with something special then we gobble up
7111 as much literal text as we can.
7112
7113 Once we have been able to handle whatever type of thing started the
7114 sequence, we return.
7115
7116 Note: we have to be careful with escapes, as they can be both literal
7117 and special, and in the case of \10 and friends can either, depending
7118 on context. Specifically there are two seperate switches for handling
7119 escape sequences, with the one for handling literal escapes requiring
7120 a dummy entry for all of the special escapes that are actually handled
7121 by the other.
7122*/
7123
76e3520e 7124STATIC regnode *
3dab1dad 7125S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7126{
97aff369 7127 dVAR;
cbbf8932 7128 register regnode *ret = NULL;
a0d0e21e 7129 I32 flags;
45948336 7130 char *parse_start = RExC_parse;
3dab1dad
YO
7131 GET_RE_DEBUG_FLAGS_DECL;
7132 DEBUG_PARSE("atom");
a0d0e21e
LW
7133 *flagp = WORST; /* Tentatively. */
7134
7918f24d 7135 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7136
a0d0e21e 7137tryagain:
f9a79580 7138 switch ((U8)*RExC_parse) {
a0d0e21e 7139 case '^':
830247a4
IZ
7140 RExC_seen_zerolen++;
7141 nextchar(pRExC_state);
bbe252da 7142 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7143 ret = reg_node(pRExC_state, MBOL);
bbe252da 7144 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7145 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7146 else
830247a4 7147 ret = reg_node(pRExC_state, BOL);
fac92740 7148 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7149 break;
7150 case '$':
830247a4 7151 nextchar(pRExC_state);
b81d288d 7152 if (*RExC_parse)
830247a4 7153 RExC_seen_zerolen++;
bbe252da 7154 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7155 ret = reg_node(pRExC_state, MEOL);
bbe252da 7156 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7157 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7158 else
830247a4 7159 ret = reg_node(pRExC_state, EOL);
fac92740 7160 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7161 break;
7162 case '.':
830247a4 7163 nextchar(pRExC_state);
bbe252da 7164 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7165 ret = reg_node(pRExC_state, SANY);
7166 else
7167 ret = reg_node(pRExC_state, REG_ANY);
7168 *flagp |= HASWIDTH|SIMPLE;
830247a4 7169 RExC_naughty++;
fac92740 7170 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7171 break;
7172 case '[':
b45f050a 7173 {
3dab1dad
YO
7174 char * const oregcomp_parse = ++RExC_parse;
7175 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7176 if (*RExC_parse != ']') {
7177 RExC_parse = oregcomp_parse;
b45f050a
JF
7178 vFAIL("Unmatched [");
7179 }
830247a4 7180 nextchar(pRExC_state);
a0d0e21e 7181 *flagp |= HASWIDTH|SIMPLE;
fac92740 7182 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7183 break;
b45f050a 7184 }
a0d0e21e 7185 case '(':
830247a4 7186 nextchar(pRExC_state);
3dab1dad 7187 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7188 if (ret == NULL) {
bf93d4cc 7189 if (flags & TRYAGAIN) {
830247a4 7190 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7191 /* Make parent create an empty node if needed. */
7192 *flagp |= TRYAGAIN;
7193 return(NULL);
7194 }
a0d0e21e 7195 goto tryagain;
bf93d4cc 7196 }
a0d0e21e
LW
7197 return(NULL);
7198 }
a3b492c3 7199 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7200 break;
7201 case '|':
7202 case ')':
7203 if (flags & TRYAGAIN) {
7204 *flagp |= TRYAGAIN;
7205 return NULL;
7206 }
b45f050a 7207 vFAIL("Internal urp");
a0d0e21e
LW
7208 /* Supposed to be caught earlier. */
7209 break;
85afd4ae 7210 case '{':
830247a4
IZ
7211 if (!regcurly(RExC_parse)) {
7212 RExC_parse++;
85afd4ae
CS
7213 goto defchar;
7214 }
7215 /* FALL THROUGH */
a0d0e21e
LW
7216 case '?':
7217 case '+':
7218 case '*':
830247a4 7219 RExC_parse++;
b45f050a 7220 vFAIL("Quantifier follows nothing");
a0d0e21e 7221 break;
f9a79580
RGS
7222 case 0xDF:
7223 case 0xC3:
7224 case 0xCE:
a0a388a1 7225 do_foldchar:
56d400ed 7226 if (!LOC && FOLD) {
e64b1bd1 7227 U32 len,cp;
7cf3a6a3 7228 len=0; /* silence a spurious compiler warning */
56d400ed 7229 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7230 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7231 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7232 ret = reganode(pRExC_state, FOLDCHAR, cp);
7233 Set_Node_Length(ret, 1); /* MJD */
7234 nextchar(pRExC_state); /* kill whitespace under /x */
7235 return ret;
7236 }
7237 }
7238 goto outer_default;
a0d0e21e 7239 case '\\':
ee9b8eae
YO
7240 /* Special Escapes
7241
7242 This switch handles escape sequences that resolve to some kind
7243 of special regop and not to literal text. Escape sequnces that
7244 resolve to literal text are handled below in the switch marked
7245 "Literal Escapes".
7246
7247 Every entry in this switch *must* have a corresponding entry
7248 in the literal escape switch. However, the opposite is not
7249 required, as the default for this switch is to jump to the
7250 literal text handling code.
7251 */
a0a388a1
YO
7252 switch ((U8)*++RExC_parse) {
7253 case 0xDF:
7254 case 0xC3:
7255 case 0xCE:
7256 goto do_foldchar;
ee9b8eae 7257 /* Special Escapes */
a0d0e21e 7258 case 'A':
830247a4
IZ
7259 RExC_seen_zerolen++;
7260 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7261 *flagp |= SIMPLE;
ee9b8eae 7262 goto finish_meta_pat;
a0d0e21e 7263 case 'G':
830247a4
IZ
7264 ret = reg_node(pRExC_state, GPOS);
7265 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7266 *flagp |= SIMPLE;
ee9b8eae
YO
7267 goto finish_meta_pat;
7268 case 'K':
7269 RExC_seen_zerolen++;
7270 ret = reg_node(pRExC_state, KEEPS);
7271 *flagp |= SIMPLE;
37923168
RGS
7272 /* XXX:dmq : disabling in-place substitution seems to
7273 * be necessary here to avoid cases of memory corruption, as
7274 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7275 */
7276 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7277 goto finish_meta_pat;
a0d0e21e 7278 case 'Z':
830247a4 7279 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7280 *flagp |= SIMPLE;
a1917ab9 7281 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7282 goto finish_meta_pat;
b85d18e9 7283 case 'z':
830247a4 7284 ret = reg_node(pRExC_state, EOS);
b85d18e9 7285 *flagp |= SIMPLE;
830247a4 7286 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7287 goto finish_meta_pat;
4a2d328f 7288 case 'C':
f33976b4
DB
7289 ret = reg_node(pRExC_state, CANY);
7290 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7291 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7292 goto finish_meta_pat;
a0ed51b3 7293 case 'X':
830247a4 7294 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7295 *flagp |= HASWIDTH;
ee9b8eae 7296 goto finish_meta_pat;
a0d0e21e 7297 case 'w':
a12cf05f
KW
7298 if (LOC) {
7299 ret = reg_node(pRExC_state, (U8)(ALNUML));
7300 } else {
7301 ret = reg_node(pRExC_state, (U8)(ALNUM));
7302 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7303 }
a0d0e21e 7304 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7305 goto finish_meta_pat;
a0d0e21e 7306 case 'W':
a12cf05f
KW
7307 if (LOC) {
7308 ret = reg_node(pRExC_state, (U8)(NALNUML));
7309 } else {
7310 ret = reg_node(pRExC_state, (U8)(NALNUM));
7311 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7312 }
a0d0e21e 7313 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7314 goto finish_meta_pat;
a0d0e21e 7315 case 'b':
830247a4
IZ
7316 RExC_seen_zerolen++;
7317 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7318 if (LOC) {
7319 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7320 } else {
7321 ret = reg_node(pRExC_state, (U8)(BOUND));
7322 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7323 }
a0d0e21e 7324 *flagp |= SIMPLE;
ee9b8eae 7325 goto finish_meta_pat;
a0d0e21e 7326 case 'B':
830247a4
IZ
7327 RExC_seen_zerolen++;
7328 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7329 if (LOC) {
7330 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7331 } else {
7332 ret = reg_node(pRExC_state, (U8)(NBOUND));
7333 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7334 }
a0d0e21e 7335 *flagp |= SIMPLE;
ee9b8eae 7336 goto finish_meta_pat;
a0d0e21e 7337 case 's':
a12cf05f
KW
7338 if (LOC) {
7339 ret = reg_node(pRExC_state, (U8)(SPACEL));
7340 } else {
7341 ret = reg_node(pRExC_state, (U8)(SPACE));
7342 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7343 }
a0d0e21e 7344 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7345 goto finish_meta_pat;
a0d0e21e 7346 case 'S':
a12cf05f
KW
7347 if (LOC) {
7348 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7349 } else {
7350 ret = reg_node(pRExC_state, (U8)(NSPACE));
7351 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7352 }
a0d0e21e 7353 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7354 goto finish_meta_pat;
a0d0e21e 7355 case 'd':
ffc61ed2 7356 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7357 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7358 goto finish_meta_pat;
a0d0e21e 7359 case 'D':
ffc61ed2 7360 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7361 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7362 goto finish_meta_pat;
e1d1eefb
YO
7363 case 'R':
7364 ret = reg_node(pRExC_state, LNBREAK);
7365 *flagp |= HASWIDTH|SIMPLE;
7366 goto finish_meta_pat;
7367 case 'h':
7368 ret = reg_node(pRExC_state, HORIZWS);
7369 *flagp |= HASWIDTH|SIMPLE;
7370 goto finish_meta_pat;
7371 case 'H':
7372 ret = reg_node(pRExC_state, NHORIZWS);
7373 *flagp |= HASWIDTH|SIMPLE;
7374 goto finish_meta_pat;
ee9b8eae 7375 case 'v':
e1d1eefb
YO
7376 ret = reg_node(pRExC_state, VERTWS);
7377 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7378 goto finish_meta_pat;
7379 case 'V':
e1d1eefb
YO
7380 ret = reg_node(pRExC_state, NVERTWS);
7381 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7382 finish_meta_pat:
830247a4 7383 nextchar(pRExC_state);
fac92740 7384 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7385 break;
a14b48bc
LW
7386 case 'p':
7387 case 'P':
3568d838 7388 {
3dab1dad 7389 char* const oldregxend = RExC_end;
d008bc60 7390#ifdef DEBUGGING
ccb2c380 7391 char* parse_start = RExC_parse - 2;
d008bc60 7392#endif
a14b48bc 7393
830247a4 7394 if (RExC_parse[1] == '{') {
3568d838 7395 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7396 RExC_end = strchr(RExC_parse, '}');
7397 if (!RExC_end) {
3dab1dad 7398 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7399 RExC_parse += 2;
7400 RExC_end = oldregxend;
0da60cf5 7401 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7402 }
830247a4 7403 RExC_end++;
a14b48bc 7404 }
af6f566e 7405 else {
830247a4 7406 RExC_end = RExC_parse + 2;
af6f566e
HS
7407 if (RExC_end > oldregxend)
7408 RExC_end = oldregxend;
7409 }
830247a4 7410 RExC_parse--;
a14b48bc 7411
3dab1dad 7412 ret = regclass(pRExC_state,depth+1);
a14b48bc 7413
830247a4
IZ
7414 RExC_end = oldregxend;
7415 RExC_parse--;
ccb2c380
MP
7416
7417 Set_Node_Offset(ret, parse_start + 2);
7418 Set_Node_Cur_Length(ret);
830247a4 7419 nextchar(pRExC_state);
a14b48bc
LW
7420 *flagp |= HASWIDTH|SIMPLE;
7421 }
7422 break;
fc8cd66c 7423 case 'N':
afefe6bf 7424 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7425 multicharacter. join_exact() will join them up later on.
7426 Also this makes sure that things like /\N{BLAH}+/ and
7427 \N{BLAH} being multi char Just Happen. dmq*/
7428 ++RExC_parse;
afefe6bf 7429 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7430 break;
0a4db386 7431 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7432 parse_named_seq:
81714fb9
YO
7433 {
7434 char ch= RExC_parse[1];
1f1031fe
YO
7435 if (ch != '<' && ch != '\'' && ch != '{') {
7436 RExC_parse++;
7437 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7438 } else {
1f1031fe
YO
7439 /* this pretty much dupes the code for (?P=...) in reg(), if
7440 you change this make sure you change that */
81714fb9 7441 char* name_start = (RExC_parse += 2);
2eccd3b2 7442 U32 num = 0;
0a4db386
YO
7443 SV *sv_dat = reg_scan_name(pRExC_state,
7444 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7445 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7446 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7447 vFAIL2("Sequence %.3s... not terminated",parse_start);
7448
7449 if (!SIZE_ONLY) {
7450 num = add_data( pRExC_state, 1, "S" );
7451 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7452 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7453 }
7454
81714fb9
YO
7455 RExC_sawback = 1;
7456 ret = reganode(pRExC_state,
7457 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7458 num);
7459 *flagp |= HASWIDTH;
1f1031fe 7460
81714fb9
YO
7461 /* override incorrect value set in reganode MJD */
7462 Set_Node_Offset(ret, parse_start+1);
7463 Set_Node_Cur_Length(ret); /* MJD */
7464 nextchar(pRExC_state);
1f1031fe 7465
81714fb9
YO
7466 }
7467 break;
1f1031fe 7468 }
2bf803e2 7469 case 'g':
a0d0e21e
LW
7470 case '1': case '2': case '3': case '4':
7471 case '5': case '6': case '7': case '8': case '9':
7472 {
c74340f9 7473 I32 num;
2bf803e2
YO
7474 bool isg = *RExC_parse == 'g';
7475 bool isrel = 0;
7476 bool hasbrace = 0;
7477 if (isg) {
c74340f9 7478 RExC_parse++;
2bf803e2
YO
7479 if (*RExC_parse == '{') {
7480 RExC_parse++;
7481 hasbrace = 1;
7482 }
7483 if (*RExC_parse == '-') {
7484 RExC_parse++;
7485 isrel = 1;
7486 }
1f1031fe
YO
7487 if (hasbrace && !isDIGIT(*RExC_parse)) {
7488 if (isrel) RExC_parse--;
7489 RExC_parse -= 2;
7490 goto parse_named_seq;
7491 } }
c74340f9 7492 num = atoi(RExC_parse);
b72d83b2
RGS
7493 if (isg && num == 0)
7494 vFAIL("Reference to invalid group 0");
c74340f9 7495 if (isrel) {
5624f11d 7496 num = RExC_npar - num;
c74340f9
YO
7497 if (num < 1)
7498 vFAIL("Reference to nonexistent or unclosed group");
7499 }
2bf803e2 7500 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7501 goto defchar;
7502 else {
3dab1dad 7503 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7504 while (isDIGIT(*RExC_parse))
7505 RExC_parse++;
1f1031fe
YO
7506 if (parse_start == RExC_parse - 1)
7507 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7508 if (hasbrace) {
7509 if (*RExC_parse != '}')
7510 vFAIL("Unterminated \\g{...} pattern");
7511 RExC_parse++;
7512 }
c74340f9
YO
7513 if (!SIZE_ONLY) {
7514 if (num > (I32)RExC_rx->nparens)
7515 vFAIL("Reference to nonexistent group");
c74340f9 7516 }
830247a4 7517 RExC_sawback = 1;
eb160463
GS
7518 ret = reganode(pRExC_state,
7519 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7520 num);
a0d0e21e 7521 *flagp |= HASWIDTH;
2af232bd 7522
fac92740 7523 /* override incorrect value set in reganode MJD */
2af232bd 7524 Set_Node_Offset(ret, parse_start+1);
fac92740 7525 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7526 RExC_parse--;
7527 nextchar(pRExC_state);
a0d0e21e
LW
7528 }
7529 }
7530 break;
7531 case '\0':
830247a4 7532 if (RExC_parse >= RExC_end)
b45f050a 7533 FAIL("Trailing \\");
a0d0e21e
LW
7534 /* FALL THROUGH */
7535 default:
a0288114 7536 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7537 back into the quick-grab loop below */
45948336 7538 parse_start--;
a0d0e21e
LW
7539 goto defchar;
7540 }
7541 break;
4633a7c4
LW
7542
7543 case '#':
bbe252da 7544 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7545 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7546 goto tryagain;
7547 }
7548 /* FALL THROUGH */
7549
f9a79580
RGS
7550 default:
7551 outer_default:{
ba210ebe 7552 register STRLEN len;
58ae7d3f 7553 register UV ender;
a0d0e21e 7554 register char *p;
3dab1dad 7555 char *s;
80aecb99 7556 STRLEN foldlen;
89ebb4a3 7557 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7558
7559 parse_start = RExC_parse - 1;
a0d0e21e 7560
830247a4 7561 RExC_parse++;
a0d0e21e
LW
7562
7563 defchar:
58ae7d3f 7564 ender = 0;
eb160463
GS
7565 ret = reg_node(pRExC_state,
7566 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7567 s = STRING(ret);
830247a4
IZ
7568 for (len = 0, p = RExC_parse - 1;
7569 len < 127 && p < RExC_end;
a0d0e21e
LW
7570 len++)
7571 {
3dab1dad 7572 char * const oldp = p;
5b5a24f7 7573
bbe252da 7574 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7575 p = regwhite( pRExC_state, p );
f9a79580
RGS
7576 switch ((U8)*p) {
7577 case 0xDF:
7578 case 0xC3:
7579 case 0xCE:
56d400ed 7580 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7581 goto normal_default;
a0d0e21e
LW
7582 case '^':
7583 case '$':
7584 case '.':
7585 case '[':
7586 case '(':
7587 case ')':
7588 case '|':
7589 goto loopdone;
7590 case '\\':
ee9b8eae
YO
7591 /* Literal Escapes Switch
7592
7593 This switch is meant to handle escape sequences that
7594 resolve to a literal character.
7595
7596 Every escape sequence that represents something
7597 else, like an assertion or a char class, is handled
7598 in the switch marked 'Special Escapes' above in this
7599 routine, but also has an entry here as anything that
7600 isn't explicitly mentioned here will be treated as
7601 an unescaped equivalent literal.
7602 */
7603
a0a388a1 7604 switch ((U8)*++p) {
ee9b8eae 7605 /* These are all the special escapes. */
a0a388a1
YO
7606 case 0xDF:
7607 case 0xC3:
7608 case 0xCE:
7609 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7610 goto normal_default;
ee9b8eae
YO
7611 case 'A': /* Start assertion */
7612 case 'b': case 'B': /* Word-boundary assertion*/
7613 case 'C': /* Single char !DANGEROUS! */
7614 case 'd': case 'D': /* digit class */
7615 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7616 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7617 case 'k': case 'K': /* named backref, keep marker */
7618 case 'N': /* named char sequence */
38a44b82 7619 case 'p': case 'P': /* Unicode property */
e1d1eefb 7620 case 'R': /* LNBREAK */
ee9b8eae 7621 case 's': case 'S': /* space class */
e1d1eefb 7622 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7623 case 'w': case 'W': /* word class */
7624 case 'X': /* eXtended Unicode "combining character sequence" */
7625 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7626 --p;
7627 goto loopdone;
ee9b8eae
YO
7628
7629 /* Anything after here is an escape that resolves to a
7630 literal. (Except digits, which may or may not)
7631 */
a0d0e21e
LW
7632 case 'n':
7633 ender = '\n';
7634 p++;
a687059c 7635 break;
a0d0e21e
LW
7636 case 'r':
7637 ender = '\r';
7638 p++;
a687059c 7639 break;
a0d0e21e
LW
7640 case 't':
7641 ender = '\t';
7642 p++;
a687059c 7643 break;
a0d0e21e
LW
7644 case 'f':
7645 ender = '\f';
7646 p++;
a687059c 7647 break;
a0d0e21e 7648 case 'e':
c7f1f016 7649 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7650 p++;
a687059c 7651 break;
a0d0e21e 7652 case 'a':
c7f1f016 7653 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7654 p++;
a687059c 7655 break;
f0a2b745
KW
7656 case 'o':
7657 {
7658 STRLEN brace_len = len;
00c0cb6d 7659 UV result;
454155d9
KW
7660 const char* error_msg;
7661
7662 bool valid = grok_bslash_o(p,
7663 &result,
7664 &brace_len,
7665 &error_msg,
7666 1);
7667 p += brace_len;
7668 if (! valid) {
7669 RExC_parse = p; /* going to die anyway; point
7670 to exact spot of failure */
f0a2b745
KW
7671 vFAIL(error_msg);
7672 }
00c0cb6d
DG
7673 else
7674 {
7675 ender = result;
7676 }
f0a2b745
KW
7677 if (PL_encoding && ender < 0x100) {
7678 goto recode_encoding;
7679 }
7680 if (ender > 0xff) {
62fed28b 7681 REQUIRE_UTF8;
f0a2b745
KW
7682 }
7683 break;
7684 }
a0d0e21e 7685 case 'x':
a0ed51b3 7686 if (*++p == '{') {
1df70142 7687 char* const e = strchr(p, '}');
b81d288d 7688
b45f050a 7689 if (!e) {
830247a4 7690 RExC_parse = p + 1;
b45f050a
JF
7691 vFAIL("Missing right brace on \\x{}");
7692 }
de5f0749 7693 else {
a4c04bdc
NC
7694 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7695 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7696 STRLEN numlen = e - p - 1;
53305cf1 7697 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7698 if (ender > 0xff)
62fed28b 7699 REQUIRE_UTF8;
a0ed51b3
LW
7700 p = e + 1;
7701 }
a0ed51b3
LW
7702 }
7703 else {
a4c04bdc 7704 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7705 STRLEN numlen = 2;
53305cf1 7706 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7707 p += numlen;
7708 }
9e08bc66
TS
7709 if (PL_encoding && ender < 0x100)
7710 goto recode_encoding;
a687059c 7711 break;
a0d0e21e
LW
7712 case 'c':
7713 p++;
f9d13529 7714 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7715 break;
a0d0e21e
LW
7716 case '0': case '1': case '2': case '3':case '4':
7717 case '5': case '6': case '7': case '8':case '9':
7718 if (*p == '0' ||
ca67da41 7719 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7720 {
7721 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7722 STRLEN numlen = 3;
53305cf1 7723 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7724 if (ender > 0xff) {
62fed28b 7725 REQUIRE_UTF8;
609122bd 7726 }
a0d0e21e
LW
7727 p += numlen;
7728 }
7729 else {
7730 --p;
7731 goto loopdone;
a687059c 7732 }
9e08bc66
TS
7733 if (PL_encoding && ender < 0x100)
7734 goto recode_encoding;
7735 break;
7736 recode_encoding:
7737 {
7738 SV* enc = PL_encoding;
7739 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7740 if (!enc && SIZE_ONLY)
7741 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7742 REQUIRE_UTF8;
9e08bc66 7743 }
a687059c 7744 break;
a0d0e21e 7745 case '\0':
830247a4 7746 if (p >= RExC_end)
b45f050a 7747 FAIL("Trailing \\");
a687059c 7748 /* FALL THROUGH */
a0d0e21e 7749 default:
668c081a
NC
7750 if (!SIZE_ONLY&& isALPHA(*p))
7751 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7752 goto normal_default;
a0d0e21e
LW
7753 }
7754 break;
a687059c 7755 default:
a0ed51b3 7756 normal_default:
fd400ab9 7757 if (UTF8_IS_START(*p) && UTF) {
1df70142 7758 STRLEN numlen;
5e12f4fb 7759 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7760 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7761 p += numlen;
7762 }
7763 else
7764 ender = *p++;
a0d0e21e 7765 break;
a687059c 7766 }
bcdf7404
YO
7767 if ( RExC_flags & RXf_PMf_EXTENDED)
7768 p = regwhite( pRExC_state, p );
60a8b682
JH
7769 if (UTF && FOLD) {
7770 /* Prime the casefolded buffer. */
ac7e0132 7771 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7772 }
bcdf7404 7773 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7774 if (len)
7775 p = oldp;
16ea2a2e 7776 else if (UTF) {
80aecb99 7777 if (FOLD) {
60a8b682 7778 /* Emit all the Unicode characters. */
1df70142 7779 STRLEN numlen;
80aecb99
JH
7780 for (foldbuf = tmpbuf;
7781 foldlen;
7782 foldlen -= numlen) {
7783 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7784 if (numlen > 0) {
71207a34 7785 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7786 s += unilen;
7787 len += unilen;
7788 /* In EBCDIC the numlen
7789 * and unilen can differ. */
9dc45d57 7790 foldbuf += numlen;
47654450
JH
7791 if (numlen >= foldlen)
7792 break;
9dc45d57
JH
7793 }
7794 else
7795 break; /* "Can't happen." */
80aecb99
JH
7796 }
7797 }
7798 else {
71207a34 7799 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7800 if (unilen > 0) {
0ebc6274
JH
7801 s += unilen;
7802 len += unilen;
9dc45d57 7803 }
80aecb99 7804 }
a0ed51b3 7805 }
a0d0e21e
LW
7806 else {
7807 len++;
eb160463 7808 REGC((char)ender, s++);
a0d0e21e
LW
7809 }
7810 break;
a687059c 7811 }
16ea2a2e 7812 if (UTF) {
80aecb99 7813 if (FOLD) {
60a8b682 7814 /* Emit all the Unicode characters. */
1df70142 7815 STRLEN numlen;
80aecb99
JH
7816 for (foldbuf = tmpbuf;
7817 foldlen;
7818 foldlen -= numlen) {
7819 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7820 if (numlen > 0) {
71207a34 7821 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7822 len += unilen;
7823 s += unilen;
7824 /* In EBCDIC the numlen
7825 * and unilen can differ. */
9dc45d57 7826 foldbuf += numlen;
47654450
JH
7827 if (numlen >= foldlen)
7828 break;
9dc45d57
JH
7829 }
7830 else
7831 break;
80aecb99
JH
7832 }
7833 }
7834 else {
71207a34 7835 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7836 if (unilen > 0) {
0ebc6274
JH
7837 s += unilen;
7838 len += unilen;
9dc45d57 7839 }
80aecb99
JH
7840 }
7841 len--;
a0ed51b3
LW
7842 }
7843 else
eb160463 7844 REGC((char)ender, s++);
a0d0e21e
LW
7845 }
7846 loopdone:
830247a4 7847 RExC_parse = p - 1;
fac92740 7848 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7849 nextchar(pRExC_state);
793db0cb
JH
7850 {
7851 /* len is STRLEN which is unsigned, need to copy to signed */
7852 IV iv = len;
7853 if (iv < 0)
7854 vFAIL("Internal disaster");
7855 }
a0d0e21e
LW
7856 if (len > 0)
7857 *flagp |= HASWIDTH;
090f7165 7858 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7859 *flagp |= SIMPLE;
3dab1dad 7860
cd439c50 7861 if (SIZE_ONLY)
830247a4 7862 RExC_size += STR_SZ(len);
3dab1dad
YO
7863 else {
7864 STR_LEN(ret) = len;
830247a4 7865 RExC_emit += STR_SZ(len);
07be1b83 7866 }
3dab1dad 7867 }
a0d0e21e
LW
7868 break;
7869 }
a687059c 7870
a0d0e21e 7871 return(ret);
a687059c
LW
7872}
7873
873ef191 7874STATIC char *
bcdf7404 7875S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7876{
bcdf7404 7877 const char *e = RExC_end;
7918f24d
NC
7878
7879 PERL_ARGS_ASSERT_REGWHITE;
7880
5b5a24f7
CS
7881 while (p < e) {
7882 if (isSPACE(*p))
7883 ++p;
7884 else if (*p == '#') {
bcdf7404 7885 bool ended = 0;
5b5a24f7 7886 do {
bcdf7404
YO
7887 if (*p++ == '\n') {
7888 ended = 1;
7889 break;
7890 }
7891 } while (p < e);
7892 if (!ended)
7893 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7894 }
7895 else
7896 break;
7897 }
7898 return p;
7899}
7900
b8c5462f
JH
7901/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7902 Character classes ([:foo:]) can also be negated ([:^foo:]).
7903 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7904 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7905 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7906
7907#define POSIXCC_DONE(c) ((c) == ':')
7908#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7909#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7910
b8c5462f 7911STATIC I32
830247a4 7912S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7913{
97aff369 7914 dVAR;
936ed897 7915 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7916
7918f24d
NC
7917 PERL_ARGS_ASSERT_REGPPOSIXCC;
7918
830247a4 7919 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7920 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7921 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7922 const char c = UCHARAT(RExC_parse);
097eb12c 7923 char* const s = RExC_parse++;
b81d288d 7924
9a86a77b 7925 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7926 RExC_parse++;
7927 if (RExC_parse == RExC_end)
620e46c5 7928 /* Grandfather lone [:, [=, [. */
830247a4 7929 RExC_parse = s;
620e46c5 7930 else {
3dab1dad 7931 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7932 assert(*t == c);
7933
9a86a77b 7934 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7935 const char *posixcc = s + 1;
830247a4 7936 RExC_parse++; /* skip over the ending ] */
3dab1dad 7937
b8c5462f 7938 if (*s == ':') {
1df70142
AL
7939 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7940 const I32 skip = t - posixcc;
80916619
NC
7941
7942 /* Initially switch on the length of the name. */
7943 switch (skip) {
7944 case 4:
3dab1dad
YO
7945 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7946 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7947 break;
80916619
NC
7948 case 5:
7949 /* Names all of length 5. */
7950 /* alnum alpha ascii blank cntrl digit graph lower
7951 print punct space upper */
7952 /* Offset 4 gives the best switch position. */
7953 switch (posixcc[4]) {
7954 case 'a':
3dab1dad
YO
7955 if (memEQ(posixcc, "alph", 4)) /* alpha */
7956 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7957 break;
7958 case 'e':
3dab1dad
YO
7959 if (memEQ(posixcc, "spac", 4)) /* space */
7960 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7961 break;
7962 case 'h':
3dab1dad
YO
7963 if (memEQ(posixcc, "grap", 4)) /* graph */
7964 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7965 break;
7966 case 'i':
3dab1dad
YO
7967 if (memEQ(posixcc, "asci", 4)) /* ascii */
7968 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7969 break;
7970 case 'k':
3dab1dad
YO
7971 if (memEQ(posixcc, "blan", 4)) /* blank */
7972 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7973 break;
7974 case 'l':
3dab1dad
YO
7975 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7976 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7977 break;
7978 case 'm':
3dab1dad
YO
7979 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7980 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7981 break;
7982 case 'r':
3dab1dad
YO
7983 if (memEQ(posixcc, "lowe", 4)) /* lower */
7984 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7985 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7986 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7987 break;
7988 case 't':
3dab1dad
YO
7989 if (memEQ(posixcc, "digi", 4)) /* digit */
7990 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7991 else if (memEQ(posixcc, "prin", 4)) /* print */
7992 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7993 else if (memEQ(posixcc, "punc", 4)) /* punct */
7994 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7995 break;
b8c5462f
JH
7996 }
7997 break;
80916619 7998 case 6:
3dab1dad
YO
7999 if (memEQ(posixcc, "xdigit", 6))
8000 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8001 break;
8002 }
80916619
NC
8003
8004 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8005 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8006 t - s - 1, s + 1);
80916619
NC
8007 assert (posixcc[skip] == ':');
8008 assert (posixcc[skip+1] == ']');
b45f050a 8009 } else if (!SIZE_ONLY) {
b8c5462f 8010 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8011
830247a4 8012 /* adjust RExC_parse so the warning shows after
b45f050a 8013 the class closes */
9a86a77b 8014 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8015 RExC_parse++;
b45f050a
JF
8016 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8017 }
b8c5462f
JH
8018 } else {
8019 /* Maternal grandfather:
8020 * "[:" ending in ":" but not in ":]" */
830247a4 8021 RExC_parse = s;
767d463e 8022 }
620e46c5
JH
8023 }
8024 }
8025
b8c5462f
JH
8026 return namedclass;
8027}
8028
8029STATIC void
830247a4 8030S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8031{
97aff369 8032 dVAR;
7918f24d
NC
8033
8034 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8035
3dab1dad 8036 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8037 const char *s = RExC_parse;
8038 const char c = *s++;
b8c5462f 8039
3dab1dad 8040 while (isALNUM(*s))
b8c5462f
JH
8041 s++;
8042 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8043 ckWARN3reg(s+2,
8044 "POSIX syntax [%c %c] belongs inside character classes",
8045 c, c);
b45f050a
JF
8046
8047 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8048 if (POSIXCC_NOTYET(c)) {
830247a4 8049 /* adjust RExC_parse so the error shows after
b45f050a 8050 the class closes */
9a86a77b 8051 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8052 NOOP;
b45f050a
JF
8053 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8054 }
b8c5462f
JH
8055 }
8056 }
620e46c5
JH
8057}
8058
7f6f358c 8059
89836f1f
YO
8060#define _C_C_T_(NAME,TEST,WORD) \
8061ANYOF_##NAME: \
8062 if (LOC) \
8063 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8064 else { \
8065 for (value = 0; value < 256; value++) \
8066 if (TEST) \
8067 ANYOF_BITMAP_SET(ret, value); \
8068 } \
8069 yesno = '+'; \
8070 what = WORD; \
8071 break; \
8072case ANYOF_N##NAME: \
8073 if (LOC) \
8074 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8075 else { \
8076 for (value = 0; value < 256; value++) \
8077 if (!TEST) \
8078 ANYOF_BITMAP_SET(ret, value); \
8079 } \
8080 yesno = '!'; \
8081 what = WORD; \
8082 break
8083
a12cf05f 8084/* Like above, but no locale test */
e1d1eefb
YO
8085#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8086ANYOF_##NAME: \
8087 for (value = 0; value < 256; value++) \
8088 if (TEST) \
8089 ANYOF_BITMAP_SET(ret, value); \
8090 yesno = '+'; \
8091 what = WORD; \
8092 break; \
8093case ANYOF_N##NAME: \
8094 for (value = 0; value < 256; value++) \
8095 if (!TEST) \
8096 ANYOF_BITMAP_SET(ret, value); \
8097 yesno = '!'; \
8098 what = WORD; \
8099 break
89836f1f 8100
a12cf05f
KW
8101/* Like the above, but there are differences if we are in uni-8-bit or not, so
8102 * there are two tests passed in, to use depending on that. There aren't any
8103 * cases where the label is different from the name, so no need for that
8104 * parameter */
8105#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD) \
8106ANYOF_##NAME: \
8107 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8108 else if (UNI_SEMANTICS) { \
8109 for (value = 0; value < 256; value++) { \
8110 if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
8111 } \
8112 } \
8113 else { \
8114 for (value = 0; value < 256; value++) { \
8115 if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
8116 } \
8117 } \
8118 yesno = '+'; \
8119 what = WORD; \
8120 break; \
8121case ANYOF_N##NAME: \
8122 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8123 else if (UNI_SEMANTICS) { \
8124 for (value = 0; value < 256; value++) { \
8125 if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8126 } \
8127 } \
8128 else { \
8129 for (value = 0; value < 256; value++) { \
8130 if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8131 } \
8132 } \
8133 yesno = '!'; \
8134 what = WORD; \
8135 break
8136
da7fcca4
YO
8137/*
8138 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8139 so that it is possible to override the option here without having to
8140 rebuild the entire core. as we are required to do if we change regcomp.h
8141 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8142*/
8143#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8144#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8145#endif
8146
8147#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8148#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8149#else
8150#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8151#endif
8152
7f6f358c
YO
8153/*
8154 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8155 matches the pattern or if the pattern matches a single char only and
8156 that char is < 256 and we are case insensitive then we produce an
8157 EXACT node instead.
7f6f358c 8158*/
89836f1f 8159
76e3520e 8160STATIC regnode *
3dab1dad 8161S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8162{
97aff369 8163 dVAR;
9a86a77b 8164 register UV nextvalue;
3568d838 8165 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8166 register IV range = 0;
e1d1eefb 8167 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8168 register regnode *ret;
ba210ebe 8169 STRLEN numlen;
ffc61ed2 8170 IV namedclass;
cbbf8932 8171 char *rangebegin = NULL;
936ed897 8172 bool need_class = 0;
c445ea15 8173 SV *listsv = NULL;
ffc61ed2 8174 UV n;
9e55ce06 8175 bool optimize_invert = TRUE;
cbbf8932 8176 AV* unicode_alternate = NULL;
1b2d223b
JH
8177#ifdef EBCDIC
8178 UV literal_endpoint = 0;
8179#endif
2c63ecad 8180 UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
ffc61ed2 8181
3dab1dad 8182 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8183 case we need to change the emitted regop to an EXACT. */
07be1b83 8184 const char * orig_parse = RExC_parse;
72f13be8 8185 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8186
8187 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8188#ifndef DEBUGGING
8189 PERL_UNUSED_ARG(depth);
8190#endif
72f13be8 8191
3dab1dad 8192 DEBUG_PARSE("clas");
7f6f358c
YO
8193
8194 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8195 ret = reganode(pRExC_state, ANYOF, 0);
8196
8197 if (!SIZE_ONLY)
8198 ANYOF_FLAGS(ret) = 0;
8199
9a86a77b 8200 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8201 RExC_naughty++;
8202 RExC_parse++;
8203 if (!SIZE_ONLY)
8204 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8205 }
a0d0e21e 8206
73060fc4 8207 if (SIZE_ONLY) {
830247a4 8208 RExC_size += ANYOF_SKIP;
73060fc4
JH
8209 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8210 }
936ed897 8211 else {
830247a4 8212 RExC_emit += ANYOF_SKIP;
936ed897
IZ
8213 if (FOLD)
8214 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8215 if (LOC)
8216 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 8217 ANYOF_BITMAP_ZERO(ret);
396482e1 8218 listsv = newSVpvs("# comment\n");
a0d0e21e 8219 }
b8c5462f 8220
9a86a77b
JH
8221 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8222
b938889d 8223 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8224 checkposixcc(pRExC_state);
b8c5462f 8225
f064b6ad
HS
8226 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8227 if (UCHARAT(RExC_parse) == ']')
8228 goto charclassloop;
ffc61ed2 8229
fc8cd66c 8230parseit:
9a86a77b 8231 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8232
8233 charclassloop:
8234
8235 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8236
73b437c8 8237 if (!range)
830247a4 8238 rangebegin = RExC_parse;
ffc61ed2 8239 if (UTF) {
5e12f4fb 8240 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8241 RExC_end - RExC_parse,
9f7f3913 8242 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8243 RExC_parse += numlen;
8244 }
8245 else
8246 value = UCHARAT(RExC_parse++);
7f6f358c 8247
9a86a77b
JH
8248 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8249 if (value == '[' && POSIXCC(nextvalue))
830247a4 8250 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8251 else if (value == '\\') {
ffc61ed2 8252 if (UTF) {
5e12f4fb 8253 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8254 RExC_end - RExC_parse,
9f7f3913 8255 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8256 RExC_parse += numlen;
8257 }
8258 else
8259 value = UCHARAT(RExC_parse++);
470c3474 8260 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8261 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8262 * be a problem later if we want switch on Unicode.
8263 * A similar issue a little bit later when switching on
8264 * namedclass. --jhi */
ffc61ed2 8265 switch ((I32)value) {
b8c5462f
JH
8266 case 'w': namedclass = ANYOF_ALNUM; break;
8267 case 'W': namedclass = ANYOF_NALNUM; break;
8268 case 's': namedclass = ANYOF_SPACE; break;
8269 case 'S': namedclass = ANYOF_NSPACE; break;
8270 case 'd': namedclass = ANYOF_DIGIT; break;
8271 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8272 case 'v': namedclass = ANYOF_VERTWS; break;
8273 case 'V': namedclass = ANYOF_NVERTWS; break;
8274 case 'h': namedclass = ANYOF_HORIZWS; break;
8275 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8276 case 'N': /* Handle \N{NAME} in class */
8277 {
8278 /* We only pay attention to the first char of
8279 multichar strings being returned. I kinda wonder
8280 if this makes sense as it does change the behaviour
8281 from earlier versions, OTOH that behaviour was broken
8282 as well. */
8283 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8284 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8285 goto parseit;
8286 }
8287 value= v;
8288 }
8289 break;
ffc61ed2
JH
8290 case 'p':
8291 case 'P':
3dab1dad
YO
8292 {
8293 char *e;
af6f566e 8294 if (RExC_parse >= RExC_end)
2a4859cd 8295 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8296 if (*RExC_parse == '{') {
1df70142 8297 const U8 c = (U8)value;
ffc61ed2
JH
8298 e = strchr(RExC_parse++, '}');
8299 if (!e)
0da60cf5 8300 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8301 while (isSPACE(UCHARAT(RExC_parse)))
8302 RExC_parse++;
8303 if (e == RExC_parse)
0da60cf5 8304 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8305 n = e - RExC_parse;
ab13f0c7
JH
8306 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8307 n--;
ffc61ed2
JH
8308 }
8309 else {
8310 e = RExC_parse;
8311 n = 1;
8312 }
8313 if (!SIZE_ONLY) {
ab13f0c7
JH
8314 if (UCHARAT(RExC_parse) == '^') {
8315 RExC_parse++;
8316 n--;
8317 value = value == 'p' ? 'P' : 'p'; /* toggle */
8318 while (isSPACE(UCHARAT(RExC_parse))) {
8319 RExC_parse++;
8320 n--;
8321 }
8322 }
097eb12c
AL
8323 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8324 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8325 }
8326 RExC_parse = e + 1;
8327 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8328 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8329 }
f81125e2 8330 break;
b8c5462f
JH
8331 case 'n': value = '\n'; break;
8332 case 'r': value = '\r'; break;
8333 case 't': value = '\t'; break;
8334 case 'f': value = '\f'; break;
8335 case 'b': value = '\b'; break;
c7f1f016
NIS
8336 case 'e': value = ASCII_TO_NATIVE('\033');break;
8337 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8338 case 'o':
8339 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8340 {
8341 const char* error_msg;
8342 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8343 &value,
8344 &numlen,
454155d9
KW
8345 &error_msg,
8346 SIZE_ONLY);
8347 RExC_parse += numlen;
8348 if (! valid) {
8349 vFAIL(error_msg);
8350 }
f0a2b745 8351 }
f0a2b745
KW
8352 if (PL_encoding && value < 0x100) {
8353 goto recode_encoding;
8354 }
8355 break;
b8c5462f 8356 case 'x':
ffc61ed2 8357 if (*RExC_parse == '{') {
a4c04bdc
NC
8358 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8359 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8360 char * const e = strchr(RExC_parse++, '}');
b81d288d 8361 if (!e)
ffc61ed2 8362 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8363
8364 numlen = e - RExC_parse;
8365 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8366 RExC_parse = e + 1;
8367 }
8368 else {
a4c04bdc 8369 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8370 numlen = 2;
8371 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8372 RExC_parse += numlen;
8373 }
9e08bc66
TS
8374 if (PL_encoding && value < 0x100)
8375 goto recode_encoding;
b8c5462f
JH
8376 break;
8377 case 'c':
f9d13529 8378 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8379 break;
8380 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8381 case '5': case '6': case '7':
9e08bc66 8382 {
c99e91e9
KW
8383 /* Take 1-3 octal digits */
8384 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8385 numlen = 3;
8386 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8387 RExC_parse += numlen;
8388 if (PL_encoding && value < 0x100)
8389 goto recode_encoding;
8390 break;
8391 }
8392 recode_encoding:
8393 {
8394 SV* enc = PL_encoding;
8395 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8396 if (!enc && SIZE_ONLY)
8397 ckWARNreg(RExC_parse,
8398 "Invalid escape in the specified encoding");
9e08bc66
TS
8399 break;
8400 }
1028017a 8401 default:
c99e91e9
KW
8402 /* Allow \_ to not give an error */
8403 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8404 ckWARN2reg(RExC_parse,
8405 "Unrecognized escape \\%c in character class passed through",
8406 (int)value);
c99e91e9 8407 }
1028017a 8408 break;
b8c5462f 8409 }
ffc61ed2 8410 } /* end of \blah */
1b2d223b
JH
8411#ifdef EBCDIC
8412 else
8413 literal_endpoint++;
8414#endif
ffc61ed2
JH
8415
8416 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8417
2c63ecad
KW
8418 /* What matches in a locale is not known until runtime, so need to
8419 * (one time per class) allocate extra space to pass to regexec.
8420 * The space will contain a bit for each named class that is to be
8421 * matched against. This isn't needed for \p{} and pseudo-classes,
8422 * as they are not affected by locale, and hence are dealt with
8423 * separately */
8424 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8425 need_class = 1;
8426 if (SIZE_ONLY) {
8427 RExC_size += ANYOF_CLASS_ADD_SKIP;
8428 }
8429 else {
8430 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8431 ANYOF_CLASS_ZERO(ret);
8432 }
8433 ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
8434 }
ffc61ed2
JH
8435
8436 /* a bad range like a-\d, a-[:digit:] ? */
8437 if (range) {
73b437c8 8438 if (!SIZE_ONLY) {
668c081a
NC
8439 const int w =
8440 RExC_parse >= rangebegin ?
8441 RExC_parse - rangebegin : 0;
8442 ckWARN4reg(RExC_parse,
b45f050a 8443 "False [] range \"%*.*s\"",
097eb12c 8444 w, w, rangebegin);
668c081a 8445
3568d838
JH
8446 if (prevvalue < 256) {
8447 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8448 ANYOF_BITMAP_SET(ret, '-');
8449 }
8450 else {
8451 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8452 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8453 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8454 }
b8c5462f 8455 }
ffc61ed2
JH
8456
8457 range = 0; /* this was not a true range */
73b437c8 8458 }
ffc61ed2 8459
89836f1f
YO
8460
8461
73b437c8 8462 if (!SIZE_ONLY) {
c49a72a9
NC
8463 const char *what = NULL;
8464 char yesno = 0;
8465
3568d838
JH
8466 if (namedclass > OOB_NAMEDCLASS)
8467 optimize_invert = FALSE;
e2962f66
JH
8468 /* Possible truncation here but in some 64-bit environments
8469 * the compiler gets heartburn about switch on 64-bit values.
8470 * A similar issue a little earlier when switching on value.
98f323fa 8471 * --jhi */
e2962f66 8472 switch ((I32)namedclass) {
da7fcca4
YO
8473
8474 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8475 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8476 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8477 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8478 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8479 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8480 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8481 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8482 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8483 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8484#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f
KW
8485 /* \s, \w match all unicode if utf8. */
8486 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8487 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8488#else
a12cf05f
KW
8489 /* \s, \w match ascii and locale only */
8490 case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8491 case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8492#endif
89836f1f 8493 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8494 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8495 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8496 case ANYOF_ASCII:
8497 if (LOC)
936ed897 8498 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8499 else {
c7f1f016 8500#ifndef EBCDIC
1ba5c669
JH
8501 for (value = 0; value < 128; value++)
8502 ANYOF_BITMAP_SET(ret, value);
8503#else /* EBCDIC */
ffbc6a93 8504 for (value = 0; value < 256; value++) {
3a3c4447
JH
8505 if (isASCII(value))
8506 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8507 }
1ba5c669 8508#endif /* EBCDIC */
73b437c8 8509 }
c49a72a9
NC
8510 yesno = '+';
8511 what = "ASCII";
73b437c8
JH
8512 break;
8513 case ANYOF_NASCII:
8514 if (LOC)
936ed897 8515 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8516 else {
c7f1f016 8517#ifndef EBCDIC
1ba5c669
JH
8518 for (value = 128; value < 256; value++)
8519 ANYOF_BITMAP_SET(ret, value);
8520#else /* EBCDIC */
ffbc6a93 8521 for (value = 0; value < 256; value++) {
3a3c4447
JH
8522 if (!isASCII(value))
8523 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8524 }
1ba5c669 8525#endif /* EBCDIC */
73b437c8 8526 }
c49a72a9
NC
8527 yesno = '!';
8528 what = "ASCII";
89836f1f 8529 break;
ffc61ed2
JH
8530 case ANYOF_DIGIT:
8531 if (LOC)
8532 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8533 else {
8534 /* consecutive digits assumed */
8535 for (value = '0'; value <= '9'; value++)
8536 ANYOF_BITMAP_SET(ret, value);
8537 }
c49a72a9 8538 yesno = '+';
da7fcca4 8539 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8540 break;
8541 case ANYOF_NDIGIT:
8542 if (LOC)
8543 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8544 else {
8545 /* consecutive digits assumed */
8546 for (value = 0; value < '0'; value++)
8547 ANYOF_BITMAP_SET(ret, value);
8548 for (value = '9' + 1; value < 256; value++)
8549 ANYOF_BITMAP_SET(ret, value);
8550 }
c49a72a9 8551 yesno = '!';
da7fcca4 8552 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8553 break;
f81125e2
JP
8554 case ANYOF_MAX:
8555 /* this is to handle \p and \P */
8556 break;
73b437c8 8557 default:
b45f050a 8558 vFAIL("Invalid [::] class");
73b437c8 8559 break;
b8c5462f 8560 }
c49a72a9
NC
8561 if (what) {
8562 /* Strings such as "+utf8::isWord\n" */
8563 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8564 }
2c63ecad 8565 stored+=2; /* can't optimize this class */
73b437c8 8566 continue;
a0d0e21e 8567 }
ffc61ed2
JH
8568 } /* end of namedclass \blah */
8569
a0d0e21e 8570 if (range) {
eb160463 8571 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8572 const int w = RExC_parse - rangebegin;
8573 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8574 range = 0; /* not a valid range */
73b437c8 8575 }
a0d0e21e
LW
8576 }
8577 else {
3568d838 8578 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8579 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8580 RExC_parse[1] != ']') {
8581 RExC_parse++;
ffc61ed2
JH
8582
8583 /* a bad range like \w-, [:word:]- ? */
8584 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8585 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8586 const int w =
afd78fd5
JH
8587 RExC_parse >= rangebegin ?
8588 RExC_parse - rangebegin : 0;
830247a4 8589 vWARN4(RExC_parse,
b45f050a 8590 "False [] range \"%*.*s\"",
097eb12c 8591 w, w, rangebegin);
afd78fd5 8592 }
73b437c8 8593 if (!SIZE_ONLY)
936ed897 8594 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8595 } else
ffc61ed2
JH
8596 range = 1; /* yeah, it's a range! */
8597 continue; /* but do it the next time */
a0d0e21e 8598 }
a687059c 8599 }
ffc61ed2 8600
93733859 8601 /* now is the next time */
07be1b83 8602 /*stored += (value - prevvalue + 1);*/
ae5c130c 8603 if (!SIZE_ONLY) {
3568d838 8604 if (prevvalue < 256) {
1df70142 8605 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8606 IV i;
3568d838 8607#ifdef EBCDIC
1b2d223b
JH
8608 /* In EBCDIC [\x89-\x91] should include
8609 * the \x8e but [i-j] should not. */
8610 if (literal_endpoint == 2 &&
8611 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8612 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8613 {
3568d838
JH
8614 if (isLOWER(prevvalue)) {
8615 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8616 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8617 stored++;
ffc61ed2 8618 ANYOF_BITMAP_SET(ret, i);
2670d666 8619 }
ffc61ed2 8620 } else {
3568d838 8621 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8622 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8623 stored++;
ffc61ed2 8624 ANYOF_BITMAP_SET(ret, i);
2670d666 8625 }
ffc61ed2 8626 }
8ada0baa 8627 }
ffc61ed2 8628 else
8ada0baa 8629#endif
07be1b83
YO
8630 for (i = prevvalue; i <= ceilvalue; i++) {
8631 if (!ANYOF_BITMAP_TEST(ret,i)) {
8632 stored++;
8633 ANYOF_BITMAP_SET(ret, i);
8634 }
8635 }
3568d838 8636 }
a5961de5 8637 if (value > 255 || UTF) {
1df70142
AL
8638 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8639 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8640 stored+=2; /* can't optimize this class */
ffc61ed2 8641 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8642 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8643 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8644 prevnatvalue, natvalue);
8645 }
8646 else if (prevnatvalue == natvalue) {
8647 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8648 if (FOLD) {
89ebb4a3 8649 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8650 STRLEN foldlen;
1df70142 8651 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8652
e294cc5d
JH
8653#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8654 if (RExC_precomp[0] == ':' &&
8655 RExC_precomp[1] == '[' &&
8656 (f == 0xDF || f == 0x92)) {
8657 f = NATIVE_TO_UNI(f);
8658 }
8659#endif
c840d2a2
JH
8660 /* If folding and foldable and a single
8661 * character, insert also the folded version
8662 * to the charclass. */
9e55ce06 8663 if (f != value) {
e294cc5d
JH
8664#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8665 if ((RExC_precomp[0] == ':' &&
8666 RExC_precomp[1] == '[' &&
8667 (f == 0xA2 &&
8668 (value == 0xFB05 || value == 0xFB06))) ?
8669 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8670 foldlen == (STRLEN)UNISKIP(f) )
8671#else
eb160463 8672 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8673#endif
9e55ce06
JH
8674 Perl_sv_catpvf(aTHX_ listsv,
8675 "%04"UVxf"\n", f);
8676 else {
8677 /* Any multicharacter foldings
8678 * require the following transform:
8679 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8680 * where E folds into "pq" and F folds
8681 * into "rst", all other characters
8682 * fold to single characters. We save
8683 * away these multicharacter foldings,
8684 * to be later saved as part of the
8685 * additional "s" data. */
8686 SV *sv;
8687
8688 if (!unicode_alternate)
8689 unicode_alternate = newAV();
740cce10
NC
8690 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8691 TRUE);
9e55ce06
JH
8692 av_push(unicode_alternate, sv);
8693 }
8694 }
254ba52a 8695
60a8b682
JH
8696 /* If folding and the value is one of the Greek
8697 * sigmas insert a few more sigmas to make the
8698 * folding rules of the sigmas to work right.
8699 * Note that not all the possible combinations
8700 * are handled here: some of them are handled
9e55ce06
JH
8701 * by the standard folding rules, and some of
8702 * them (literal or EXACTF cases) are handled
8703 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8704 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8705 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8706 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8707 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8708 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8709 }
8710 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8711 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8712 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8713 }
8714 }
ffc61ed2 8715 }
1b2d223b
JH
8716#ifdef EBCDIC
8717 literal_endpoint = 0;
8718#endif
8ada0baa 8719 }
ffc61ed2
JH
8720
8721 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8722 }
ffc61ed2 8723
ffc61ed2 8724
7f6f358c
YO
8725
8726 if (SIZE_ONLY)
8727 return ret;
8728 /****** !SIZE_ONLY AFTER HERE *********/
8729
3b57cd43 8730 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8731 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8732 ) {
8733 /* optimize single char class to an EXACT node
8734 but *only* when its not a UTF/high char */
07be1b83
YO
8735 const char * cur_parse= RExC_parse;
8736 RExC_emit = (regnode *)orig_emit;
8737 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8738 ret = reg_node(pRExC_state,
8739 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8740 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8741 *STRING(ret)= (char)value;
8742 STR_LEN(ret)= 1;
8743 RExC_emit += STR_SZ(1);
ef8d46e8 8744 SvREFCNT_dec(listsv);
7f6f358c
YO
8745 return ret;
8746 }
ae5c130c 8747 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8748 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8749 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8750 ) {
a0ed51b3 8751 for (value = 0; value < 256; ++value) {
936ed897 8752 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8753 UV fold = PL_fold[value];
ffc61ed2
JH
8754
8755 if (fold != value)
8756 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8757 }
8758 }
936ed897 8759 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8760 }
ffc61ed2 8761
ae5c130c 8762 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8763 if (optimize_invert &&
ffc61ed2
JH
8764 /* If the only flag is inversion. */
8765 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8766 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8767 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8768 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8769 }
7f6f358c 8770 {
097eb12c 8771 AV * const av = newAV();
ffc61ed2 8772 SV *rv;
9e55ce06 8773 /* The 0th element stores the character class description
6a0407ee 8774 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8775 * to initialize the appropriate swash (which gets stored in
8776 * the 1st element), and also useful for dumping the regnode.
8777 * The 2nd element stores the multicharacter foldings,
6a0407ee 8778 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8779 av_store(av, 0, listsv);
8780 av_store(av, 1, NULL);
ad64d0ec
NC
8781 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8782 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8783 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8784 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8785 ARG_SET(ret, n);
a0ed51b3 8786 }
a0ed51b3
LW
8787 return ret;
8788}
89836f1f
YO
8789#undef _C_C_T_
8790
a0ed51b3 8791
bcdf7404
YO
8792/* reg_skipcomment()
8793
8794 Absorbs an /x style # comments from the input stream.
8795 Returns true if there is more text remaining in the stream.
8796 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8797 terminates the pattern without including a newline.
8798
8799 Note its the callers responsibility to ensure that we are
8800 actually in /x mode
8801
8802*/
8803
8804STATIC bool
8805S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8806{
8807 bool ended = 0;
7918f24d
NC
8808
8809 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8810
bcdf7404
YO
8811 while (RExC_parse < RExC_end)
8812 if (*RExC_parse++ == '\n') {
8813 ended = 1;
8814 break;
8815 }
8816 if (!ended) {
8817 /* we ran off the end of the pattern without ending
8818 the comment, so we have to add an \n when wrapping */
8819 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8820 return 0;
8821 } else
8822 return 1;
8823}
8824
8825/* nextchar()
8826
f6e17a84 8827 Advance that parse position, and optionally absorbs
bcdf7404
YO
8828 "whitespace" from the inputstream.
8829
8830 Without /x "whitespace" means (?#...) style comments only,
8831 with /x this means (?#...) and # comments and whitespace proper.
8832
8833 Returns the RExC_parse point from BEFORE the scan occurs.
8834
8835 This is the /x friendly way of saying RExC_parse++.
8836*/
8837
76e3520e 8838STATIC char*
830247a4 8839S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8840{
097eb12c 8841 char* const retval = RExC_parse++;
a0d0e21e 8842
7918f24d
NC
8843 PERL_ARGS_ASSERT_NEXTCHAR;
8844
4633a7c4 8845 for (;;) {
830247a4
IZ
8846 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8847 RExC_parse[2] == '#') {
e994fd66
AE
8848 while (*RExC_parse != ')') {
8849 if (RExC_parse == RExC_end)
8850 FAIL("Sequence (?#... not terminated");
830247a4 8851 RExC_parse++;
e994fd66 8852 }
830247a4 8853 RExC_parse++;
4633a7c4
LW
8854 continue;
8855 }
bbe252da 8856 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8857 if (isSPACE(*RExC_parse)) {
8858 RExC_parse++;
748a9306
LW
8859 continue;
8860 }
830247a4 8861 else if (*RExC_parse == '#') {
bcdf7404
YO
8862 if ( reg_skipcomment( pRExC_state ) )
8863 continue;
748a9306 8864 }
748a9306 8865 }
4633a7c4 8866 return retval;
a0d0e21e 8867 }
a687059c
LW
8868}
8869
8870/*
c277df42 8871- reg_node - emit a node
a0d0e21e 8872*/
76e3520e 8873STATIC regnode * /* Location. */
830247a4 8874S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8875{
97aff369 8876 dVAR;
c277df42 8877 register regnode *ptr;
504618e9 8878 regnode * const ret = RExC_emit;
07be1b83 8879 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8880
7918f24d
NC
8881 PERL_ARGS_ASSERT_REG_NODE;
8882
c277df42 8883 if (SIZE_ONLY) {
830247a4
IZ
8884 SIZE_ALIGN(RExC_size);
8885 RExC_size += 1;
a0d0e21e
LW
8886 return(ret);
8887 }
3b57cd43
YO
8888 if (RExC_emit >= RExC_emit_bound)
8889 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8890
c277df42 8891 NODE_ALIGN_FILL(ret);
a0d0e21e 8892 ptr = ret;
c277df42 8893 FILL_ADVANCE_NODE(ptr, op);
7122b237 8894#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8895 if (RExC_offsets) { /* MJD */
07be1b83 8896 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8897 "reg_node", __LINE__,
13d6edb4 8898 PL_reg_name[op],
07be1b83
YO
8899 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8900 ? "Overwriting end of array!\n" : "OK",
8901 (UV)(RExC_emit - RExC_emit_start),
8902 (UV)(RExC_parse - RExC_start),
8903 (UV)RExC_offsets[0]));
ccb2c380 8904 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8905 }
7122b237 8906#endif
830247a4 8907 RExC_emit = ptr;
a0d0e21e 8908 return(ret);
a687059c
LW
8909}
8910
8911/*
a0d0e21e
LW
8912- reganode - emit a node with an argument
8913*/
76e3520e 8914STATIC regnode * /* Location. */
830247a4 8915S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8916{
97aff369 8917 dVAR;
c277df42 8918 register regnode *ptr;
504618e9 8919 regnode * const ret = RExC_emit;
07be1b83 8920 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8921
7918f24d
NC
8922 PERL_ARGS_ASSERT_REGANODE;
8923
c277df42 8924 if (SIZE_ONLY) {
830247a4
IZ
8925 SIZE_ALIGN(RExC_size);
8926 RExC_size += 2;
6bda09f9
YO
8927 /*
8928 We can't do this:
8929
8930 assert(2==regarglen[op]+1);
8931
8932 Anything larger than this has to allocate the extra amount.
8933 If we changed this to be:
8934
8935 RExC_size += (1 + regarglen[op]);
8936
8937 then it wouldn't matter. Its not clear what side effect
8938 might come from that so its not done so far.
8939 -- dmq
8940 */
a0d0e21e
LW
8941 return(ret);
8942 }
3b57cd43
YO
8943 if (RExC_emit >= RExC_emit_bound)
8944 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8945
c277df42 8946 NODE_ALIGN_FILL(ret);
a0d0e21e 8947 ptr = ret;
c277df42 8948 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8949#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8950 if (RExC_offsets) { /* MJD */
07be1b83 8951 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8952 "reganode",
ccb2c380 8953 __LINE__,
13d6edb4 8954 PL_reg_name[op],
07be1b83 8955 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8956 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8957 (UV)(RExC_emit - RExC_emit_start),
8958 (UV)(RExC_parse - RExC_start),
8959 (UV)RExC_offsets[0]));
ccb2c380 8960 Set_Cur_Node_Offset;
fac92740 8961 }
7122b237 8962#endif
830247a4 8963 RExC_emit = ptr;
a0d0e21e 8964 return(ret);
fe14fcc3
LW
8965}
8966
8967/*
cd439c50 8968- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8969*/
71207a34
AL
8970STATIC STRLEN
8971S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8972{
97aff369 8973 dVAR;
7918f24d
NC
8974
8975 PERL_ARGS_ASSERT_REGUNI;
8976
71207a34 8977 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8978}
8979
8980/*
a0d0e21e
LW
8981- reginsert - insert an operator in front of already-emitted operand
8982*
8983* Means relocating the operand.
8984*/
76e3520e 8985STATIC void
6bda09f9 8986S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8987{
97aff369 8988 dVAR;
c277df42
IZ
8989 register regnode *src;
8990 register regnode *dst;
8991 register regnode *place;
504618e9 8992 const int offset = regarglen[(U8)op];
6bda09f9 8993 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8994 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8995
8996 PERL_ARGS_ASSERT_REGINSERT;
def51078 8997 PERL_UNUSED_ARG(depth);
22c35a8c 8998/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8999 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9000 if (SIZE_ONLY) {
6bda09f9 9001 RExC_size += size;
a0d0e21e
LW
9002 return;
9003 }
a687059c 9004
830247a4 9005 src = RExC_emit;
6bda09f9 9006 RExC_emit += size;
830247a4 9007 dst = RExC_emit;
40d049e4 9008 if (RExC_open_parens) {
6bda09f9 9009 int paren;
3b57cd43 9010 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9011 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9012 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9013 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9014 RExC_open_parens[paren] += size;
9015 } else {
3b57cd43 9016 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9017 }
9018 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9019 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9020 RExC_close_parens[paren] += size;
9021 } else {
3b57cd43 9022 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9023 }
9024 }
6bda09f9 9025 }
40d049e4 9026
fac92740 9027 while (src > opnd) {
c277df42 9028 StructCopy(--src, --dst, regnode);
7122b237 9029#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9030 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9031 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9032 "reg_insert",
ccb2c380 9033 __LINE__,
13d6edb4 9034 PL_reg_name[op],
07be1b83
YO
9035 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9036 ? "Overwriting end of array!\n" : "OK",
9037 (UV)(src - RExC_emit_start),
9038 (UV)(dst - RExC_emit_start),
9039 (UV)RExC_offsets[0]));
ccb2c380
MP
9040 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9041 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9042 }
7122b237 9043#endif
fac92740
MJD
9044 }
9045
a0d0e21e
LW
9046
9047 place = opnd; /* Op node, where operand used to be. */
7122b237 9048#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9049 if (RExC_offsets) { /* MJD */
07be1b83 9050 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9051 "reginsert",
ccb2c380 9052 __LINE__,
13d6edb4 9053 PL_reg_name[op],
07be1b83 9054 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9055 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9056 (UV)(place - RExC_emit_start),
9057 (UV)(RExC_parse - RExC_start),
786e8c11 9058 (UV)RExC_offsets[0]));
ccb2c380 9059 Set_Node_Offset(place, RExC_parse);
45948336 9060 Set_Node_Length(place, 1);
fac92740 9061 }
7122b237 9062#endif
c277df42
IZ
9063 src = NEXTOPER(place);
9064 FILL_ADVANCE_NODE(place, op);
9065 Zero(src, offset, regnode);
a687059c
LW
9066}
9067
9068/*
c277df42 9069- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9070- SEE ALSO: regtail_study
a0d0e21e 9071*/
097eb12c 9072/* TODO: All three parms should be const */
76e3520e 9073STATIC void
3dab1dad 9074S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9075{
97aff369 9076 dVAR;
c277df42 9077 register regnode *scan;
72f13be8 9078 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9079
9080 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9081#ifndef DEBUGGING
9082 PERL_UNUSED_ARG(depth);
9083#endif
a0d0e21e 9084
c277df42 9085 if (SIZE_ONLY)
a0d0e21e
LW
9086 return;
9087
9088 /* Find last node. */
9089 scan = p;
9090 for (;;) {
504618e9 9091 regnode * const temp = regnext(scan);
3dab1dad
YO
9092 DEBUG_PARSE_r({
9093 SV * const mysv=sv_newmortal();
9094 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9095 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9096 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9097 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9098 (temp == NULL ? "->" : ""),
13d6edb4 9099 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9100 );
3dab1dad
YO
9101 });
9102 if (temp == NULL)
9103 break;
9104 scan = temp;
9105 }
9106
9107 if (reg_off_by_arg[OP(scan)]) {
9108 ARG_SET(scan, val - scan);
9109 }
9110 else {
9111 NEXT_OFF(scan) = val - scan;
9112 }
9113}
9114
07be1b83 9115#ifdef DEBUGGING
3dab1dad
YO
9116/*
9117- regtail_study - set the next-pointer at the end of a node chain of p to val.
9118- Look for optimizable sequences at the same time.
9119- currently only looks for EXACT chains.
07be1b83
YO
9120
9121This is expermental code. The idea is to use this routine to perform
9122in place optimizations on branches and groups as they are constructed,
9123with the long term intention of removing optimization from study_chunk so
9124that it is purely analytical.
9125
9126Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9127to control which is which.
9128
3dab1dad
YO
9129*/
9130/* TODO: All four parms should be const */
07be1b83 9131
3dab1dad
YO
9132STATIC U8
9133S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9134{
9135 dVAR;
9136 register regnode *scan;
07be1b83
YO
9137 U8 exact = PSEUDO;
9138#ifdef EXPERIMENTAL_INPLACESCAN
9139 I32 min = 0;
9140#endif
3dab1dad
YO
9141 GET_RE_DEBUG_FLAGS_DECL;
9142
7918f24d
NC
9143 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9144
07be1b83 9145
3dab1dad
YO
9146 if (SIZE_ONLY)
9147 return exact;
9148
9149 /* Find last node. */
9150
9151 scan = p;
9152 for (;;) {
9153 regnode * const temp = regnext(scan);
07be1b83
YO
9154#ifdef EXPERIMENTAL_INPLACESCAN
9155 if (PL_regkind[OP(scan)] == EXACT)
9156 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9157 return EXACT;
9158#endif
3dab1dad
YO
9159 if ( exact ) {
9160 switch (OP(scan)) {
9161 case EXACT:
9162 case EXACTF:
9163 case EXACTFL:
9164 if( exact == PSEUDO )
9165 exact= OP(scan);
07be1b83
YO
9166 else if ( exact != OP(scan) )
9167 exact= 0;
3dab1dad
YO
9168 case NOTHING:
9169 break;
9170 default:
9171 exact= 0;
9172 }
9173 }
9174 DEBUG_PARSE_r({
9175 SV * const mysv=sv_newmortal();
9176 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9177 regprop(RExC_rx, mysv, scan);
eaf3ca90 9178 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9179 SvPV_nolen_const(mysv),
eaf3ca90 9180 REG_NODE_NUM(scan),
13d6edb4 9181 PL_reg_name[exact]);
3dab1dad 9182 });
a0d0e21e
LW
9183 if (temp == NULL)
9184 break;
9185 scan = temp;
9186 }
07be1b83
YO
9187 DEBUG_PARSE_r({
9188 SV * const mysv_val=sv_newmortal();
9189 DEBUG_PARSE_MSG("");
9190 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9191 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9192 SvPV_nolen_const(mysv_val),
9193 (IV)REG_NODE_NUM(val),
9194 (IV)(val - scan)
07be1b83
YO
9195 );
9196 });
c277df42
IZ
9197 if (reg_off_by_arg[OP(scan)]) {
9198 ARG_SET(scan, val - scan);
a0ed51b3
LW
9199 }
9200 else {
c277df42
IZ
9201 NEXT_OFF(scan) = val - scan;
9202 }
3dab1dad
YO
9203
9204 return exact;
a687059c 9205}
07be1b83 9206#endif
a687059c
LW
9207
9208/*
fd181c75 9209 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9210 */
f7819f85 9211#ifdef DEBUGGING
c33269f7 9212static void
7918f24d
NC
9213S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9214{
f7819f85
A
9215 int bit;
9216 int set=0;
7918f24d 9217
f7819f85
A
9218 for (bit=0; bit<32; bit++) {
9219 if (flags & (1<<bit)) {
9220 if (!set++ && lead)
9221 PerlIO_printf(Perl_debug_log, "%s",lead);
9222 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9223 }
9224 }
9225 if (lead) {
9226 if (set)
9227 PerlIO_printf(Perl_debug_log, "\n");
9228 else
9229 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9230 }
9231}
9232#endif
9233
a687059c 9234void
097eb12c 9235Perl_regdump(pTHX_ const regexp *r)
a687059c 9236{
35ff7856 9237#ifdef DEBUGGING
97aff369 9238 dVAR;
c445ea15 9239 SV * const sv = sv_newmortal();
ab3bbdeb 9240 SV *dsv= sv_newmortal();
f8fc2ecf 9241 RXi_GET_DECL(r,ri);
f7819f85 9242 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9243
7918f24d
NC
9244 PERL_ARGS_ASSERT_REGDUMP;
9245
f8fc2ecf 9246 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9247
9248 /* Header fields of interest. */
ab3bbdeb
YO
9249 if (r->anchored_substr) {
9250 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9251 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9252 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9253 "anchored %s%s at %"IVdf" ",
9254 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9255 (IV)r->anchored_offset);
ab3bbdeb
YO
9256 } else if (r->anchored_utf8) {
9257 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9258 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9259 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9260 "anchored utf8 %s%s at %"IVdf" ",
9261 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9262 (IV)r->anchored_offset);
ab3bbdeb
YO
9263 }
9264 if (r->float_substr) {
9265 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9266 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9267 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9268 "floating %s%s at %"IVdf"..%"UVuf" ",
9269 s, RE_SV_TAIL(r->float_substr),
7b0972df 9270 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9271 } else if (r->float_utf8) {
9272 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9273 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9274 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9275 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9276 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9277 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9278 }
33b8afdf 9279 if (r->check_substr || r->check_utf8)
b81d288d 9280 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9281 (const char *)
9282 (r->check_substr == r->float_substr
9283 && r->check_utf8 == r->float_utf8
9284 ? "(checking floating" : "(checking anchored"));
bbe252da 9285 if (r->extflags & RXf_NOSCAN)
c277df42 9286 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9287 if (r->extflags & RXf_CHECK_ALL)
c277df42 9288 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9289 if (r->check_substr || r->check_utf8)
c277df42
IZ
9290 PerlIO_printf(Perl_debug_log, ") ");
9291
f8fc2ecf
YO
9292 if (ri->regstclass) {
9293 regprop(r, sv, ri->regstclass);
1de06328 9294 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9295 }
bbe252da 9296 if (r->extflags & RXf_ANCH) {
774d564b 9297 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9298 if (r->extflags & RXf_ANCH_BOL)
774d564b 9299 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9300 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9301 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9302 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9303 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9304 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9305 PerlIO_printf(Perl_debug_log, "(GPOS)");
9306 PerlIO_putc(Perl_debug_log, ' ');
9307 }
bbe252da 9308 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9309 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9310 if (r->intflags & PREGf_SKIP)
760ac839 9311 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9312 if (r->intflags & PREGf_IMPLICIT)
760ac839 9313 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9314 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9315 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9316 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9317 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9318 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9319#else
7918f24d 9320 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9321 PERL_UNUSED_CONTEXT;
65e66c80 9322 PERL_UNUSED_ARG(r);
17c3b450 9323#endif /* DEBUGGING */
a687059c
LW
9324}
9325
9326/*
a0d0e21e
LW
9327- regprop - printable representation of opcode
9328*/
3339dfd8
YO
9329#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9330STMT_START { \
9331 if (do_sep) { \
9332 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9333 if (flags & ANYOF_INVERT) \
9334 /*make sure the invert info is in each */ \
9335 sv_catpvs(sv, "^"); \
9336 do_sep = 0; \
9337 } \
9338} STMT_END
9339
46fc3d4c 9340void
32fc9b6a 9341Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9342{
35ff7856 9343#ifdef DEBUGGING
97aff369 9344 dVAR;
9b155405 9345 register int k;
f8fc2ecf 9346 RXi_GET_DECL(prog,progi);
1de06328 9347 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9348
7918f24d 9349 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9350
76f68e9b 9351 sv_setpvs(sv, "");
8aa23a47 9352
03363afd 9353 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9354 /* It would be nice to FAIL() here, but this may be called from
9355 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9356 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9357 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9358
3dab1dad 9359 k = PL_regkind[OP(o)];
9b155405 9360
2a782b5b 9361 if (k == EXACT) {
f92a2122 9362 sv_catpvs(sv, " ");
ab3bbdeb
YO
9363 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9364 * is a crude hack but it may be the best for now since
9365 * we have no flag "this EXACTish node was UTF-8"
9366 * --jhi */
f92a2122
NC
9367 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9368 PERL_PV_ESCAPE_UNI_DETECT |
9369 PERL_PV_PRETTY_ELLIPSES |
9370 PERL_PV_PRETTY_LTGT |
9371 PERL_PV_PRETTY_NOCLEAR
9372 );
bb263b4e 9373 } else if (k == TRIE) {
3dab1dad 9374 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9375 * progi->data isn't available here */
1de06328 9376 const char op = OP(o);
647f639f 9377 const U32 n = ARG(o);
1de06328 9378 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9379 (reg_ac_data *)progi->data->data[n] :
1de06328 9380 NULL;
3251b653
NC
9381 const reg_trie_data * const trie
9382 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9383
13d6edb4 9384 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9385 DEBUG_TRIE_COMPILE_r(
9386 Perl_sv_catpvf(aTHX_ sv,
9387 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9388 (UV)trie->startstate,
1e2e3d02 9389 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9390 (UV)trie->wordcount,
9391 (UV)trie->minlen,
9392 (UV)trie->maxlen,
9393 (UV)TRIE_CHARCOUNT(trie),
9394 (UV)trie->uniquecharcount
9395 )
9396 );
9397 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9398 int i;
9399 int rangestart = -1;
f46cb337 9400 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9401 sv_catpvs(sv, "[");
1de06328
YO
9402 for (i = 0; i <= 256; i++) {
9403 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9404 if (rangestart == -1)
9405 rangestart = i;
9406 } else if (rangestart != -1) {
9407 if (i <= rangestart + 3)
9408 for (; rangestart < i; rangestart++)
9409 put_byte(sv, rangestart);
9410 else {
9411 put_byte(sv, rangestart);
9412 sv_catpvs(sv, "-");
9413 put_byte(sv, i - 1);
9414 }
9415 rangestart = -1;
9416 }
9417 }
f3a2811a 9418 sv_catpvs(sv, "]");
1de06328
YO
9419 }
9420
a3621e74 9421 } else if (k == CURLY) {
cb434fcc 9422 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9423 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9424 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9425 }
2c2d71f5
JH
9426 else if (k == WHILEM && o->flags) /* Ordinal/of */
9427 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9428 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9429 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9430 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9431 if ( k != REF || OP(o) < NREF) {
502c6561 9432 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9433 SV **name= av_fetch(list, ARG(o), 0 );
9434 if (name)
9435 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9436 }
9437 else {
502c6561 9438 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9439 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9440 I32 *nums=(I32*)SvPVX(sv_dat);
9441 SV **name= av_fetch(list, nums[0], 0 );
9442 I32 n;
9443 if (name) {
9444 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9445 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9446 (n ? "," : ""), (IV)nums[n]);
9447 }
9448 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9449 }
1f1031fe 9450 }
ee9b8eae 9451 }
1f1031fe 9452 } else if (k == GOSUB)
6bda09f9 9453 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9454 else if (k == VERB) {
9455 if (!o->flags)
9456 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9457 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9458 } else if (k == LOGICAL)
04ebc1ab 9459 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9460 else if (k == FOLDCHAR)
df44d732 9461 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9462 else if (k == ANYOF) {
9463 int i, rangestart = -1;
2d03de9c 9464 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9465 int do_sep = 0;
0bd48802
AL
9466
9467 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9468 static const char * const anyofs[] = {
653099ff
GS
9469 "\\w",
9470 "\\W",
9471 "\\s",
9472 "\\S",
9473 "\\d",
9474 "\\D",
9475 "[:alnum:]",
9476 "[:^alnum:]",
9477 "[:alpha:]",
9478 "[:^alpha:]",
9479 "[:ascii:]",
9480 "[:^ascii:]",
24d786f4
YO
9481 "[:cntrl:]",
9482 "[:^cntrl:]",
653099ff
GS
9483 "[:graph:]",
9484 "[:^graph:]",
9485 "[:lower:]",
9486 "[:^lower:]",
9487 "[:print:]",
9488 "[:^print:]",
9489 "[:punct:]",
9490 "[:^punct:]",
9491 "[:upper:]",
aaa51d5e 9492 "[:^upper:]",
653099ff 9493 "[:xdigit:]",
aaa51d5e
JF
9494 "[:^xdigit:]",
9495 "[:space:]",
9496 "[:^space:]",
9497 "[:blank:]",
9498 "[:^blank:]"
653099ff
GS
9499 };
9500
19860706 9501 if (flags & ANYOF_LOCALE)
396482e1 9502 sv_catpvs(sv, "{loc}");
19860706 9503 if (flags & ANYOF_FOLD)
396482e1 9504 sv_catpvs(sv, "{i}");
653099ff 9505 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9506 if (flags & ANYOF_INVERT)
396482e1 9507 sv_catpvs(sv, "^");
3339dfd8
YO
9508
9509 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9510 for (i = 0; i <= 256; i++) {
9511 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9512 if (rangestart == -1)
9513 rangestart = i;
9514 } else if (rangestart != -1) {
9515 if (i <= rangestart + 3)
9516 for (; rangestart < i; rangestart++)
653099ff 9517 put_byte(sv, rangestart);
ffc61ed2
JH
9518 else {
9519 put_byte(sv, rangestart);
396482e1 9520 sv_catpvs(sv, "-");
ffc61ed2 9521 put_byte(sv, i - 1);
653099ff 9522 }
24d786f4 9523 do_sep = 1;
ffc61ed2 9524 rangestart = -1;
653099ff 9525 }
847a199f 9526 }
3339dfd8
YO
9527
9528 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9529 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9530 if (o->flags & ANYOF_CLASS)
bb7a0f54 9531 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9532 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9533 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9534 do_sep = 1;
9535 }
9536
3339dfd8
YO
9537 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9538
9539 /* output information about the unicode matching */
ffc61ed2 9540 if (flags & ANYOF_UNICODE)
396482e1 9541 sv_catpvs(sv, "{unicode}");
1aa99e6b 9542 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9543 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9544
9545 {
9546 SV *lv;
32fc9b6a 9547 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9548
ffc61ed2
JH
9549 if (lv) {
9550 if (sw) {
89ebb4a3 9551 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9552
ffc61ed2 9553 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9554 uvchr_to_utf8(s, i);
ffc61ed2 9555
3568d838 9556 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9557 if (rangestart == -1)
9558 rangestart = i;
9559 } else if (rangestart != -1) {
ffc61ed2
JH
9560 if (i <= rangestart + 3)
9561 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9562 const U8 * const e = uvchr_to_utf8(s,rangestart);
9563 U8 *p;
9564 for(p = s; p < e; p++)
ffc61ed2
JH
9565 put_byte(sv, *p);
9566 }
9567 else {
2d03de9c
AL
9568 const U8 *e = uvchr_to_utf8(s,rangestart);
9569 U8 *p;
9570 for (p = s; p < e; p++)
ffc61ed2 9571 put_byte(sv, *p);
396482e1 9572 sv_catpvs(sv, "-");
2d03de9c
AL
9573 e = uvchr_to_utf8(s, i-1);
9574 for (p = s; p < e; p++)
1df70142 9575 put_byte(sv, *p);
ffc61ed2
JH
9576 }
9577 rangestart = -1;
9578 }
19860706 9579 }
ffc61ed2 9580
396482e1 9581 sv_catpvs(sv, "..."); /* et cetera */
19860706 9582 }
fde631ed 9583
ffc61ed2 9584 {
2e0de35c 9585 char *s = savesvpv(lv);
c445ea15 9586 char * const origs = s;
b81d288d 9587
3dab1dad
YO
9588 while (*s && *s != '\n')
9589 s++;
b81d288d 9590
ffc61ed2 9591 if (*s == '\n') {
2d03de9c 9592 const char * const t = ++s;
ffc61ed2
JH
9593
9594 while (*s) {
9595 if (*s == '\n')
9596 *s = ' ';
9597 s++;
9598 }
9599 if (s[-1] == ' ')
9600 s[-1] = 0;
9601
9602 sv_catpv(sv, t);
fde631ed 9603 }
b81d288d 9604
ffc61ed2 9605 Safefree(origs);
fde631ed
JH
9606 }
9607 }
653099ff 9608 }
ffc61ed2 9609
653099ff
GS
9610 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9611 }
9b155405 9612 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9613 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9614#else
96a5add6 9615 PERL_UNUSED_CONTEXT;
65e66c80
SP
9616 PERL_UNUSED_ARG(sv);
9617 PERL_UNUSED_ARG(o);
f9049ba1 9618 PERL_UNUSED_ARG(prog);
17c3b450 9619#endif /* DEBUGGING */
35ff7856 9620}
a687059c 9621
cad2e5aa 9622SV *
288b8c02 9623Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9624{ /* Assume that RE_INTUIT is set */
97aff369 9625 dVAR;
288b8c02 9626 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9627 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9628
9629 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9630 PERL_UNUSED_CONTEXT;
9631
a3621e74 9632 DEBUG_COMPILE_r(
cfd0369c 9633 {
2d03de9c 9634 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9635 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9636
9637 if (!PL_colorset) reginitcolors();
9638 PerlIO_printf(Perl_debug_log,
a0288114 9639 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9640 PL_colors[4],
9641 prog->check_substr ? "" : "utf8 ",
9642 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9643 s,
9644 PL_colors[1],
9645 (strlen(s) > 60 ? "..." : ""));
9646 } );
9647
33b8afdf 9648 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9649}
9650
84da74a7 9651/*
f8149455 9652 pregfree()
84da74a7 9653
f8149455
YO
9654 handles refcounting and freeing the perl core regexp structure. When
9655 it is necessary to actually free the structure the first thing it
9656 does is call the 'free' method of the regexp_engine associated to to
9657 the regexp, allowing the handling of the void *pprivate; member
9658 first. (This routine is not overridable by extensions, which is why
9659 the extensions free is called first.)
9660
9661 See regdupe and regdupe_internal if you change anything here.
84da74a7 9662*/
f8149455 9663#ifndef PERL_IN_XSUB_RE
2b69d0c2 9664void
84679df5 9665Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9666{
288b8c02
NC
9667 SvREFCNT_dec(r);
9668}
9669
9670void
9671Perl_pregfree2(pTHX_ REGEXP *rx)
9672{
27da23d5 9673 dVAR;
288b8c02 9674 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9675 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9676
7918f24d
NC
9677 PERL_ARGS_ASSERT_PREGFREE2;
9678
28d8d7f4
YO
9679 if (r->mother_re) {
9680 ReREFCNT_dec(r->mother_re);
9681 } else {
288b8c02 9682 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9683 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9684 }
9685 if (r->substrs) {
ef8d46e8
VP
9686 SvREFCNT_dec(r->anchored_substr);
9687 SvREFCNT_dec(r->anchored_utf8);
9688 SvREFCNT_dec(r->float_substr);
9689 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9690 Safefree(r->substrs);
9691 }
288b8c02 9692 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9693#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9694 SvREFCNT_dec(r->saved_copy);
ed252734 9695#endif
f0ab9afb 9696 Safefree(r->offs);
f8149455 9697}
28d8d7f4
YO
9698
9699/* reg_temp_copy()
9700
9701 This is a hacky workaround to the structural issue of match results
9702 being stored in the regexp structure which is in turn stored in
9703 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9704 could be PL_curpm in multiple contexts, and could require multiple
9705 result sets being associated with the pattern simultaneously, such
9706 as when doing a recursive match with (??{$qr})
9707
9708 The solution is to make a lightweight copy of the regexp structure
9709 when a qr// is returned from the code executed by (??{$qr}) this
9710 lightweight copy doesnt actually own any of its data except for
9711 the starp/end and the actual regexp structure itself.
9712
9713*/
9714
9715
84679df5 9716REGEXP *
f0826785 9717Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9718{
f0826785 9719 struct regexp *ret;
288b8c02 9720 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9721 register const I32 npar = r->nparens+1;
7918f24d
NC
9722
9723 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9724
f0826785
BM
9725 if (!ret_x)
9726 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9727 ret = (struct regexp *)SvANY(ret_x);
9728
288b8c02 9729 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9730 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9731 by pointing directly at the buffer, but flagging that the allocated
9732 space in the copy is zero. As we've just done a struct copy, it's now
9733 a case of zero-ing that, rather than copying the current length. */
9734 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9735 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9736 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9737 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9738 SvLEN_set(ret_x, 0);
b9ad13ac 9739 SvSTASH_set(ret_x, NULL);
703c388d 9740 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9741 Newx(ret->offs, npar, regexp_paren_pair);
9742 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9743 if (r->substrs) {
28d8d7f4 9744 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9745 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9746
9747 SvREFCNT_inc_void(ret->anchored_substr);
9748 SvREFCNT_inc_void(ret->anchored_utf8);
9749 SvREFCNT_inc_void(ret->float_substr);
9750 SvREFCNT_inc_void(ret->float_utf8);
9751
9752 /* check_substr and check_utf8, if non-NULL, point to either their
9753 anchored or float namesakes, and don't hold a second reference. */
486913e4 9754 }
288b8c02 9755 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9756#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9757 ret->saved_copy = NULL;
28d8d7f4 9758#endif
288b8c02 9759 ret->mother_re = rx;
28d8d7f4 9760
288b8c02 9761 return ret_x;
28d8d7f4 9762}
f8149455
YO
9763#endif
9764
9765/* regfree_internal()
9766
9767 Free the private data in a regexp. This is overloadable by
9768 extensions. Perl takes care of the regexp structure in pregfree(),
9769 this covers the *pprivate pointer which technically perldoesnt
9770 know about, however of course we have to handle the
9771 regexp_internal structure when no extension is in use.
9772
9773 Note this is called before freeing anything in the regexp
9774 structure.
9775 */
9776
9777void
288b8c02 9778Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9779{
9780 dVAR;
288b8c02 9781 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9782 RXi_GET_DECL(r,ri);
9783 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9784
9785 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9786
f8149455
YO
9787 DEBUG_COMPILE_r({
9788 if (!PL_colorset)
9789 reginitcolors();
9790 {
9791 SV *dsv= sv_newmortal();
3c8556c3 9792 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9793 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9794 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9795 PL_colors[4],PL_colors[5],s);
9796 }
9797 });
7122b237
YO
9798#ifdef RE_TRACK_PATTERN_OFFSETS
9799 if (ri->u.offsets)
9800 Safefree(ri->u.offsets); /* 20010421 MJD */
9801#endif
f8fc2ecf
YO
9802 if (ri->data) {
9803 int n = ri->data->count;
f3548bdc
DM
9804 PAD* new_comppad = NULL;
9805 PAD* old_comppad;
4026c95a 9806 PADOFFSET refcnt;
dfad63ad 9807
c277df42 9808 while (--n >= 0) {
261faec3 9809 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9810 switch (ri->data->what[n]) {
af534a04 9811 case 'a':
c277df42 9812 case 's':
81714fb9 9813 case 'S':
55eed653 9814 case 'u':
ad64d0ec 9815 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9816 break;
653099ff 9817 case 'f':
f8fc2ecf 9818 Safefree(ri->data->data[n]);
653099ff 9819 break;
dfad63ad 9820 case 'p':
502c6561 9821 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9822 break;
c277df42 9823 case 'o':
dfad63ad 9824 if (new_comppad == NULL)
cea2e8a9 9825 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9826 PAD_SAVE_LOCAL(old_comppad,
9827 /* Watch out for global destruction's random ordering. */
c445ea15 9828 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9829 );
b34c0dd4 9830 OP_REFCNT_LOCK;
f8fc2ecf 9831 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9832 OP_REFCNT_UNLOCK;
9833 if (!refcnt)
f8fc2ecf 9834 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9835
f3548bdc 9836 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9837 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9838 new_comppad = NULL;
c277df42
IZ
9839 break;
9840 case 'n':
9e55ce06 9841 break;
07be1b83 9842 case 'T':
be8e71aa
YO
9843 { /* Aho Corasick add-on structure for a trie node.
9844 Used in stclass optimization only */
07be1b83 9845 U32 refcount;
f8fc2ecf 9846 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9847 OP_REFCNT_LOCK;
9848 refcount = --aho->refcount;
9849 OP_REFCNT_UNLOCK;
9850 if ( !refcount ) {
446bd890
NC
9851 PerlMemShared_free(aho->states);
9852 PerlMemShared_free(aho->fail);
446bd890
NC
9853 /* do this last!!!! */
9854 PerlMemShared_free(ri->data->data[n]);
9855 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9856 }
9857 }
9858 break;
a3621e74 9859 case 't':
07be1b83 9860 {
be8e71aa 9861 /* trie structure. */
07be1b83 9862 U32 refcount;
f8fc2ecf 9863 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9864 OP_REFCNT_LOCK;
9865 refcount = --trie->refcount;
9866 OP_REFCNT_UNLOCK;
9867 if ( !refcount ) {
446bd890 9868 PerlMemShared_free(trie->charmap);
446bd890
NC
9869 PerlMemShared_free(trie->states);
9870 PerlMemShared_free(trie->trans);
07be1b83 9871 if (trie->bitmap)
446bd890 9872 PerlMemShared_free(trie->bitmap);
786e8c11 9873 if (trie->jump)
446bd890 9874 PerlMemShared_free(trie->jump);
2e64971a 9875 PerlMemShared_free(trie->wordinfo);
446bd890
NC
9876 /* do this last!!!! */
9877 PerlMemShared_free(ri->data->data[n]);
a3621e74 9878 }
07be1b83
YO
9879 }
9880 break;
c277df42 9881 default:
f8fc2ecf 9882 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9883 }
9884 }
f8fc2ecf
YO
9885 Safefree(ri->data->what);
9886 Safefree(ri->data);
a0d0e21e 9887 }
28d8d7f4 9888
f8fc2ecf 9889 Safefree(ri);
a687059c 9890}
c277df42 9891
a09252eb
NC
9892#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9893#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
9894#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9895
9896/*
32cd70f6 9897 re_dup - duplicate a regexp.
84da74a7 9898
8233f606
DM
9899 This routine is expected to clone a given regexp structure. It is only
9900 compiled under USE_ITHREADS.
32cd70f6 9901
f8149455
YO
9902 After all of the core data stored in struct regexp is duplicated
9903 the regexp_engine.dupe method is used to copy any private data
9904 stored in the *pprivate pointer. This allows extensions to handle
9905 any duplication it needs to do.
9906
9907 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9908*/
a3c0e9ca 9909#if defined(USE_ITHREADS)
f8149455 9910#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9911void
9912Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9913{
84da74a7 9914 dVAR;
a86a1ca7 9915 I32 npar;
288b8c02
NC
9916 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9917 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9918
7918f24d
NC
9919 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9920
84da74a7 9921 npar = r->nparens+1;
f0ab9afb
NC
9922 Newx(ret->offs, npar, regexp_paren_pair);
9923 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9924 if(ret->swap) {
28d8d7f4 9925 /* no need to copy these */
f0ab9afb 9926 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9927 }
84da74a7 9928
6057429f 9929 if (ret->substrs) {
32cd70f6
NC
9930 /* Do it this way to avoid reading from *r after the StructCopy().
9931 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9932 cache, it doesn't matter. */
66b1de87
NC
9933 const bool anchored = r->check_substr
9934 ? r->check_substr == r->anchored_substr
9935 : r->check_utf8 == r->anchored_utf8;
785a26d5 9936 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9937 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9938
32cd70f6
NC
9939 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9940 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9941 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9942 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9943
32cd70f6
NC
9944 /* check_substr and check_utf8, if non-NULL, point to either their
9945 anchored or float namesakes, and don't hold a second reference. */
9946
9947 if (ret->check_substr) {
9948 if (anchored) {
9949 assert(r->check_utf8 == r->anchored_utf8);
9950 ret->check_substr = ret->anchored_substr;
9951 ret->check_utf8 = ret->anchored_utf8;
9952 } else {
9953 assert(r->check_substr == r->float_substr);
9954 assert(r->check_utf8 == r->float_utf8);
9955 ret->check_substr = ret->float_substr;
9956 ret->check_utf8 = ret->float_utf8;
9957 }
66b1de87
NC
9958 } else if (ret->check_utf8) {
9959 if (anchored) {
9960 ret->check_utf8 = ret->anchored_utf8;
9961 } else {
9962 ret->check_utf8 = ret->float_utf8;
9963 }
32cd70f6 9964 }
6057429f 9965 }
f8149455 9966
5daac39c 9967 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9968
6057429f 9969 if (ret->pprivate)
288b8c02 9970 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9971
288b8c02 9972 if (RX_MATCH_COPIED(dstr))
6057429f 9973 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9974 else
9975 ret->subbeg = NULL;
9976#ifdef PERL_OLD_COPY_ON_WRITE
9977 ret->saved_copy = NULL;
9978#endif
6057429f 9979
c2123ae3
NC
9980 if (ret->mother_re) {
9981 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9982 /* Our storage points directly to our mother regexp, but that's
9983 1: a buffer in a different thread
9984 2: something we no longer hold a reference on
9985 so we need to copy it locally. */
9986 /* Note we need to sue SvCUR() on our mother_re, because it, in
9987 turn, may well be pointing to its own mother_re. */
9988 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9989 SvCUR(ret->mother_re)+1));
9990 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9991 }
9992 ret->mother_re = NULL;
9993 }
6057429f 9994 ret->gofs = 0;
f8149455
YO
9995}
9996#endif /* PERL_IN_XSUB_RE */
9997
9998/*
9999 regdupe_internal()
10000
10001 This is the internal complement to regdupe() which is used to copy
10002 the structure pointed to by the *pprivate pointer in the regexp.
10003 This is the core version of the extension overridable cloning hook.
10004 The regexp structure being duplicated will be copied by perl prior
10005 to this and will be provided as the regexp *r argument, however
10006 with the /old/ structures pprivate pointer value. Thus this routine
10007 may override any copying normally done by perl.
10008
10009 It returns a pointer to the new regexp_internal structure.
10010*/
10011
10012void *
288b8c02 10013Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10014{
10015 dVAR;
288b8c02 10016 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10017 regexp_internal *reti;
10018 int len, npar;
10019 RXi_GET_DECL(r,ri);
7918f24d
NC
10020
10021 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10022
10023 npar = r->nparens+1;
7122b237 10024 len = ProgLen(ri);
f8149455 10025
45cf4570 10026 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10027 Copy(ri->program, reti->program, len+1, regnode);
10028
f8149455 10029
f8fc2ecf 10030 reti->regstclass = NULL;
bcdf7404 10031
f8fc2ecf 10032 if (ri->data) {
84da74a7 10033 struct reg_data *d;
f8fc2ecf 10034 const int count = ri->data->count;
84da74a7
YO
10035 int i;
10036
10037 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10038 char, struct reg_data);
10039 Newx(d->what, count, U8);
10040
10041 d->count = count;
10042 for (i = 0; i < count; i++) {
f8fc2ecf 10043 d->what[i] = ri->data->what[i];
84da74a7 10044 switch (d->what[i]) {
af534a04 10045 /* legal options are one of: sSfpontTua
84da74a7 10046 see also regcomp.h and pregfree() */
af534a04 10047 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10048 case 's':
81714fb9 10049 case 'S':
0536c0a7 10050 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10051 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10052 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10053 break;
84da74a7
YO
10054 case 'f':
10055 /* This is cheating. */
10056 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10057 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10058 struct regnode_charclass_class);
f8fc2ecf 10059 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10060 break;
10061 case 'o':
bbe252da
YO
10062 /* Compiled op trees are readonly and in shared memory,
10063 and can thus be shared without duplication. */
84da74a7 10064 OP_REFCNT_LOCK;
f8fc2ecf 10065 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10066 OP_REFCNT_UNLOCK;
10067 break;
23eab42c
NC
10068 case 'T':
10069 /* Trie stclasses are readonly and can thus be shared
10070 * without duplication. We free the stclass in pregfree
10071 * when the corresponding reg_ac_data struct is freed.
10072 */
10073 reti->regstclass= ri->regstclass;
10074 /* Fall through */
84da74a7 10075 case 't':
84da74a7 10076 OP_REFCNT_LOCK;
0536c0a7 10077 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10078 OP_REFCNT_UNLOCK;
0536c0a7
NC
10079 /* Fall through */
10080 case 'n':
10081 d->data[i] = ri->data->data[i];
84da74a7 10082 break;
84da74a7 10083 default:
f8fc2ecf 10084 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10085 }
10086 }
10087
f8fc2ecf 10088 reti->data = d;
84da74a7
YO
10089 }
10090 else
f8fc2ecf 10091 reti->data = NULL;
84da74a7 10092
cde0cee5
YO
10093 reti->name_list_idx = ri->name_list_idx;
10094
7122b237
YO
10095#ifdef RE_TRACK_PATTERN_OFFSETS
10096 if (ri->u.offsets) {
10097 Newx(reti->u.offsets, 2*len+1, U32);
10098 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10099 }
10100#else
10101 SetProgLen(reti,len);
10102#endif
10103
f8149455 10104 return (void*)reti;
84da74a7 10105}
f8149455
YO
10106
10107#endif /* USE_ITHREADS */
84da74a7 10108
f8149455 10109#ifndef PERL_IN_XSUB_RE
bcdf7404 10110
c277df42
IZ
10111/*
10112 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10113 */
10114regnode *
864dbfa3 10115Perl_regnext(pTHX_ register regnode *p)
c277df42 10116{
97aff369 10117 dVAR;
c277df42
IZ
10118 register I32 offset;
10119
f8fc2ecf 10120 if (!p)
c277df42
IZ
10121 return(NULL);
10122
35db910f
KW
10123 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10124 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10125 }
10126
c277df42
IZ
10127 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10128 if (offset == 0)
10129 return(NULL);
10130
c277df42 10131 return(p+offset);
c277df42 10132}
76234dfb 10133#endif
c277df42 10134
01f988be 10135STATIC void
cea2e8a9 10136S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10137{
10138 va_list args;
10139 STRLEN l1 = strlen(pat1);
10140 STRLEN l2 = strlen(pat2);
10141 char buf[512];
06bf62c7 10142 SV *msv;
73d840c0 10143 const char *message;
c277df42 10144
7918f24d
NC
10145 PERL_ARGS_ASSERT_RE_CROAK2;
10146
c277df42
IZ
10147 if (l1 > 510)
10148 l1 = 510;
10149 if (l1 + l2 > 510)
10150 l2 = 510 - l1;
10151 Copy(pat1, buf, l1 , char);
10152 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10153 buf[l1 + l2] = '\n';
10154 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10155#ifdef I_STDARG
10156 /* ANSI variant takes additional second argument */
c277df42 10157 va_start(args, pat2);
8736538c
AS
10158#else
10159 va_start(args);
10160#endif
5a844595 10161 msv = vmess(buf, &args);
c277df42 10162 va_end(args);
cfd0369c 10163 message = SvPV_const(msv,l1);
c277df42
IZ
10164 if (l1 > 512)
10165 l1 = 512;
10166 Copy(message, buf, l1 , char);
197cf9b9 10167 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10168 Perl_croak(aTHX_ "%s", buf);
c277df42 10169}
a0ed51b3
LW
10170
10171/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10172
76234dfb 10173#ifndef PERL_IN_XSUB_RE
a0ed51b3 10174void
864dbfa3 10175Perl_save_re_context(pTHX)
b81d288d 10176{
97aff369 10177 dVAR;
1ade1aa1
NC
10178
10179 struct re_save_state *state;
10180
10181 SAVEVPTR(PL_curcop);
10182 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10183
10184 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10185 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10186 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10187
46ab3289 10188 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10189
a0ed51b3 10190 PL_reg_start_tmp = 0;
a0ed51b3 10191 PL_reg_start_tmpl = 0;
c445ea15 10192 PL_reg_oldsaved = NULL;
a5db57d6 10193 PL_reg_oldsavedlen = 0;
a5db57d6 10194 PL_reg_maxiter = 0;
a5db57d6 10195 PL_reg_leftiter = 0;
c445ea15 10196 PL_reg_poscache = NULL;
a5db57d6 10197 PL_reg_poscache_size = 0;
1ade1aa1
NC
10198#ifdef PERL_OLD_COPY_ON_WRITE
10199 PL_nrs = NULL;
10200#endif
ada6e8a9 10201
c445ea15
AL
10202 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10203 if (PL_curpm) {
10204 const REGEXP * const rx = PM_GETRE(PL_curpm);
10205 if (rx) {
1df70142 10206 U32 i;
07bc277f 10207 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10208 char digits[TYPE_CHARS(long)];
d9fad198 10209 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10210 GV *const *const gvp
10211 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10212
b37c2d43
AL
10213 if (gvp) {
10214 GV * const gv = *gvp;
10215 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10216 save_scalar(gv);
49f27e4b 10217 }
ada6e8a9
AMS
10218 }
10219 }
10220 }
a0ed51b3 10221}
76234dfb 10222#endif
51371543 10223
51371543 10224static void
acfe0abc 10225clear_re(pTHX_ void *r)
51371543 10226{
97aff369 10227 dVAR;
84679df5 10228 ReREFCNT_dec((REGEXP *)r);
51371543 10229}
ffbc6a93 10230
a28509cc
AL
10231#ifdef DEBUGGING
10232
10233STATIC void
10234S_put_byte(pTHX_ SV *sv, int c)
10235{
7918f24d
NC
10236 PERL_ARGS_ASSERT_PUT_BYTE;
10237
7fddd944
NC
10238 /* Our definition of isPRINT() ignores locales, so only bytes that are
10239 not part of UTF-8 are considered printable. I assume that the same
10240 holds for UTF-EBCDIC.
10241 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10242 which Wikipedia says:
10243
10244 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10245 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10246 identical, to the ASCII delete (DEL) or rubout control character.
10247 ) So the old condition can be simplified to !isPRINT(c) */
10248 if (!isPRINT(c))
a28509cc 10249 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 10250 else {
88c9ea1e 10251 const char string = c;
5e7aa789
NC
10252 if (c == '-' || c == ']' || c == '\\' || c == '^')
10253 sv_catpvs(sv, "\\");
10254 sv_catpvn(sv, &string, 1);
10255 }
a28509cc
AL
10256}
10257
786e8c11 10258
3dab1dad
YO
10259#define CLEAR_OPTSTART \
10260 if (optstart) STMT_START { \
70685ca0 10261 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10262 optstart=NULL; \
10263 } STMT_END
10264
786e8c11 10265#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10266
b5a2f8d8
NC
10267STATIC const regnode *
10268S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10269 const regnode *last, const regnode *plast,
10270 SV* sv, I32 indent, U32 depth)
a28509cc 10271{
97aff369 10272 dVAR;
786e8c11 10273 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10274 register const regnode *next;
3dab1dad 10275 const regnode *optstart= NULL;
1f1031fe 10276
f8fc2ecf 10277 RXi_GET_DECL(r,ri);
3dab1dad 10278 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10279
10280 PERL_ARGS_ASSERT_DUMPUNTIL;
10281
786e8c11
YO
10282#ifdef DEBUG_DUMPUNTIL
10283 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10284 last ? last-start : 0,plast ? plast-start : 0);
10285#endif
10286
10287 if (plast && plast < last)
10288 last= plast;
10289
10290 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10291 /* While that wasn't END last time... */
a28509cc
AL
10292 NODE_ALIGN(node);
10293 op = OP(node);
de734bd5 10294 if (op == CLOSE || op == WHILEM)
786e8c11 10295 indent--;
b5a2f8d8 10296 next = regnext((regnode *)node);
1f1031fe 10297
a28509cc 10298 /* Where, what. */
8e11feef 10299 if (OP(node) == OPTIMIZED) {
e68ec53f 10300 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10301 optstart = node;
3dab1dad 10302 else
8e11feef 10303 goto after_print;
3dab1dad
YO
10304 } else
10305 CLEAR_OPTSTART;
1f1031fe 10306
32fc9b6a 10307 regprop(r, sv, node);
a28509cc 10308 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10309 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10310
10311 if (OP(node) != OPTIMIZED) {
10312 if (next == NULL) /* Next ptr. */
10313 PerlIO_printf(Perl_debug_log, " (0)");
10314 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10315 PerlIO_printf(Perl_debug_log, " (FAIL)");
10316 else
10317 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10318 (void)PerlIO_putc(Perl_debug_log, '\n');
10319 }
10320
a28509cc
AL
10321 after_print:
10322 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10323 assert(next);
10324 {
10325 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10326 ? regnext((regnode *)next)
10327 : next);
be8e71aa
YO
10328 if (last && nnode > last)
10329 nnode = last;
786e8c11 10330 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10331 }
a28509cc
AL
10332 }
10333 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10334 assert(next);
786e8c11 10335 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10336 }
10337 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10338 const regnode *this_trie = node;
1de06328 10339 const char op = OP(node);
647f639f 10340 const U32 n = ARG(node);
1de06328 10341 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10342 (reg_ac_data *)ri->data->data[n] :
1de06328 10343 NULL;
3251b653
NC
10344 const reg_trie_data * const trie =
10345 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10346#ifdef DEBUGGING
502c6561 10347 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10348#endif
786e8c11 10349 const regnode *nextbranch= NULL;
a28509cc 10350 I32 word_idx;
76f68e9b 10351 sv_setpvs(sv, "");
786e8c11 10352 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10353 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10354
10355 PerlIO_printf(Perl_debug_log, "%*s%s ",
10356 (int)(2*(indent+3)), "",
10357 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10358 PL_colors[0], PL_colors[1],
10359 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10360 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10361 PERL_PV_PRETTY_LTGT
786e8c11
YO
10362 )
10363 : "???"
10364 );
10365 if (trie->jump) {
40d049e4 10366 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10367 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10368 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10369 if (dist) {
10370 if (!nextbranch)
24b23f37 10371 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10372 DUMPUNTIL(this_trie + dist, nextbranch);
10373 }
786e8c11
YO
10374 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10375 nextbranch= regnext((regnode *)nextbranch);
10376 } else {
10377 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10378 }
786e8c11
YO
10379 }
10380 if (last && next > last)
10381 node= last;
10382 else
10383 node= next;
a28509cc 10384 }
786e8c11
YO
10385 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10386 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10387 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10388 }
10389 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10390 assert(next);
786e8c11 10391 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10392 }
10393 else if ( op == PLUS || op == STAR) {
786e8c11 10394 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10395 }
10396 else if (op == ANYOF) {
10397 /* arglen 1 + class block */
10398 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10399 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10400 node = NEXTOPER(node);
10401 }
10402 else if (PL_regkind[(U8)op] == EXACT) {
10403 /* Literal string, where present. */
10404 node += NODE_SZ_STR(node) - 1;
10405 node = NEXTOPER(node);
10406 }
10407 else {
10408 node = NEXTOPER(node);
10409 node += regarglen[(U8)op];
10410 }
10411 if (op == CURLYX || op == OPEN)
786e8c11 10412 indent++;
a28509cc 10413 }
3dab1dad 10414 CLEAR_OPTSTART;
786e8c11 10415#ifdef DEBUG_DUMPUNTIL
70685ca0 10416 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10417#endif
1de06328 10418 return node;
a28509cc
AL
10419}
10420
10421#endif /* DEBUGGING */
10422
241d1a3b
NC
10423/*
10424 * Local variables:
10425 * c-indentation-style: bsd
10426 * c-basic-offset: 4
10427 * indent-tabs-mode: t
10428 * End:
10429 *
37442d52
RGS
10430 * ex: set ts=8 sts=4 sw=4 noet:
10431 */