This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Explicitly close filehandle in ExtUtils::CBuilder::have_compiler().
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
8951c461 713 cl->flags |= ANYOF_FOLD;
653099ff
GS
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
653099ff
GS
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
771 int i;
772
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
776 else
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
1aa99e6b 782
8951c461
KW
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
785
3ff7ceb3 786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 787 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 788 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
789 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
790 only the one(s)
791 actually set */
1aa99e6b
IH
792 ARG_SET(cl, ARG(and_with));
793 }
14ebb1a2
JH
794 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
795 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 796 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 797 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 798 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 799 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
800}
801
802/* 'OR' a given class with another one. Can create false positives */
803/* We assume that cl is not inverted */
804STATIC void
097eb12c 805S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 806{
7918f24d
NC
807 PERL_ARGS_ASSERT_CL_OR;
808
653099ff
GS
809 if (or_with->flags & ANYOF_INVERT) {
810 /* We do not use
811 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
812 * <= (B1 | !B2) | (CL1 | !CL2)
813 * which is wasteful if CL2 is small, but we ignore CL2:
814 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
815 * XXXX Can we handle case-fold? Unclear:
816 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
817 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
818 */
819 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
820 && !(or_with->flags & ANYOF_FOLD)
821 && !(cl->flags & ANYOF_FOLD) ) {
822 int i;
823
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= ~or_with->bitmap[i];
826 } /* XXXX: logic is complicated otherwise */
827 else {
830247a4 828 cl_anything(pRExC_state, cl);
653099ff
GS
829 }
830 } else {
831 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
832 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 833 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
834 || (cl->flags & ANYOF_FOLD)) ) {
835 int i;
836
837 /* OR char bitmap and class bitmap separately */
838 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
839 cl->bitmap[i] |= or_with->bitmap[i];
840 if (or_with->flags & ANYOF_CLASS) {
841 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
842 cl->classflags[i] |= or_with->classflags[i];
843 cl->flags |= ANYOF_CLASS;
844 }
845 }
846 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 847 cl_anything(pRExC_state, cl);
653099ff
GS
848 }
849 }
850 if (or_with->flags & ANYOF_EOS)
851 cl->flags |= ANYOF_EOS;
1aa99e6b 852
8951c461
KW
853 if (or_with->flags & ANYOF_FOLD)
854 cl->flags |= ANYOF_FOLD;
855
9826f543
KW
856 /* If both nodes match something outside the bitmap, but what they match
857 * outside is not the same pointer, and hence not easily compared, give up
858 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 859 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
860 ARG(cl) != ARG(or_with)) {
861 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 862 }
9826f543 863
1aa99e6b
IH
864 if (or_with->flags & ANYOF_UNICODE_ALL) {
865 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 866 }
653099ff
GS
867}
868
a3621e74
YO
869#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
870#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
871#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
872#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
873
3dab1dad
YO
874
875#ifdef DEBUGGING
07be1b83 876/*
2b8b4781
NC
877 dump_trie(trie,widecharmap,revcharmap)
878 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
879 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
880
881 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
882 The _interim_ variants are used for debugging the interim
883 tables that are used to generate the final compressed
884 representation which is what dump_trie expects.
885
3dab1dad
YO
886 Part of the reason for their existance is to provide a form
887 of documentation as to how the different representations function.
07be1b83
YO
888
889*/
3dab1dad
YO
890
891/*
3dab1dad
YO
892 Dumps the final compressed table form of the trie to Perl_debug_log.
893 Used for debugging make_trie().
894*/
b9a59e08 895
3dab1dad 896STATIC void
2b8b4781
NC
897S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
898 AV *revcharmap, U32 depth)
3dab1dad
YO
899{
900 U32 state;
ab3bbdeb 901 SV *sv=sv_newmortal();
55eed653 902 int colwidth= widecharmap ? 6 : 4;
2e64971a 903 U16 word;
3dab1dad
YO
904 GET_RE_DEBUG_FLAGS_DECL;
905
7918f24d 906 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 907
3dab1dad
YO
908 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
909 (int)depth * 2 + 2,"",
910 "Match","Base","Ofs" );
911
912 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 913 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 914 if ( tmp ) {
ab3bbdeb
YO
915 PerlIO_printf( Perl_debug_log, "%*s",
916 colwidth,
ddc5bc0f 917 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
918 PL_colors[0], PL_colors[1],
919 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
920 PERL_PV_ESCAPE_FIRSTCHAR
921 )
922 );
3dab1dad
YO
923 }
924 }
925 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
926 (int)depth * 2 + 2,"");
927
928 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 929 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
930 PerlIO_printf( Perl_debug_log, "\n");
931
1e2e3d02 932 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 933 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
934
935 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
936
937 if ( trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
939 } else {
940 PerlIO_printf( Perl_debug_log, "%6s", "" );
941 }
942
943 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
944
945 if ( base ) {
946 U32 ofs = 0;
947
948 while( ( base + ofs < trie->uniquecharcount ) ||
949 ( base + ofs - trie->uniquecharcount < trie->lasttrans
950 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
951 ofs++;
952
953 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
954
955 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
956 if ( ( base + ofs >= trie->uniquecharcount ) &&
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
958 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
959 {
ab3bbdeb
YO
960 PerlIO_printf( Perl_debug_log, "%*"UVXf,
961 colwidth,
3dab1dad
YO
962 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
963 } else {
ab3bbdeb 964 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
965 }
966 }
967
968 PerlIO_printf( Perl_debug_log, "]");
969
970 }
971 PerlIO_printf( Perl_debug_log, "\n" );
972 }
2e64971a
DM
973 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
974 for (word=1; word <= trie->wordcount; word++) {
975 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
976 (int)word, (int)(trie->wordinfo[word].prev),
977 (int)(trie->wordinfo[word].len));
978 }
979 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
980}
981/*
3dab1dad
YO
982 Dumps a fully constructed but uncompressed trie in list form.
983 List tries normally only are used for construction when the number of
984 possible chars (trie->uniquecharcount) is very high.
985 Used for debugging make_trie().
986*/
987STATIC void
55eed653 988S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
989 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 U32 depth)
3dab1dad
YO
991{
992 U32 state;
ab3bbdeb 993 SV *sv=sv_newmortal();
55eed653 994 int colwidth= widecharmap ? 6 : 4;
3dab1dad 995 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
996
997 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
998
3dab1dad 999 /* print out the table precompression. */
ab3bbdeb
YO
1000 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002 "------:-----+-----------------\n" );
3dab1dad
YO
1003
1004 for( state=1 ; state < next_alloc ; state ++ ) {
1005 U16 charid;
1006
ab3bbdeb 1007 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1008 (int)depth * 2 + 2,"", (UV)state );
1009 if ( ! trie->states[ state ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, "%5s| ","");
1011 } else {
1012 PerlIO_printf( Perl_debug_log, "W%4x| ",
1013 trie->states[ state ].wordnum
1014 );
1015 }
1016 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1017 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1018 if ( tmp ) {
1019 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1020 colwidth,
ddc5bc0f 1021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1022 PL_colors[0], PL_colors[1],
1023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024 PERL_PV_ESCAPE_FIRSTCHAR
1025 ) ,
1e2e3d02
YO
1026 TRIE_LIST_ITEM(state,charid).forid,
1027 (UV)TRIE_LIST_ITEM(state,charid).newstate
1028 );
1029 if (!(charid % 10))
664e119d
RGS
1030 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031 (int)((depth * 2) + 14), "");
1e2e3d02 1032 }
ab3bbdeb
YO
1033 }
1034 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1035 }
1036}
1037
1038/*
3dab1dad
YO
1039 Dumps a fully constructed but uncompressed trie in table form.
1040 This is the normal DFA style state transition table, with a few
1041 twists to facilitate compression later.
1042 Used for debugging make_trie().
1043*/
1044STATIC void
55eed653 1045S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1046 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1047 U32 depth)
3dab1dad
YO
1048{
1049 U32 state;
1050 U16 charid;
ab3bbdeb 1051 SV *sv=sv_newmortal();
55eed653 1052 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1053 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1054
1055 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1056
1057 /*
1058 print out the table precompression so that we can do a visual check
1059 that they are identical.
1060 */
1061
1062 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1063
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1065 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1066 if ( tmp ) {
ab3bbdeb
YO
1067 PerlIO_printf( Perl_debug_log, "%*s",
1068 colwidth,
ddc5bc0f 1069 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1070 PL_colors[0], PL_colors[1],
1071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072 PERL_PV_ESCAPE_FIRSTCHAR
1073 )
1074 );
3dab1dad
YO
1075 }
1076 }
1077
1078 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1079
1080 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1081 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1082 }
1083
1084 PerlIO_printf( Perl_debug_log, "\n" );
1085
1086 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1087
1088 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089 (int)depth * 2 + 2,"",
1090 (UV)TRIE_NODENUM( state ) );
1091
1092 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1093 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1094 if (v)
1095 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1096 else
1097 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1098 }
1099 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1101 } else {
1102 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1104 }
1105 }
07be1b83 1106}
3dab1dad
YO
1107
1108#endif
1109
2e64971a 1110
786e8c11
YO
1111/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112 startbranch: the first branch in the whole branch sequence
1113 first : start branch of sequence of branch-exact nodes.
1114 May be the same as startbranch
1115 last : Thing following the last branch.
1116 May be the same as tail.
1117 tail : item following the branch sequence
1118 count : words in the sequence
1119 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120 depth : indent depth
3dab1dad 1121
786e8c11 1122Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1123
786e8c11
YO
1124A trie is an N'ary tree where the branches are determined by digital
1125decomposition of the key. IE, at the root node you look up the 1st character and
1126follow that branch repeat until you find the end of the branches. Nodes can be
1127marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1128
786e8c11 1129 /he|she|his|hers/
72f13be8 1130
786e8c11
YO
1131would convert into the following structure. Numbers represent states, letters
1132following numbers represent valid transitions on the letter from that state, if
1133the number is in square brackets it represents an accepting state, otherwise it
1134will be in parenthesis.
07be1b83 1135
786e8c11
YO
1136 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1137 | |
1138 | (2)
1139 | |
1140 (1) +-i->(6)-+-s->[7]
1141 |
1142 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1143
786e8c11
YO
1144 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1145
1146This shows that when matching against the string 'hers' we will begin at state 1
1147read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150single traverse. We store a mapping from accepting to state to which word was
1151matched, and then when we have multiple possibilities we try to complete the
1152rest of the regex in the order in which they occured in the alternation.
1153
1154The only prior NFA like behaviour that would be changed by the TRIE support is
1155the silent ignoring of duplicate alternations which are of the form:
1156
1157 / (DUPE|DUPE) X? (?{ ... }) Y /x
1158
4b714af6 1159Thus EVAL blocks following a trie may be called a different number of times with
786e8c11
YO
1160and without the optimisation. With the optimisations dupes will be silently
1161ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162the following demonstrates:
1163
1164 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1165
1166which prints out 'word' three times, but
1167
1168 'words'=~/(word|word|word)(?{ print $1 })S/
1169
1170which doesnt print it out at all. This is due to other optimisations kicking in.
1171
1172Example of what happens on a structural level:
1173
1174The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1175
1176 1: CURLYM[1] {1,32767}(18)
1177 5: BRANCH(8)
1178 6: EXACT <ac>(16)
1179 8: BRANCH(11)
1180 9: EXACT <ad>(16)
1181 11: BRANCH(14)
1182 12: EXACT <ab>(16)
1183 16: SUCCEED(0)
1184 17: NOTHING(18)
1185 18: END(0)
1186
1187This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188and should turn into:
1189
1190 1: CURLYM[1] {1,32767}(18)
1191 5: TRIE(16)
1192 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1193 <ac>
1194 <ad>
1195 <ab>
1196 16: SUCCEED(0)
1197 17: NOTHING(18)
1198 18: END(0)
1199
1200Cases where tail != last would be like /(?foo|bar)baz/:
1201
1202 1: BRANCH(4)
1203 2: EXACT <foo>(8)
1204 4: BRANCH(7)
1205 5: EXACT <bar>(8)
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211and would end up looking like:
1212
1213 1: TRIE(8)
1214 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1215 <foo>
1216 <bar>
1217 7: TAIL(8)
1218 8: EXACT <baz>(10)
1219 10: END(0)
1220
1221 d = uvuni_to_utf8_flags(d, uv, 0);
1222
1223is the recommended Unicode-aware way of saying
1224
1225 *(d++) = uv;
1226*/
1227
1e2e3d02 1228#define TRIE_STORE_REVCHAR \
786e8c11 1229 STMT_START { \
73031816
NC
1230 if (UTF) { \
1231 SV *zlopp = newSV(2); \
88c9ea1e
CB
1232 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1233 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1234 SvCUR_set(zlopp, kapow - flrbbbbb); \
1235 SvPOK_on(zlopp); \
1236 SvUTF8_on(zlopp); \
1237 av_push(revcharmap, zlopp); \
1238 } else { \
6bdeddd2 1239 char ooooff = (char)uvc; \
73031816
NC
1240 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1241 } \
1242 } STMT_END
786e8c11
YO
1243
1244#define TRIE_READ_CHAR STMT_START { \
1245 wordlen++; \
1246 if ( UTF ) { \
1247 if ( folder ) { \
1248 if ( foldlen > 0 ) { \
1249 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 scan += len; \
1252 len = 0; \
1253 } else { \
1254 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1256 foldlen -= UNISKIP( uvc ); \
1257 scan = foldbuf + UNISKIP( uvc ); \
1258 } \
1259 } else { \
1260 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1261 } \
1262 } else { \
1263 uvc = (U32)*uc; \
1264 len = 1; \
1265 } \
1266} STMT_END
1267
1268
1269
1270#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1271 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1272 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1273 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1274 } \
1275 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1277 TRIE_LIST_CUR( state )++; \
1278} STMT_END
07be1b83 1279
786e8c11
YO
1280#define TRIE_LIST_NEW(state) STMT_START { \
1281 Newxz( trie->states[ state ].trans.list, \
1282 4, reg_trie_trans_le ); \
1283 TRIE_LIST_CUR( state ) = 1; \
1284 TRIE_LIST_LEN( state ) = 4; \
1285} STMT_END
07be1b83 1286
786e8c11
YO
1287#define TRIE_HANDLE_WORD(state) STMT_START { \
1288 U16 dupe= trie->states[ state ].wordnum; \
1289 regnode * const noper_next = regnext( noper ); \
1290 \
786e8c11
YO
1291 DEBUG_r({ \
1292 /* store the word for dumping */ \
1293 SV* tmp; \
1294 if (OP(noper) != NOTHING) \
740cce10 1295 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1296 else \
740cce10 1297 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1298 av_push( trie_words, tmp ); \
786e8c11
YO
1299 }); \
1300 \
1301 curword++; \
2e64971a
DM
1302 trie->wordinfo[curword].prev = 0; \
1303 trie->wordinfo[curword].len = wordlen; \
1304 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1305 \
1306 if ( noper_next < tail ) { \
1307 if (!trie->jump) \
c944940b 1308 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1309 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1310 if (!jumper) \
1311 jumper = noper_next; \
1312 if (!nextbranch) \
1313 nextbranch= regnext(cur); \
1314 } \
1315 \
1316 if ( dupe ) { \
2e64971a
DM
1317 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1318 /* chain, so that when the bits of chain are later */\
1319 /* linked together, the dups appear in the chain */\
1320 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1322 } else { \
1323 /* we haven't inserted this word yet. */ \
1324 trie->states[ state ].wordnum = curword; \
1325 } \
1326} STMT_END
07be1b83 1327
3dab1dad 1328
786e8c11
YO
1329#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1330 ( ( base + charid >= ucharcount \
1331 && base + charid < ubound \
1332 && state == trie->trans[ base - ucharcount + charid ].check \
1333 && trie->trans[ base - ucharcount + charid ].next ) \
1334 ? trie->trans[ base - ucharcount + charid ].next \
1335 : ( state==1 ? special : 0 ) \
1336 )
3dab1dad 1337
786e8c11
YO
1338#define MADE_TRIE 1
1339#define MADE_JUMP_TRIE 2
1340#define MADE_EXACT_TRIE 4
3dab1dad 1341
a3621e74 1342STATIC I32
786e8c11 1343S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1344{
27da23d5 1345 dVAR;
a3621e74
YO
1346 /* first pass, loop through and scan words */
1347 reg_trie_data *trie;
55eed653 1348 HV *widecharmap = NULL;
2b8b4781 1349 AV *revcharmap = newAV();
a3621e74 1350 regnode *cur;
9f7f3913 1351 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1352 STRLEN len = 0;
1353 UV uvc = 0;
1354 U16 curword = 0;
1355 U32 next_alloc = 0;
786e8c11
YO
1356 regnode *jumper = NULL;
1357 regnode *nextbranch = NULL;
7f69552c 1358 regnode *convert = NULL;
2e64971a 1359 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1360 /* we just use folder as a flag in utf8 */
1e696034 1361 const U8 * folder = NULL;
a3621e74 1362
2b8b4781
NC
1363#ifdef DEBUGGING
1364 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1365 AV *trie_words = NULL;
1366 /* along with revcharmap, this only used during construction but both are
1367 * useful during debugging so we store them in the struct when debugging.
8e11feef 1368 */
2b8b4781
NC
1369#else
1370 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1371 STRLEN trie_charcount=0;
3dab1dad 1372#endif
2b8b4781 1373 SV *re_trie_maxbuff;
a3621e74 1374 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1375
1376 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1377#ifndef DEBUGGING
1378 PERL_UNUSED_ARG(depth);
1379#endif
a3621e74 1380
1e696034
KW
1381 switch (flags) {
1382 case EXACTFU: folder = PL_fold_latin1; break;
1383 case EXACTF: folder = PL_fold; break;
1384 case EXACTFL: folder = PL_fold_locale; break;
1385 }
1386
c944940b 1387 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1388 trie->refcount = 1;
3dab1dad 1389 trie->startstate = 1;
786e8c11 1390 trie->wordcount = word_count;
f8fc2ecf 1391 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1392 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1393 if (!(UTF && folder))
c944940b 1394 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1395 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1397
a3621e74 1398 DEBUG_r({
2b8b4781 1399 trie_words = newAV();
a3621e74 1400 });
a3621e74 1401
0111c4fd 1402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1403 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1405 }
3dab1dad
YO
1406 DEBUG_OPTIMISE_r({
1407 PerlIO_printf( Perl_debug_log,
786e8c11 1408 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1409 (int)depth * 2 + 2, "",
1410 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1411 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1412 (int)depth);
3dab1dad 1413 });
7f69552c
YO
1414
1415 /* Find the node we are going to overwrite */
1416 if ( first == startbranch && OP( last ) != BRANCH ) {
1417 /* whole branch chain */
1418 convert = first;
1419 } else {
1420 /* branch sub-chain */
1421 convert = NEXTOPER( first );
1422 }
1423
a3621e74
YO
1424 /* -- First loop and Setup --
1425
1426 We first traverse the branches and scan each word to determine if it
1427 contains widechars, and how many unique chars there are, this is
1428 important as we have to build a table with at least as many columns as we
1429 have unique chars.
1430
1431 We use an array of integers to represent the character codes 0..255
38a44b82 1432 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1433 native representation of the character value as the key and IV's for the
1434 coded index.
1435
1436 *TODO* If we keep track of how many times each character is used we can
1437 remap the columns so that the table compression later on is more
3b753521 1438 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1439 middle and the least common are on the outside. IMO this would be better
1440 than a most to least common mapping as theres a decent chance the most
1441 common letter will share a node with the least common, meaning the node
1442 will not be compressable. With a middle is most common approach the worst
1443 case is when we have the least common nodes twice.
1444
1445 */
1446
a3621e74 1447 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1448 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1449 const U8 *uc = (U8*)STRING( noper );
a28509cc 1450 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1451 STRLEN foldlen = 0;
1452 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1453 const U8 *scan = (U8*)NULL;
07be1b83 1454 U32 wordlen = 0; /* required init */
02daf0ab
YO
1455 STRLEN chars = 0;
1456 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1457
3dab1dad
YO
1458 if (OP(noper) == NOTHING) {
1459 trie->minlen= 0;
1460 continue;
1461 }
02daf0ab
YO
1462 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464 regardless of encoding */
1465
a3621e74 1466 for ( ; uc < e ; uc += len ) {
3dab1dad 1467 TRIE_CHARCOUNT(trie)++;
a3621e74 1468 TRIE_READ_CHAR;
3dab1dad 1469 chars++;
a3621e74
YO
1470 if ( uvc < 256 ) {
1471 if ( !trie->charmap[ uvc ] ) {
1472 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1473 if ( folder )
1474 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1475 TRIE_STORE_REVCHAR;
a3621e74 1476 }
02daf0ab 1477 if ( set_bit ) {
62012aee
KW
1478 /* store the codepoint in the bitmap, and its folded
1479 * equivalent. */
02daf0ab 1480 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1481
1482 /* store the folded codepoint */
1483 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1484
1485 if ( !UTF ) {
1486 /* store first byte of utf8 representation of
acdf4139
KW
1487 variant codepoints */
1488 if (! UNI_IS_INVARIANT(uvc)) {
1489 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1490 }
1491 }
02daf0ab
YO
1492 set_bit = 0; /* We've done our bit :-) */
1493 }
a3621e74
YO
1494 } else {
1495 SV** svpp;
55eed653
NC
1496 if ( !widecharmap )
1497 widecharmap = newHV();
a3621e74 1498
55eed653 1499 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1500
1501 if ( !svpp )
e4584336 1502 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1503
1504 if ( !SvTRUE( *svpp ) ) {
1505 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1506 TRIE_STORE_REVCHAR;
a3621e74
YO
1507 }
1508 }
1509 }
3dab1dad
YO
1510 if( cur == first ) {
1511 trie->minlen=chars;
1512 trie->maxlen=chars;
1513 } else if (chars < trie->minlen) {
1514 trie->minlen=chars;
1515 } else if (chars > trie->maxlen) {
1516 trie->maxlen=chars;
1517 }
1518
a3621e74
YO
1519 } /* end first pass */
1520 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1521 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1522 (int)depth * 2 + 2,"",
55eed653 1523 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1524 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1525 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1526 );
a3621e74
YO
1527
1528 /*
1529 We now know what we are dealing with in terms of unique chars and
1530 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1531 representation using a flat table will take. If it's over a reasonable
1532 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1533 conservative but potentially much slower representation using an array
1534 of lists.
1535
1536 At the end we convert both representations into the same compressed
1537 form that will be used in regexec.c for matching with. The latter
1538 is a form that cannot be used to construct with but has memory
1539 properties similar to the list form and access properties similar
1540 to the table form making it both suitable for fast searches and
1541 small enough that its feasable to store for the duration of a program.
1542
1543 See the comment in the code where the compressed table is produced
1544 inplace from the flat tabe representation for an explanation of how
1545 the compression works.
1546
1547 */
1548
1549
2e64971a
DM
1550 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1551 prev_states[1] = 0;
1552
3dab1dad 1553 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1554 /*
1555 Second Pass -- Array Of Lists Representation
1556
1557 Each state will be represented by a list of charid:state records
1558 (reg_trie_trans_le) the first such element holds the CUR and LEN
1559 points of the allocated array. (See defines above).
1560
1561 We build the initial structure using the lists, and then convert
1562 it into the compressed table form which allows faster lookups
1563 (but cant be modified once converted).
a3621e74
YO
1564 */
1565
a3621e74
YO
1566 STRLEN transcount = 1;
1567
1e2e3d02
YO
1568 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1569 "%*sCompiling trie using list compiler\n",
1570 (int)depth * 2 + 2, ""));
446bd890 1571
c944940b
JH
1572 trie->states = (reg_trie_state *)
1573 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1574 sizeof(reg_trie_state) );
a3621e74
YO
1575 TRIE_LIST_NEW(1);
1576 next_alloc = 2;
1577
1578 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1579
c445ea15
AL
1580 regnode * const noper = NEXTOPER( cur );
1581 U8 *uc = (U8*)STRING( noper );
1582 const U8 * const e = uc + STR_LEN( noper );
1583 U32 state = 1; /* required init */
1584 U16 charid = 0; /* sanity init */
1585 U8 *scan = (U8*)NULL; /* sanity init */
1586 STRLEN foldlen = 0; /* required init */
07be1b83 1587 U32 wordlen = 0; /* required init */
c445ea15
AL
1588 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1589
3dab1dad 1590 if (OP(noper) != NOTHING) {
786e8c11 1591 for ( ; uc < e ; uc += len ) {
c445ea15 1592
786e8c11 1593 TRIE_READ_CHAR;
c445ea15 1594
786e8c11
YO
1595 if ( uvc < 256 ) {
1596 charid = trie->charmap[ uvc ];
c445ea15 1597 } else {
55eed653 1598 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1599 if ( !svpp ) {
1600 charid = 0;
1601 } else {
1602 charid=(U16)SvIV( *svpp );
1603 }
c445ea15 1604 }
786e8c11
YO
1605 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1606 if ( charid ) {
a3621e74 1607
786e8c11
YO
1608 U16 check;
1609 U32 newstate = 0;
a3621e74 1610
786e8c11
YO
1611 charid--;
1612 if ( !trie->states[ state ].trans.list ) {
1613 TRIE_LIST_NEW( state );
c445ea15 1614 }
786e8c11
YO
1615 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1616 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1617 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1618 break;
1619 }
1620 }
1621 if ( ! newstate ) {
1622 newstate = next_alloc++;
2e64971a 1623 prev_states[newstate] = state;
786e8c11
YO
1624 TRIE_LIST_PUSH( state, charid, newstate );
1625 transcount++;
1626 }
1627 state = newstate;
1628 } else {
1629 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1630 }
a28509cc 1631 }
c445ea15 1632 }
3dab1dad 1633 TRIE_HANDLE_WORD(state);
a3621e74
YO
1634
1635 } /* end second pass */
1636
1e2e3d02
YO
1637 /* next alloc is the NEXT state to be allocated */
1638 trie->statecount = next_alloc;
c944940b
JH
1639 trie->states = (reg_trie_state *)
1640 PerlMemShared_realloc( trie->states,
1641 next_alloc
1642 * sizeof(reg_trie_state) );
a3621e74 1643
3dab1dad 1644 /* and now dump it out before we compress it */
2b8b4781
NC
1645 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1646 revcharmap, next_alloc,
1647 depth+1)
1e2e3d02 1648 );
a3621e74 1649
c944940b
JH
1650 trie->trans = (reg_trie_trans *)
1651 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1652 {
1653 U32 state;
a3621e74
YO
1654 U32 tp = 0;
1655 U32 zp = 0;
1656
1657
1658 for( state=1 ; state < next_alloc ; state ++ ) {
1659 U32 base=0;
1660
1661 /*
1662 DEBUG_TRIE_COMPILE_MORE_r(
1663 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1664 );
1665 */
1666
1667 if (trie->states[state].trans.list) {
1668 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1669 U16 maxid=minid;
a28509cc 1670 U16 idx;
a3621e74
YO
1671
1672 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1673 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1674 if ( forid < minid ) {
1675 minid=forid;
1676 } else if ( forid > maxid ) {
1677 maxid=forid;
1678 }
a3621e74
YO
1679 }
1680 if ( transcount < tp + maxid - minid + 1) {
1681 transcount *= 2;
c944940b
JH
1682 trie->trans = (reg_trie_trans *)
1683 PerlMemShared_realloc( trie->trans,
446bd890
NC
1684 transcount
1685 * sizeof(reg_trie_trans) );
a3621e74
YO
1686 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1687 }
1688 base = trie->uniquecharcount + tp - minid;
1689 if ( maxid == minid ) {
1690 U32 set = 0;
1691 for ( ; zp < tp ; zp++ ) {
1692 if ( ! trie->trans[ zp ].next ) {
1693 base = trie->uniquecharcount + zp - minid;
1694 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1695 trie->trans[ zp ].check = state;
1696 set = 1;
1697 break;
1698 }
1699 }
1700 if ( !set ) {
1701 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1702 trie->trans[ tp ].check = state;
1703 tp++;
1704 zp = tp;
1705 }
1706 } else {
1707 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1708 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1709 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1710 trie->trans[ tid ].check = state;
1711 }
1712 tp += ( maxid - minid + 1 );
1713 }
1714 Safefree(trie->states[ state ].trans.list);
1715 }
1716 /*
1717 DEBUG_TRIE_COMPILE_MORE_r(
1718 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1719 );
1720 */
1721 trie->states[ state ].trans.base=base;
1722 }
cc601c31 1723 trie->lasttrans = tp + 1;
a3621e74
YO
1724 }
1725 } else {
1726 /*
1727 Second Pass -- Flat Table Representation.
1728
1729 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1730 We know that we will need Charcount+1 trans at most to store the data
1731 (one row per char at worst case) So we preallocate both structures
1732 assuming worst case.
1733
1734 We then construct the trie using only the .next slots of the entry
1735 structs.
1736
3b753521 1737 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1738 make compression both faster and easier by keeping track of how many non
1739 zero fields are in the node.
1740
1741 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1742 transition.
1743
1744 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1745 number representing the first entry of the node, and state as a
1746 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1747 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1748 are 2 entrys per node. eg:
1749
1750 A B A B
1751 1. 2 4 1. 3 7
1752 2. 0 3 3. 0 5
1753 3. 0 0 5. 0 0
1754 4. 0 0 7. 0 0
1755
1756 The table is internally in the right hand, idx form. However as we also
1757 have to deal with the states array which is indexed by nodenum we have to
1758 use TRIE_NODENUM() to convert.
1759
1760 */
1e2e3d02
YO
1761 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1762 "%*sCompiling trie using table compiler\n",
1763 (int)depth * 2 + 2, ""));
3dab1dad 1764
c944940b
JH
1765 trie->trans = (reg_trie_trans *)
1766 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1767 * trie->uniquecharcount + 1,
1768 sizeof(reg_trie_trans) );
1769 trie->states = (reg_trie_state *)
1770 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1771 sizeof(reg_trie_state) );
a3621e74
YO
1772 next_alloc = trie->uniquecharcount + 1;
1773
3dab1dad 1774
a3621e74
YO
1775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1776
c445ea15 1777 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1778 const U8 *uc = (U8*)STRING( noper );
1779 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1780
1781 U32 state = 1; /* required init */
1782
1783 U16 charid = 0; /* sanity init */
1784 U32 accept_state = 0; /* sanity init */
1785 U8 *scan = (U8*)NULL; /* sanity init */
1786
1787 STRLEN foldlen = 0; /* required init */
07be1b83 1788 U32 wordlen = 0; /* required init */
a3621e74
YO
1789 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1790
3dab1dad 1791 if ( OP(noper) != NOTHING ) {
786e8c11 1792 for ( ; uc < e ; uc += len ) {
a3621e74 1793
786e8c11 1794 TRIE_READ_CHAR;
a3621e74 1795
786e8c11
YO
1796 if ( uvc < 256 ) {
1797 charid = trie->charmap[ uvc ];
1798 } else {
55eed653 1799 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1800 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1801 }
786e8c11
YO
1802 if ( charid ) {
1803 charid--;
1804 if ( !trie->trans[ state + charid ].next ) {
1805 trie->trans[ state + charid ].next = next_alloc;
1806 trie->trans[ state ].check++;
2e64971a
DM
1807 prev_states[TRIE_NODENUM(next_alloc)]
1808 = TRIE_NODENUM(state);
786e8c11
YO
1809 next_alloc += trie->uniquecharcount;
1810 }
1811 state = trie->trans[ state + charid ].next;
1812 } else {
1813 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1814 }
1815 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1816 }
a3621e74 1817 }
3dab1dad
YO
1818 accept_state = TRIE_NODENUM( state );
1819 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1820
1821 } /* end second pass */
1822
3dab1dad 1823 /* and now dump it out before we compress it */
2b8b4781
NC
1824 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1825 revcharmap,
1826 next_alloc, depth+1));
a3621e74 1827
a3621e74
YO
1828 {
1829 /*
1830 * Inplace compress the table.*
1831
1832 For sparse data sets the table constructed by the trie algorithm will
1833 be mostly 0/FAIL transitions or to put it another way mostly empty.
1834 (Note that leaf nodes will not contain any transitions.)
1835
1836 This algorithm compresses the tables by eliminating most such
1837 transitions, at the cost of a modest bit of extra work during lookup:
1838
1839 - Each states[] entry contains a .base field which indicates the
1840 index in the state[] array wheres its transition data is stored.
1841
3b753521 1842 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1843
1844 - If .base is nonzero then charid is added to it to find an entry in
1845 the trans array.
1846
1847 -If trans[states[state].base+charid].check!=state then the
1848 transition is taken to be a 0/Fail transition. Thus if there are fail
1849 transitions at the front of the node then the .base offset will point
1850 somewhere inside the previous nodes data (or maybe even into a node
1851 even earlier), but the .check field determines if the transition is
1852 valid.
1853
786e8c11 1854 XXX - wrong maybe?
a3621e74 1855 The following process inplace converts the table to the compressed
3b753521 1856 table: We first do not compress the root node 1,and mark all its
a3621e74 1857 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1858 allows us to do a DFA construction from the compressed table later,
1859 and ensures that any .base pointers we calculate later are greater
1860 than 0.
a3621e74
YO
1861
1862 - We set 'pos' to indicate the first entry of the second node.
1863
1864 - We then iterate over the columns of the node, finding the first and
1865 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1866 and set the .check pointers accordingly, and advance pos
1867 appropriately and repreat for the next node. Note that when we copy
1868 the next pointers we have to convert them from the original
1869 NODEIDX form to NODENUM form as the former is not valid post
1870 compression.
1871
1872 - If a node has no transitions used we mark its base as 0 and do not
1873 advance the pos pointer.
1874
1875 - If a node only has one transition we use a second pointer into the
1876 structure to fill in allocated fail transitions from other states.
1877 This pointer is independent of the main pointer and scans forward
1878 looking for null transitions that are allocated to a state. When it
1879 finds one it writes the single transition into the "hole". If the
786e8c11 1880 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1881
1882 - Once compressed we can Renew/realloc the structures to release the
1883 excess space.
1884
1885 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1886 specifically Fig 3.47 and the associated pseudocode.
1887
1888 demq
1889 */
a3b680e6 1890 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1891 U32 state, charid;
a3621e74 1892 U32 pos = 0, zp=0;
1e2e3d02 1893 trie->statecount = laststate;
a3621e74
YO
1894
1895 for ( state = 1 ; state < laststate ; state++ ) {
1896 U8 flag = 0;
a28509cc
AL
1897 const U32 stateidx = TRIE_NODEIDX( state );
1898 const U32 o_used = trie->trans[ stateidx ].check;
1899 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1900 trie->trans[ stateidx ].check = 0;
1901
1902 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1903 if ( flag || trie->trans[ stateidx + charid ].next ) {
1904 if ( trie->trans[ stateidx + charid ].next ) {
1905 if (o_used == 1) {
1906 for ( ; zp < pos ; zp++ ) {
1907 if ( ! trie->trans[ zp ].next ) {
1908 break;
1909 }
1910 }
1911 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1912 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913 trie->trans[ zp ].check = state;
1914 if ( ++zp > pos ) pos = zp;
1915 break;
1916 }
1917 used--;
1918 }
1919 if ( !flag ) {
1920 flag = 1;
1921 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1922 }
1923 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1924 trie->trans[ pos ].check = state;
1925 pos++;
1926 }
1927 }
1928 }
cc601c31 1929 trie->lasttrans = pos + 1;
c944940b
JH
1930 trie->states = (reg_trie_state *)
1931 PerlMemShared_realloc( trie->states, laststate
1932 * sizeof(reg_trie_state) );
a3621e74 1933 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1934 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1935 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1936 (int)depth * 2 + 2,"",
1937 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1938 (IV)next_alloc,
1939 (IV)pos,
a3621e74
YO
1940 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1941 );
1942
1943 } /* end table compress */
1944 }
1e2e3d02
YO
1945 DEBUG_TRIE_COMPILE_MORE_r(
1946 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1947 (int)depth * 2 + 2, "",
1948 (UV)trie->statecount,
1949 (UV)trie->lasttrans)
1950 );
cc601c31 1951 /* resize the trans array to remove unused space */
c944940b
JH
1952 trie->trans = (reg_trie_trans *)
1953 PerlMemShared_realloc( trie->trans, trie->lasttrans
1954 * sizeof(reg_trie_trans) );
a3621e74 1955
3b753521 1956 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1957 U8 nodetype =(U8)(flags & 0xFF);
1958 char *str=NULL;
786e8c11 1959
07be1b83 1960#ifdef DEBUGGING
e62cc96a 1961 regnode *optimize = NULL;
7122b237
YO
1962#ifdef RE_TRACK_PATTERN_OFFSETS
1963
b57a0404
JH
1964 U32 mjd_offset = 0;
1965 U32 mjd_nodelen = 0;
7122b237
YO
1966#endif /* RE_TRACK_PATTERN_OFFSETS */
1967#endif /* DEBUGGING */
a3621e74 1968 /*
3dab1dad
YO
1969 This means we convert either the first branch or the first Exact,
1970 depending on whether the thing following (in 'last') is a branch
1971 or not and whther first is the startbranch (ie is it a sub part of
1972 the alternation or is it the whole thing.)
3b753521 1973 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1974 the whole branch sequence, including the first.
a3621e74 1975 */
3dab1dad 1976 /* Find the node we are going to overwrite */
7f69552c 1977 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1978 /* branch sub-chain */
3dab1dad 1979 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1980#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1981 DEBUG_r({
1982 mjd_offset= Node_Offset((convert));
1983 mjd_nodelen= Node_Length((convert));
1984 });
7122b237 1985#endif
7f69552c 1986 /* whole branch chain */
7122b237
YO
1987 }
1988#ifdef RE_TRACK_PATTERN_OFFSETS
1989 else {
7f69552c
YO
1990 DEBUG_r({
1991 const regnode *nop = NEXTOPER( convert );
1992 mjd_offset= Node_Offset((nop));
1993 mjd_nodelen= Node_Length((nop));
1994 });
07be1b83
YO
1995 }
1996 DEBUG_OPTIMISE_r(
1997 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1998 (int)depth * 2 + 2, "",
786e8c11 1999 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2000 );
7122b237 2001#endif
3dab1dad
YO
2002 /* But first we check to see if there is a common prefix we can
2003 split out as an EXACT and put in front of the TRIE node. */
2004 trie->startstate= 1;
55eed653 2005 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2006 U32 state;
1e2e3d02 2007 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2008 U32 ofs = 0;
8e11feef
RGS
2009 I32 idx = -1;
2010 U32 count = 0;
2011 const U32 base = trie->states[ state ].trans.base;
a3621e74 2012
3dab1dad 2013 if ( trie->states[state].wordnum )
8e11feef 2014 count = 1;
a3621e74 2015
8e11feef 2016 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2017 if ( ( base + ofs >= trie->uniquecharcount ) &&
2018 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2019 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2020 {
3dab1dad 2021 if ( ++count > 1 ) {
2b8b4781 2022 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2023 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2024 if ( state == 1 ) break;
3dab1dad
YO
2025 if ( count == 2 ) {
2026 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2027 DEBUG_OPTIMISE_r(
8e11feef
RGS
2028 PerlIO_printf(Perl_debug_log,
2029 "%*sNew Start State=%"UVuf" Class: [",
2030 (int)depth * 2 + 2, "",
786e8c11 2031 (UV)state));
be8e71aa 2032 if (idx >= 0) {
2b8b4781 2033 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2034 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2035
3dab1dad 2036 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2037 if ( folder )
2038 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2039 DEBUG_OPTIMISE_r(
f1f66076 2040 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2041 );
8e11feef
RGS
2042 }
2043 }
2044 TRIE_BITMAP_SET(trie,*ch);
2045 if ( folder )
2046 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2047 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2048 }
2049 idx = ofs;
2050 }
3dab1dad
YO
2051 }
2052 if ( count == 1 ) {
2b8b4781 2053 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2054 STRLEN len;
2055 char *ch = SvPV( *tmp, len );
de734bd5
A
2056 DEBUG_OPTIMISE_r({
2057 SV *sv=sv_newmortal();
8e11feef
RGS
2058 PerlIO_printf( Perl_debug_log,
2059 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2060 (int)depth * 2 + 2, "",
de734bd5
A
2061 (UV)state, (UV)idx,
2062 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2063 PL_colors[0], PL_colors[1],
2064 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2065 PERL_PV_ESCAPE_FIRSTCHAR
2066 )
2067 );
2068 });
3dab1dad
YO
2069 if ( state==1 ) {
2070 OP( convert ) = nodetype;
2071 str=STRING(convert);
2072 STR_LEN(convert)=0;
2073 }
c490c714
YO
2074 STR_LEN(convert) += len;
2075 while (len--)
de734bd5 2076 *str++ = *ch++;
8e11feef 2077 } else {
f9049ba1 2078#ifdef DEBUGGING
8e11feef
RGS
2079 if (state>1)
2080 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2081#endif
8e11feef
RGS
2082 break;
2083 }
2084 }
2e64971a 2085 trie->prefixlen = (state-1);
3dab1dad 2086 if (str) {
8e11feef 2087 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2088 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2089 trie->startstate = state;
07be1b83
YO
2090 trie->minlen -= (state - 1);
2091 trie->maxlen -= (state - 1);
33809eae
JH
2092#ifdef DEBUGGING
2093 /* At least the UNICOS C compiler choked on this
2094 * being argument to DEBUG_r(), so let's just have
2095 * it right here. */
2096 if (
2097#ifdef PERL_EXT_RE_BUILD
2098 1
2099#else
2100 DEBUG_r_TEST
2101#endif
2102 ) {
2103 regnode *fix = convert;
2104 U32 word = trie->wordcount;
2105 mjd_nodelen++;
2106 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2107 while( ++fix < n ) {
2108 Set_Node_Offset_Length(fix, 0, 0);
2109 }
2110 while (word--) {
2111 SV ** const tmp = av_fetch( trie_words, word, 0 );
2112 if (tmp) {
2113 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2114 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2115 else
2116 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2117 }
2118 }
2119 }
2120#endif
8e11feef
RGS
2121 if (trie->maxlen) {
2122 convert = n;
2123 } else {
3dab1dad 2124 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2125 DEBUG_r(optimize= n);
3dab1dad
YO
2126 }
2127 }
2128 }
a5ca303d
YO
2129 if (!jumper)
2130 jumper = last;
3dab1dad 2131 if ( trie->maxlen ) {
8e11feef
RGS
2132 NEXT_OFF( convert ) = (U16)(tail - convert);
2133 ARG_SET( convert, data_slot );
786e8c11
YO
2134 /* Store the offset to the first unabsorbed branch in
2135 jump[0], which is otherwise unused by the jump logic.
2136 We use this when dumping a trie and during optimisation. */
2137 if (trie->jump)
7f69552c 2138 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2139
6c48061a
YO
2140 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2141 * and there is a bitmap
2142 * and the first "jump target" node we found leaves enough room
2143 * then convert the TRIE node into a TRIEC node, with the bitmap
2144 * embedded inline in the opcode - this is hypothetically faster.
2145 */
2146 if ( !trie->states[trie->startstate].wordnum
2147 && trie->bitmap
2148 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2149 {
2150 OP( convert ) = TRIEC;
2151 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2152 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2153 trie->bitmap= NULL;
2154 } else
2155 OP( convert ) = TRIE;
a3621e74 2156
3dab1dad
YO
2157 /* store the type in the flags */
2158 convert->flags = nodetype;
a5ca303d
YO
2159 DEBUG_r({
2160 optimize = convert
2161 + NODE_STEP_REGNODE
2162 + regarglen[ OP( convert ) ];
2163 });
2164 /* XXX We really should free up the resource in trie now,
2165 as we won't use them - (which resources?) dmq */
3dab1dad 2166 }
a3621e74 2167 /* needed for dumping*/
e62cc96a 2168 DEBUG_r(if (optimize) {
07be1b83 2169 regnode *opt = convert;
bcdf7404 2170
e62cc96a 2171 while ( ++opt < optimize) {
07be1b83
YO
2172 Set_Node_Offset_Length(opt,0,0);
2173 }
786e8c11
YO
2174 /*
2175 Try to clean up some of the debris left after the
2176 optimisation.
a3621e74 2177 */
786e8c11 2178 while( optimize < jumper ) {
07be1b83 2179 mjd_nodelen += Node_Length((optimize));
a3621e74 2180 OP( optimize ) = OPTIMIZED;
07be1b83 2181 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2182 optimize++;
2183 }
07be1b83 2184 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2185 });
2186 } /* end node insert */
2e64971a
DM
2187
2188 /* Finish populating the prev field of the wordinfo array. Walk back
2189 * from each accept state until we find another accept state, and if
2190 * so, point the first word's .prev field at the second word. If the
2191 * second already has a .prev field set, stop now. This will be the
2192 * case either if we've already processed that word's accept state,
3b753521
FN
2193 * or that state had multiple words, and the overspill words were
2194 * already linked up earlier.
2e64971a
DM
2195 */
2196 {
2197 U16 word;
2198 U32 state;
2199 U16 prev;
2200
2201 for (word=1; word <= trie->wordcount; word++) {
2202 prev = 0;
2203 if (trie->wordinfo[word].prev)
2204 continue;
2205 state = trie->wordinfo[word].accept;
2206 while (state) {
2207 state = prev_states[state];
2208 if (!state)
2209 break;
2210 prev = trie->states[state].wordnum;
2211 if (prev)
2212 break;
2213 }
2214 trie->wordinfo[word].prev = prev;
2215 }
2216 Safefree(prev_states);
2217 }
2218
2219
2220 /* and now dump out the compressed format */
2221 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2222
55eed653 2223 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2224#ifdef DEBUGGING
2225 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2226 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2227#else
2228 SvREFCNT_dec(revcharmap);
07be1b83 2229#endif
786e8c11
YO
2230 return trie->jump
2231 ? MADE_JUMP_TRIE
2232 : trie->startstate>1
2233 ? MADE_EXACT_TRIE
2234 : MADE_TRIE;
2235}
2236
2237STATIC void
2238S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2239{
3b753521 2240/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2241
2242 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2243 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2244 ISBN 0-201-10088-6
2245
2246 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2247 suffix of the current state's 'word' that is also a proper prefix of another word in our
2248 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2249 the DFA not to have to restart after its tried and failed a word at a given point, it
2250 simply continues as though it had been matching the other word in the first place.
2251 Consider
2252 'abcdgu'=~/abcdefg|cdgu/
2253 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2254 fail, which would bring us to the state representing 'd' in the second word where we would
2255 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2256 */
2257 /* add a fail transition */
3251b653
NC
2258 const U32 trie_offset = ARG(source);
2259 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2260 U32 *q;
2261 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2262 const U32 numstates = trie->statecount;
786e8c11
YO
2263 const U32 ubound = trie->lasttrans + ucharcount;
2264 U32 q_read = 0;
2265 U32 q_write = 0;
2266 U32 charid;
2267 U32 base = trie->states[ 1 ].trans.base;
2268 U32 *fail;
2269 reg_ac_data *aho;
2270 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2271 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2272
2273 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2274#ifndef DEBUGGING
2275 PERL_UNUSED_ARG(depth);
2276#endif
2277
2278
2279 ARG_SET( stclass, data_slot );
c944940b 2280 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2281 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2282 aho->trie=trie_offset;
446bd890
NC
2283 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2284 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2285 Newxz( q, numstates, U32);
c944940b 2286 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2287 aho->refcount = 1;
2288 fail = aho->fail;
2289 /* initialize fail[0..1] to be 1 so that we always have
2290 a valid final fail state */
2291 fail[ 0 ] = fail[ 1 ] = 1;
2292
2293 for ( charid = 0; charid < ucharcount ; charid++ ) {
2294 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2295 if ( newstate ) {
2296 q[ q_write ] = newstate;
2297 /* set to point at the root */
2298 fail[ q[ q_write++ ] ]=1;
2299 }
2300 }
2301 while ( q_read < q_write) {
2302 const U32 cur = q[ q_read++ % numstates ];
2303 base = trie->states[ cur ].trans.base;
2304
2305 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2306 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2307 if (ch_state) {
2308 U32 fail_state = cur;
2309 U32 fail_base;
2310 do {
2311 fail_state = fail[ fail_state ];
2312 fail_base = aho->states[ fail_state ].trans.base;
2313 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2314
2315 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2316 fail[ ch_state ] = fail_state;
2317 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2318 {
2319 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2320 }
2321 q[ q_write++ % numstates] = ch_state;
2322 }
2323 }
2324 }
2325 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2326 when we fail in state 1, this allows us to use the
2327 charclass scan to find a valid start char. This is based on the principle
2328 that theres a good chance the string being searched contains lots of stuff
2329 that cant be a start char.
2330 */
2331 fail[ 0 ] = fail[ 1 ] = 0;
2332 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sStclass Failtable (%"UVuf" states): 0",
2335 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2336 );
786e8c11
YO
2337 for( q_read=1; q_read<numstates; q_read++ ) {
2338 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2339 }
2340 PerlIO_printf(Perl_debug_log, "\n");
2341 });
2342 Safefree(q);
2343 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2344}
2345
786e8c11 2346
a3621e74 2347/*
5d1c421c
JH
2348 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2349 * These need to be revisited when a newer toolchain becomes available.
2350 */
2351#if defined(__sparc64__) && defined(__GNUC__)
2352# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2353# undef SPARC64_GCC_WORKAROUND
2354# define SPARC64_GCC_WORKAROUND 1
2355# endif
2356#endif
2357
07be1b83 2358#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2359 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2360 SV * const mysv=sv_newmortal(); \
2361 regnode *Next = regnext(scan); \
2362 regprop(RExC_rx, mysv, scan); \
7f69552c 2363 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2364 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2365 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2366 }});
07be1b83 2367
1de06328
YO
2368
2369
2370
2371
07be1b83
YO
2372#define JOIN_EXACT(scan,min,flags) \
2373 if (PL_regkind[OP(scan)] == EXACT) \
2374 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2375
be8e71aa 2376STATIC U32
07be1b83
YO
2377S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2378 /* Merge several consecutive EXACTish nodes into one. */
2379 regnode *n = regnext(scan);
2380 U32 stringok = 1;
2381 regnode *next = scan + NODE_SZ_STR(scan);
2382 U32 merged = 0;
2383 U32 stopnow = 0;
2384#ifdef DEBUGGING
2385 regnode *stop = scan;
72f13be8 2386 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2387#else
d47053eb
RGS
2388 PERL_UNUSED_ARG(depth);
2389#endif
7918f24d
NC
2390
2391 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2392#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2393 PERL_UNUSED_ARG(flags);
2394 PERL_UNUSED_ARG(val);
07be1b83 2395#endif
07be1b83
YO
2396 DEBUG_PEEP("join",scan,depth);
2397
2398 /* Skip NOTHING, merge EXACT*. */
2399 while (n &&
2400 ( PL_regkind[OP(n)] == NOTHING ||
2401 (stringok && (OP(n) == OP(scan))))
2402 && NEXT_OFF(n)
2403 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2404
2405 if (OP(n) == TAIL || n > next)
2406 stringok = 0;
2407 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2408 DEBUG_PEEP("skip:",n,depth);
2409 NEXT_OFF(scan) += NEXT_OFF(n);
2410 next = n + NODE_STEP_REGNODE;
2411#ifdef DEBUGGING
2412 if (stringok)
2413 stop = n;
2414#endif
2415 n = regnext(n);
2416 }
2417 else if (stringok) {
786e8c11 2418 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2419 regnode * const nnext = regnext(n);
2420
2421 DEBUG_PEEP("merg",n,depth);
2422
2423 merged++;
2424 if (oldl + STR_LEN(n) > U8_MAX)
2425 break;
2426 NEXT_OFF(scan) += NEXT_OFF(n);
2427 STR_LEN(scan) += STR_LEN(n);
2428 next = n + NODE_SZ_STR(n);
2429 /* Now we can overwrite *n : */
2430 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2431#ifdef DEBUGGING
2432 stop = next - 1;
2433#endif
2434 n = nnext;
2435 if (stopnow) break;
2436 }
2437
d47053eb
RGS
2438#ifdef EXPERIMENTAL_INPLACESCAN
2439 if (flags && !NEXT_OFF(n)) {
2440 DEBUG_PEEP("atch", val, depth);
2441 if (reg_off_by_arg[OP(n)]) {
2442 ARG_SET(n, val - n);
2443 }
2444 else {
2445 NEXT_OFF(n) = val - n;
2446 }
2447 stopnow = 1;
2448 }
07be1b83
YO
2449#endif
2450 }
ced7f090
KW
2451#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2452#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2453#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2454#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2455
2456 if (UTF
2457 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2458 && ( STR_LEN(scan) >= 6 ) )
2459 {
07be1b83
YO
2460 /*
2461 Two problematic code points in Unicode casefolding of EXACT nodes:
2462
2463 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2464 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2465
2466 which casefold to
2467
2468 Unicode UTF-8
2469
2470 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2471 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2472
2473 This means that in case-insensitive matching (or "loose matching",
2474 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2475 length of the above casefolded versions) can match a target string
2476 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2477 This would rather mess up the minimum length computation.
2478
2479 What we'll do is to look for the tail four bytes, and then peek
2480 at the preceding two bytes to see whether we need to decrease
2481 the minimum length by four (six minus two).
2482
2483 Thanks to the design of UTF-8, there cannot be false matches:
2484 A sequence of valid UTF-8 bytes cannot be a subsequence of
2485 another valid sequence of UTF-8 bytes.
2486
2487 */
2488 char * const s0 = STRING(scan), *s, *t;
2489 char * const s1 = s0 + STR_LEN(scan) - 1;
2490 char * const s2 = s1 - 4;
e294cc5d
JH
2491#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2492 const char t0[] = "\xaf\x49\xaf\x42";
2493#else
07be1b83 2494 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2495#endif
07be1b83
YO
2496 const char * const t1 = t0 + 3;
2497
2498 for (s = s0 + 2;
2499 s < s2 && (t = ninstr(s, s1, t0, t1));
2500 s = t + 4) {
e294cc5d
JH
2501#ifdef EBCDIC
2502 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2503 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2504#else
07be1b83
YO
2505 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2506 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2507#endif
07be1b83
YO
2508 *min -= 4;
2509 }
2510 }
2511
2512#ifdef DEBUGGING
2513 /* Allow dumping */
2514 n = scan + NODE_SZ_STR(scan);
2515 while (n <= stop) {
2516 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2517 OP(n) = OPTIMIZED;
2518 NEXT_OFF(n) = 0;
2519 }
2520 n++;
2521 }
2522#endif
2523 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2524 return stopnow;
2525}
2526
653099ff
GS
2527/* REx optimizer. Converts nodes into quickier variants "in place".
2528 Finds fixed substrings. */
2529
a0288114 2530/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2531 to the position after last scanned or to NULL. */
2532
40d049e4
YO
2533#define INIT_AND_WITHP \
2534 assert(!and_withp); \
2535 Newx(and_withp,1,struct regnode_charclass_class); \
2536 SAVEFREEPV(and_withp)
07be1b83 2537
b515a41d
YO
2538/* this is a chain of data about sub patterns we are processing that
2539 need to be handled seperately/specially in study_chunk. Its so
2540 we can simulate recursion without losing state. */
2541struct scan_frame;
2542typedef struct scan_frame {
2543 regnode *last; /* last node to process in this frame */
2544 regnode *next; /* next node to process when last is reached */
2545 struct scan_frame *prev; /*previous frame*/
2546 I32 stop; /* what stopparen do we use */
2547} scan_frame;
2548
304ee84b
YO
2549
2550#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2551
e1d1eefb
YO
2552#define CASE_SYNST_FNC(nAmE) \
2553case nAmE: \
2554 if (flags & SCF_DO_STCLASS_AND) { \
2555 for (value = 0; value < 256; value++) \
2556 if (!is_ ## nAmE ## _cp(value)) \
2557 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2558 } \
2559 else { \
2560 for (value = 0; value < 256; value++) \
2561 if (is_ ## nAmE ## _cp(value)) \
2562 ANYOF_BITMAP_SET(data->start_class, value); \
2563 } \
2564 break; \
2565case N ## nAmE: \
2566 if (flags & SCF_DO_STCLASS_AND) { \
2567 for (value = 0; value < 256; value++) \
2568 if (is_ ## nAmE ## _cp(value)) \
2569 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2570 } \
2571 else { \
2572 for (value = 0; value < 256; value++) \
2573 if (!is_ ## nAmE ## _cp(value)) \
2574 ANYOF_BITMAP_SET(data->start_class, value); \
2575 } \
2576 break
2577
2578
2579
76e3520e 2580STATIC I32
40d049e4 2581S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2582 I32 *minlenp, I32 *deltap,
40d049e4
YO
2583 regnode *last,
2584 scan_data_t *data,
2585 I32 stopparen,
2586 U8* recursed,
2587 struct regnode_charclass_class *and_withp,
2588 U32 flags, U32 depth)
c277df42
IZ
2589 /* scanp: Start here (read-write). */
2590 /* deltap: Write maxlen-minlen here. */
2591 /* last: Stop before this one. */
40d049e4
YO
2592 /* data: string data about the pattern */
2593 /* stopparen: treat close N as END */
2594 /* recursed: which subroutines have we recursed into */
2595 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2596{
97aff369 2597 dVAR;
c277df42
IZ
2598 I32 min = 0, pars = 0, code;
2599 regnode *scan = *scanp, *next;
2600 I32 delta = 0;
2601 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2602 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2603 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2604 scan_data_t data_fake;
a3621e74 2605 SV *re_trie_maxbuff = NULL;
786e8c11 2606 regnode *first_non_open = scan;
e2e6a0f1 2607 I32 stopmin = I32_MAX;
8aa23a47 2608 scan_frame *frame = NULL;
a3621e74 2609 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2610
7918f24d
NC
2611 PERL_ARGS_ASSERT_STUDY_CHUNK;
2612
13a24bad 2613#ifdef DEBUGGING
40d049e4 2614 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2615#endif
40d049e4 2616
786e8c11 2617 if ( depth == 0 ) {
40d049e4 2618 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2619 first_non_open=regnext(first_non_open);
2620 }
2621
b81d288d 2622
8aa23a47
YO
2623 fake_study_recurse:
2624 while ( scan && OP(scan) != END && scan < last ){
2625 /* Peephole optimizer: */
304ee84b 2626 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2627 DEBUG_PEEP("Peep",scan,depth);
2628 JOIN_EXACT(scan,&min,0);
2629
2630 /* Follow the next-chain of the current node and optimize
2631 away all the NOTHINGs from it. */
2632 if (OP(scan) != CURLYX) {
2633 const int max = (reg_off_by_arg[OP(scan)]
2634 ? I32_MAX
2635 /* I32 may be smaller than U16 on CRAYs! */
2636 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2637 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2638 int noff;
2639 regnode *n = scan;
2640
2641 /* Skip NOTHING and LONGJMP. */
2642 while ((n = regnext(n))
2643 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2644 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2645 && off + noff < max)
2646 off += noff;
2647 if (reg_off_by_arg[OP(scan)])
2648 ARG(scan) = off;
2649 else
2650 NEXT_OFF(scan) = off;
2651 }
a3621e74 2652
c277df42 2653
8aa23a47
YO
2654
2655 /* The principal pseudo-switch. Cannot be a switch, since we
2656 look into several different things. */
2657 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2658 || OP(scan) == IFTHEN) {
2659 next = regnext(scan);
2660 code = OP(scan);
2661 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2662
2663 if (OP(next) == code || code == IFTHEN) {
2664 /* NOTE - There is similar code to this block below for handling
2665 TRIE nodes on a re-study. If you change stuff here check there
2666 too. */
2667 I32 max1 = 0, min1 = I32_MAX, num = 0;
2668 struct regnode_charclass_class accum;
2669 regnode * const startbranch=scan;
2670
2671 if (flags & SCF_DO_SUBSTR)
304ee84b 2672 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2673 if (flags & SCF_DO_STCLASS)
2674 cl_init_zero(pRExC_state, &accum);
2675
2676 while (OP(scan) == code) {
2677 I32 deltanext, minnext, f = 0, fake;
2678 struct regnode_charclass_class this_class;
2679
2680 num++;
2681 data_fake.flags = 0;
2682 if (data) {
2683 data_fake.whilem_c = data->whilem_c;
2684 data_fake.last_closep = data->last_closep;
2685 }
2686 else
2687 data_fake.last_closep = &fake;
58e23c8d
YO
2688
2689 data_fake.pos_delta = delta;
8aa23a47
YO
2690 next = regnext(scan);
2691 scan = NEXTOPER(scan);
2692 if (code != BRANCH)
c277df42 2693 scan = NEXTOPER(scan);
8aa23a47
YO
2694 if (flags & SCF_DO_STCLASS) {
2695 cl_init(pRExC_state, &this_class);
2696 data_fake.start_class = &this_class;
2697 f = SCF_DO_STCLASS_AND;
58e23c8d 2698 }
8aa23a47
YO
2699 if (flags & SCF_WHILEM_VISITED_POS)
2700 f |= SCF_WHILEM_VISITED_POS;
2701
2702 /* we suppose the run is continuous, last=next...*/
2703 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2704 next, &data_fake,
2705 stopparen, recursed, NULL, f,depth+1);
2706 if (min1 > minnext)
2707 min1 = minnext;
2708 if (max1 < minnext + deltanext)
2709 max1 = minnext + deltanext;
2710 if (deltanext == I32_MAX)
2711 is_inf = is_inf_internal = 1;
2712 scan = next;
2713 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2714 pars++;
2715 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2716 if ( stopmin > minnext)
2717 stopmin = min + min1;
2718 flags &= ~SCF_DO_SUBSTR;
2719 if (data)
2720 data->flags |= SCF_SEEN_ACCEPT;
2721 }
2722 if (data) {
2723 if (data_fake.flags & SF_HAS_EVAL)
2724 data->flags |= SF_HAS_EVAL;
2725 data->whilem_c = data_fake.whilem_c;
3dab1dad 2726 }
8aa23a47
YO
2727 if (flags & SCF_DO_STCLASS)
2728 cl_or(pRExC_state, &accum, &this_class);
2729 }
2730 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2731 min1 = 0;
2732 if (flags & SCF_DO_SUBSTR) {
2733 data->pos_min += min1;
2734 data->pos_delta += max1 - min1;
2735 if (max1 != min1 || is_inf)
2736 data->longest = &(data->longest_float);
2737 }
2738 min += min1;
2739 delta += max1 - min1;
2740 if (flags & SCF_DO_STCLASS_OR) {
2741 cl_or(pRExC_state, data->start_class, &accum);
2742 if (min1) {
2743 cl_and(data->start_class, and_withp);
2744 flags &= ~SCF_DO_STCLASS;
653099ff 2745 }
8aa23a47
YO
2746 }
2747 else if (flags & SCF_DO_STCLASS_AND) {
2748 if (min1) {
2749 cl_and(data->start_class, &accum);
2750 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2751 }
8aa23a47
YO
2752 else {
2753 /* Switch to OR mode: cache the old value of
2754 * data->start_class */
2755 INIT_AND_WITHP;
2756 StructCopy(data->start_class, and_withp,
2757 struct regnode_charclass_class);
2758 flags &= ~SCF_DO_STCLASS_AND;
2759 StructCopy(&accum, data->start_class,
2760 struct regnode_charclass_class);
2761 flags |= SCF_DO_STCLASS_OR;
2762 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2763 }
8aa23a47 2764 }
a3621e74 2765
8aa23a47
YO
2766 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2767 /* demq.
a3621e74 2768
8aa23a47
YO
2769 Assuming this was/is a branch we are dealing with: 'scan' now
2770 points at the item that follows the branch sequence, whatever
2771 it is. We now start at the beginning of the sequence and look
2772 for subsequences of
a3621e74 2773
8aa23a47
YO
2774 BRANCH->EXACT=>x1
2775 BRANCH->EXACT=>x2
2776 tail
a3621e74 2777
8aa23a47 2778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2779
8aa23a47
YO
2780 If we can find such a subseqence we need to turn the first
2781 element into a trie and then add the subsequent branch exact
2782 strings to the trie.
a3621e74 2783
8aa23a47 2784 We have two cases
a3621e74 2785
3b753521 2786 1. patterns where the whole set of branches can be converted.
a3621e74 2787
8aa23a47 2788 2. patterns where only a subset can be converted.
a3621e74 2789
8aa23a47
YO
2790 In case 1 we can replace the whole set with a single regop
2791 for the trie. In case 2 we need to keep the start and end
3b753521 2792 branches so
a3621e74 2793
8aa23a47
YO
2794 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2795 becomes BRANCH TRIE; BRANCH X;
786e8c11 2796
8aa23a47
YO
2797 There is an additional case, that being where there is a
2798 common prefix, which gets split out into an EXACT like node
2799 preceding the TRIE node.
a3621e74 2800
8aa23a47
YO
2801 If x(1..n)==tail then we can do a simple trie, if not we make
2802 a "jump" trie, such that when we match the appropriate word
2803 we "jump" to the appopriate tail node. Essentailly we turn
2804 a nested if into a case structure of sorts.
b515a41d 2805
8aa23a47
YO
2806 */
2807
2808 int made=0;
2809 if (!re_trie_maxbuff) {
2810 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2811 if (!SvIOK(re_trie_maxbuff))
2812 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2813 }
2814 if ( SvIV(re_trie_maxbuff)>=0 ) {
2815 regnode *cur;
2816 regnode *first = (regnode *)NULL;
2817 regnode *last = (regnode *)NULL;
2818 regnode *tail = scan;
2819 U8 optype = 0;
2820 U32 count=0;
a3621e74
YO
2821
2822#ifdef DEBUGGING
8aa23a47 2823 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2824#endif
8aa23a47
YO
2825 /* var tail is used because there may be a TAIL
2826 regop in the way. Ie, the exacts will point to the
2827 thing following the TAIL, but the last branch will
2828 point at the TAIL. So we advance tail. If we
2829 have nested (?:) we may have to move through several
2830 tails.
2831 */
2832
2833 while ( OP( tail ) == TAIL ) {
2834 /* this is the TAIL generated by (?:) */
2835 tail = regnext( tail );
2836 }
a3621e74 2837
8aa23a47
YO
2838
2839 DEBUG_OPTIMISE_r({
2840 regprop(RExC_rx, mysv, tail );
2841 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2842 (int)depth * 2 + 2, "",
2843 "Looking for TRIE'able sequences. Tail node is: ",
2844 SvPV_nolen_const( mysv )
2845 );
2846 });
2847
2848 /*
2849
2850 step through the branches, cur represents each
2851 branch, noper is the first thing to be matched
2852 as part of that branch and noper_next is the
2853 regnext() of that node. if noper is an EXACT
2854 and noper_next is the same as scan (our current
2855 position in the regex) then the EXACT branch is
2856 a possible optimization target. Once we have
2857 two or more consequetive such branches we can
2858 create a trie of the EXACT's contents and stich
2859 it in place. If the sequence represents all of
2860 the branches we eliminate the whole thing and
2861 replace it with a single TRIE. If it is a
2862 subsequence then we need to stitch it in. This
2863 means the first branch has to remain, and needs
2864 to be repointed at the item on the branch chain
2865 following the last branch optimized. This could
2866 be either a BRANCH, in which case the
2867 subsequence is internal, or it could be the
2868 item following the branch sequence in which
2869 case the subsequence is at the end.
2870
2871 */
2872
2873 /* dont use tail as the end marker for this traverse */
2874 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2875 regnode * const noper = NEXTOPER( cur );
b515a41d 2876#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2877 regnode * const noper_next = regnext( noper );
b515a41d
YO
2878#endif
2879
8aa23a47
YO
2880 DEBUG_OPTIMISE_r({
2881 regprop(RExC_rx, mysv, cur);
2882 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2883 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2884
2885 regprop(RExC_rx, mysv, noper);
2886 PerlIO_printf( Perl_debug_log, " -> %s",
2887 SvPV_nolen_const(mysv));
2888
2889 if ( noper_next ) {
2890 regprop(RExC_rx, mysv, noper_next );
2891 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2892 SvPV_nolen_const(mysv));
2893 }
2894 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2895 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2896 });
2897 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2898 : PL_regkind[ OP( noper ) ] == EXACT )
2899 || OP(noper) == NOTHING )
786e8c11 2900#ifdef NOJUMPTRIE
8aa23a47 2901 && noper_next == tail
786e8c11 2902#endif
8aa23a47
YO
2903 && count < U16_MAX)
2904 {
2905 count++;
2906 if ( !first || optype == NOTHING ) {
2907 if (!first) first = cur;
2908 optype = OP( noper );
2909 } else {
2910 last = cur;
2911 }
2912 } else {
a0a388a1 2913/*
0abd0d78
YO
2914 Currently we do not believe that the trie logic can
2915 handle case insensitive matching properly when the
2916 pattern is not unicode (thus forcing unicode semantics).
2917
2918 If/when this is fixed the following define can be swapped
2919 in below to fully enable trie logic.
2920
a0a388a1 2921#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2922
2923*/
2924#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2925
a0a388a1 2926 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2927 make_trie( pRExC_state,
2928 startbranch, first, cur, tail, count,
2929 optype, depth+1 );
2930 }
2931 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2932#ifdef NOJUMPTRIE
8aa23a47 2933 && noper_next == tail
786e8c11 2934#endif
8aa23a47
YO
2935 ){
2936 count = 1;
2937 first = cur;
2938 optype = OP( noper );
2939 } else {
2940 count = 0;
2941 first = NULL;
2942 optype = 0;
2943 }
2944 last = NULL;
2945 }
2946 }
2947 DEBUG_OPTIMISE_r({
2948 regprop(RExC_rx, mysv, cur);
2949 PerlIO_printf( Perl_debug_log,
2950 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2951 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2952
2953 });
a0a388a1
YO
2954
2955 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2956 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2957#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2958 if ( ((made == MADE_EXACT_TRIE &&
2959 startbranch == first)
2960 || ( first_non_open == first )) &&
2961 depth==0 ) {
2962 flags |= SCF_TRIE_RESTUDY;
2963 if ( startbranch == first
2964 && scan == tail )
2965 {
2966 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2967 }
2968 }
3dab1dad 2969#endif
8aa23a47
YO
2970 }
2971 }
2972
2973 } /* do trie */
2974
653099ff 2975 }
8aa23a47
YO
2976 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2977 scan = NEXTOPER(NEXTOPER(scan));
2978 } else /* single branch is optimized. */
2979 scan = NEXTOPER(scan);
2980 continue;
2981 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2982 scan_frame *newframe = NULL;
2983 I32 paren;
2984 regnode *start;
2985 regnode *end;
2986
2987 if (OP(scan) != SUSPEND) {
2988 /* set the pointer */
2989 if (OP(scan) == GOSUB) {
2990 paren = ARG(scan);
2991 RExC_recurse[ARG2L(scan)] = scan;
2992 start = RExC_open_parens[paren-1];
2993 end = RExC_close_parens[paren-1];
2994 } else {
2995 paren = 0;
f8fc2ecf 2996 start = RExC_rxi->program + 1;
8aa23a47
YO
2997 end = RExC_opend;
2998 }
2999 if (!recursed) {
3000 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3001 SAVEFREEPV(recursed);
3002 }
3003 if (!PAREN_TEST(recursed,paren+1)) {
3004 PAREN_SET(recursed,paren+1);
3005 Newx(newframe,1,scan_frame);
3006 } else {
3007 if (flags & SCF_DO_SUBSTR) {
304ee84b 3008 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3009 data->longest = &(data->longest_float);
3010 }
3011 is_inf = is_inf_internal = 1;
3012 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3013 cl_anything(pRExC_state, data->start_class);
3014 flags &= ~SCF_DO_STCLASS;
3015 }
3016 } else {
3017 Newx(newframe,1,scan_frame);
3018 paren = stopparen;
3019 start = scan+2;
3020 end = regnext(scan);
3021 }
3022 if (newframe) {
3023 assert(start);
3024 assert(end);
3025 SAVEFREEPV(newframe);
3026 newframe->next = regnext(scan);
3027 newframe->last = last;
3028 newframe->stop = stopparen;
3029 newframe->prev = frame;
3030
3031 frame = newframe;
3032 scan = start;
3033 stopparen = paren;
3034 last = end;
3035
3036 continue;
3037 }
3038 }
3039 else if (OP(scan) == EXACT) {
3040 I32 l = STR_LEN(scan);
3041 UV uc;
3042 if (UTF) {
3043 const U8 * const s = (U8*)STRING(scan);
3044 l = utf8_length(s, s + l);
3045 uc = utf8_to_uvchr(s, NULL);
3046 } else {
3047 uc = *((U8*)STRING(scan));
3048 }
3049 min += l;
3050 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3051 /* The code below prefers earlier match for fixed
3052 offset, later match for variable offset. */
3053 if (data->last_end == -1) { /* Update the start info. */
3054 data->last_start_min = data->pos_min;
3055 data->last_start_max = is_inf
3056 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3057 }
8aa23a47
YO
3058 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3059 if (UTF)
3060 SvUTF8_on(data->last_found);
3061 {
3062 SV * const sv = data->last_found;
3063 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3064 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3065 if (mg && mg->mg_len >= 0)
3066 mg->mg_len += utf8_length((U8*)STRING(scan),
3067 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3068 }
8aa23a47
YO
3069 data->last_end = data->pos_min + l;
3070 data->pos_min += l; /* As in the first entry. */
3071 data->flags &= ~SF_BEFORE_EOL;
3072 }
3073 if (flags & SCF_DO_STCLASS_AND) {
3074 /* Check whether it is compatible with what we know already! */
3075 int compat = 1;
3076
54251c2e
KW
3077
3078 /* If compatibile, we or it in below. It is compatible if is
3079 * in the bitmp and either 1) its bit or its fold is set, or 2)
3080 * it's for a locale. Even if there isn't unicode semantics
3081 * here, at runtime there may be because of matching against a
3082 * utf8 string, so accept a possible false positive for
3083 * latin1-range folds */
8aa23a47
YO
3084 if (uc >= 0x100 ||
3085 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3086 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3087 && (!(data->start_class->flags & ANYOF_FOLD)
54251c2e 3088 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3089 )
3090 compat = 0;
3091 ANYOF_CLASS_ZERO(data->start_class);
3092 ANYOF_BITMAP_ZERO(data->start_class);
3093 if (compat)
3094 ANYOF_BITMAP_SET(data->start_class, uc);
3095 data->start_class->flags &= ~ANYOF_EOS;
3096 if (uc < 0x100)
3097 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3098 }
3099 else if (flags & SCF_DO_STCLASS_OR) {
3100 /* false positive possible if the class is case-folded */
3101 if (uc < 0x100)
3102 ANYOF_BITMAP_SET(data->start_class, uc);
3103 else
3104 data->start_class->flags |= ANYOF_UNICODE_ALL;
3105 data->start_class->flags &= ~ANYOF_EOS;
3106 cl_and(data->start_class, and_withp);
3107 }
3108 flags &= ~SCF_DO_STCLASS;
3109 }
3110 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3111 I32 l = STR_LEN(scan);
3112 UV uc = *((U8*)STRING(scan));
3113
3114 /* Search for fixed substrings supports EXACT only. */
3115 if (flags & SCF_DO_SUBSTR) {
3116 assert(data);
304ee84b 3117 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3118 }
3119 if (UTF) {
3120 const U8 * const s = (U8 *)STRING(scan);
3121 l = utf8_length(s, s + l);
3122 uc = utf8_to_uvchr(s, NULL);
3123 }
3124 min += l;
3125 if (flags & SCF_DO_SUBSTR)
3126 data->pos_min += l;
3127 if (flags & SCF_DO_STCLASS_AND) {
3128 /* Check whether it is compatible with what we know already! */
3129 int compat = 1;
8aa23a47 3130 if (uc >= 0x100 ||
54251c2e
KW
3131 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3132 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3133 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3134 {
8aa23a47 3135 compat = 0;
54251c2e 3136 }
8aa23a47
YO
3137 ANYOF_CLASS_ZERO(data->start_class);
3138 ANYOF_BITMAP_ZERO(data->start_class);
3139 if (compat) {
3140 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3141 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47 3142 data->start_class->flags |= ANYOF_FOLD;
970c8436 3143 if (OP(scan) == EXACTFL) {
8aa23a47 3144 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3145 }
3146 else {
3147
54251c2e
KW
3148 /* Also set the other member of the fold pair. In case
3149 * that unicode semantics is called for at runtime, use
3150 * the full latin1 fold. (Can't do this for locale,
3151 * because not known until runtime */
3152 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3153 }
653099ff 3154 }
8aa23a47
YO
3155 }
3156 else if (flags & SCF_DO_STCLASS_OR) {
3157 if (data->start_class->flags & ANYOF_FOLD) {
3158 /* false positive possible if the class is case-folded.
3159 Assume that the locale settings are the same... */
970c8436 3160 if (uc < 0x100) {
1aa99e6b 3161 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3162 if (OP(scan) != EXACTFL) {
3163
3164 /* And set the other member of the fold pair, but
3165 * can't do that in locale because not known until
3166 * run-time */
3167 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3168 PL_fold_latin1[uc]);
970c8436
KW
3169 }
3170 }
653099ff
GS
3171 data->start_class->flags &= ~ANYOF_EOS;
3172 }
8aa23a47 3173 cl_and(data->start_class, and_withp);
653099ff 3174 }
8aa23a47
YO
3175 flags &= ~SCF_DO_STCLASS;
3176 }
e52fc539 3177 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3178 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3179 I32 f = flags, pos_before = 0;
3180 regnode * const oscan = scan;
3181 struct regnode_charclass_class this_class;
3182 struct regnode_charclass_class *oclass = NULL;
3183 I32 next_is_eval = 0;
3184
3185 switch (PL_regkind[OP(scan)]) {
3186 case WHILEM: /* End of (?:...)* . */
3187 scan = NEXTOPER(scan);
3188 goto finish;
3189 case PLUS:
3190 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3191 next = NEXTOPER(scan);
3192 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3193 mincount = 1;
3194 maxcount = REG_INFTY;
3195 next = regnext(scan);
3196 scan = NEXTOPER(scan);
3197 goto do_curly;
3198 }
3199 }
3200 if (flags & SCF_DO_SUBSTR)
3201 data->pos_min++;
3202 min++;
3203 /* Fall through. */
3204 case STAR:
3205 if (flags & SCF_DO_STCLASS) {
3206 mincount = 0;
3207 maxcount = REG_INFTY;
3208 next = regnext(scan);
3209 scan = NEXTOPER(scan);
3210 goto do_curly;
3211 }
3212 is_inf = is_inf_internal = 1;
3213 scan = regnext(scan);
c277df42 3214 if (flags & SCF_DO_SUBSTR) {
304ee84b 3215 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3216 data->longest = &(data->longest_float);
c277df42 3217 }
8aa23a47
YO
3218 goto optimize_curly_tail;
3219 case CURLY:
3220 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3221 && (scan->flags == stopparen))
3222 {
3223 mincount = 1;
3224 maxcount = 1;
3225 } else {
3226 mincount = ARG1(scan);
3227 maxcount = ARG2(scan);
653099ff 3228 }
8aa23a47
YO
3229 next = regnext(scan);
3230 if (OP(scan) == CURLYX) {
3231 I32 lp = (data ? *(data->last_closep) : 0);
3232 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3233 }
8aa23a47
YO
3234 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3235 next_is_eval = (OP(scan) == EVAL);
3236 do_curly:
3237 if (flags & SCF_DO_SUBSTR) {
304ee84b 3238 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3239 pos_before = data->pos_min;
b45f050a 3240 }
8aa23a47
YO
3241 if (data) {
3242 fl = data->flags;
3243 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3244 if (is_inf)
3245 data->flags |= SF_IS_INF;
3246 }
3247 if (flags & SCF_DO_STCLASS) {
3248 cl_init(pRExC_state, &this_class);
3249 oclass = data->start_class;
3250 data->start_class = &this_class;
3251 f |= SCF_DO_STCLASS_AND;
3252 f &= ~SCF_DO_STCLASS_OR;
3253 }
779bcb7d
NC
3254 /* Exclude from super-linear cache processing any {n,m}
3255 regops for which the combination of input pos and regex
3256 pos is not enough information to determine if a match
3257 will be possible.
3258
3259 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3260 regex pos at the \s*, the prospects for a match depend not
3261 only on the input position but also on how many (bar\s*)
3262 repeats into the {4,8} we are. */
3263 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3264 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3265
8aa23a47
YO
3266 /* This will finish on WHILEM, setting scan, or on NULL: */
3267 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3268 last, data, stopparen, recursed, NULL,
3269 (mincount == 0
3270 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3271
8aa23a47
YO
3272 if (flags & SCF_DO_STCLASS)
3273 data->start_class = oclass;
3274 if (mincount == 0 || minnext == 0) {
3275 if (flags & SCF_DO_STCLASS_OR) {
3276 cl_or(pRExC_state, data->start_class, &this_class);
3277 }
3278 else if (flags & SCF_DO_STCLASS_AND) {
3279 /* Switch to OR mode: cache the old value of
3280 * data->start_class */
3281 INIT_AND_WITHP;
3282 StructCopy(data->start_class, and_withp,
3283 struct regnode_charclass_class);
3284 flags &= ~SCF_DO_STCLASS_AND;
3285 StructCopy(&this_class, data->start_class,
3286 struct regnode_charclass_class);
3287 flags |= SCF_DO_STCLASS_OR;
3288 data->start_class->flags |= ANYOF_EOS;
3289 }
3290 } else { /* Non-zero len */
3291 if (flags & SCF_DO_STCLASS_OR) {
3292 cl_or(pRExC_state, data->start_class, &this_class);
3293 cl_and(data->start_class, and_withp);
3294 }
3295 else if (flags & SCF_DO_STCLASS_AND)
3296 cl_and(data->start_class, &this_class);
3297 flags &= ~SCF_DO_STCLASS;
3298 }
3299 if (!scan) /* It was not CURLYX, but CURLY. */
3300 scan = next;
3301 if ( /* ? quantifier ok, except for (?{ ... }) */
3302 (next_is_eval || !(mincount == 0 && maxcount == 1))
3303 && (minnext == 0) && (deltanext == 0)
3304 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3305 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3306 {
668c081a
NC
3307 ckWARNreg(RExC_parse,
3308 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3309 }
3310
3311 min += minnext * mincount;
3312 is_inf_internal |= ((maxcount == REG_INFTY
3313 && (minnext + deltanext) > 0)
3314 || deltanext == I32_MAX);
3315 is_inf |= is_inf_internal;
3316 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3317
3318 /* Try powerful optimization CURLYX => CURLYN. */
3319 if ( OP(oscan) == CURLYX && data
3320 && data->flags & SF_IN_PAR
3321 && !(data->flags & SF_HAS_EVAL)
3322 && !deltanext && minnext == 1 ) {
3323 /* Try to optimize to CURLYN. */
3324 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3325 regnode * const nxt1 = nxt;
497b47a8 3326#ifdef DEBUGGING
8aa23a47 3327 regnode *nxt2;
497b47a8 3328#endif
c277df42 3329
8aa23a47
YO
3330 /* Skip open. */
3331 nxt = regnext(nxt);
e52fc539 3332 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3333 && !(PL_regkind[OP(nxt)] == EXACT
3334 && STR_LEN(nxt) == 1))
3335 goto nogo;
497b47a8 3336#ifdef DEBUGGING
8aa23a47 3337 nxt2 = nxt;
497b47a8 3338#endif
8aa23a47
YO
3339 nxt = regnext(nxt);
3340 if (OP(nxt) != CLOSE)
3341 goto nogo;
3342 if (RExC_open_parens) {
3343 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3344 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3345 }
3346 /* Now we know that nxt2 is the only contents: */
3347 oscan->flags = (U8)ARG(nxt);
3348 OP(oscan) = CURLYN;
3349 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3350
c277df42 3351#ifdef DEBUGGING
8aa23a47 3352 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3353 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3354 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3355 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3356 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3357 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3358#endif
8aa23a47
YO
3359 }
3360 nogo:
3361
3362 /* Try optimization CURLYX => CURLYM. */
3363 if ( OP(oscan) == CURLYX && data
3364 && !(data->flags & SF_HAS_PAR)
3365 && !(data->flags & SF_HAS_EVAL)
3366 && !deltanext /* atom is fixed width */
3367 && minnext != 0 /* CURLYM can't handle zero width */
3368 ) {
3369 /* XXXX How to optimize if data == 0? */
3370 /* Optimize to a simpler form. */
3371 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3372 regnode *nxt2;
3373
3374 OP(oscan) = CURLYM;
3375 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3376 && (OP(nxt2) != WHILEM))
3377 nxt = nxt2;
3378 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3379 /* Need to optimize away parenths. */
b3c0965f 3380 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3381 /* Set the parenth number. */
3382 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3383
8aa23a47
YO
3384 oscan->flags = (U8)ARG(nxt);
3385 if (RExC_open_parens) {
3386 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3387 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3388 }
8aa23a47
YO
3389 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3390 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3391
c277df42 3392#ifdef DEBUGGING
8aa23a47
YO
3393 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3394 OP(nxt + 1) = OPTIMIZED; /* was count. */
3395 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3396 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3397#endif
c277df42 3398#if 0
8aa23a47
YO
3399 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3400 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3401 if (nnxt == nxt) {
3402 if (reg_off_by_arg[OP(nxt1)])
3403 ARG_SET(nxt1, nxt2 - nxt1);
3404 else if (nxt2 - nxt1 < U16_MAX)
3405 NEXT_OFF(nxt1) = nxt2 - nxt1;
3406 else
3407 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3408 }
8aa23a47 3409 nxt1 = nnxt;
c277df42 3410 }
5d1c421c 3411#endif
8aa23a47
YO
3412 /* Optimize again: */
3413 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3414 NULL, stopparen, recursed, NULL, 0,depth+1);
3415 }
3416 else
3417 oscan->flags = 0;
3418 }
3419 else if ((OP(oscan) == CURLYX)
3420 && (flags & SCF_WHILEM_VISITED_POS)
3421 /* See the comment on a similar expression above.
3b753521 3422 However, this time it's not a subexpression
8aa23a47
YO
3423 we care about, but the expression itself. */
3424 && (maxcount == REG_INFTY)
3425 && data && ++data->whilem_c < 16) {
3426 /* This stays as CURLYX, we can put the count/of pair. */
3427 /* Find WHILEM (as in regexec.c) */
3428 regnode *nxt = oscan + NEXT_OFF(oscan);
3429
3430 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3431 nxt += ARG(nxt);
3432 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3433 | (RExC_whilem_seen << 4)); /* On WHILEM */
3434 }
3435 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3436 pars++;
3437 if (flags & SCF_DO_SUBSTR) {
3438 SV *last_str = NULL;
3439 int counted = mincount != 0;
a0ed51b3 3440
8aa23a47
YO
3441 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3442#if defined(SPARC64_GCC_WORKAROUND)
3443 I32 b = 0;
3444 STRLEN l = 0;
3445 const char *s = NULL;
3446 I32 old = 0;
b515a41d 3447
8aa23a47
YO
3448 if (pos_before >= data->last_start_min)
3449 b = pos_before;
3450 else
3451 b = data->last_start_min;
b515a41d 3452
8aa23a47
YO
3453 l = 0;
3454 s = SvPV_const(data->last_found, l);
3455 old = b - data->last_start_min;
3456
3457#else
3458 I32 b = pos_before >= data->last_start_min
3459 ? pos_before : data->last_start_min;
3460 STRLEN l;
3461 const char * const s = SvPV_const(data->last_found, l);
3462 I32 old = b - data->last_start_min;
3463#endif
3464
3465 if (UTF)
3466 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3467 l -= old;
3468 /* Get the added string: */
740cce10 3469 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3470 if (deltanext == 0 && pos_before == b) {
3471 /* What was added is a constant string */
3472 if (mincount > 1) {
3473 SvGROW(last_str, (mincount * l) + 1);
3474 repeatcpy(SvPVX(last_str) + l,
3475 SvPVX_const(last_str), l, mincount - 1);
3476 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3477 /* Add additional parts. */
3478 SvCUR_set(data->last_found,
3479 SvCUR(data->last_found) - l);
3480 sv_catsv(data->last_found, last_str);
3481 {
3482 SV * sv = data->last_found;
3483 MAGIC *mg =
3484 SvUTF8(sv) && SvMAGICAL(sv) ?
3485 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3486 if (mg && mg->mg_len >= 0)
bd94e887 3487 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3488 }
8aa23a47 3489 data->last_end += l * (mincount - 1);
b515a41d 3490 }
8aa23a47
YO
3491 } else {
3492 /* start offset must point into the last copy */
3493 data->last_start_min += minnext * (mincount - 1);
3494 data->last_start_max += is_inf ? I32_MAX
3495 : (maxcount - 1) * (minnext + data->pos_delta);
3496 }
c277df42 3497 }
8aa23a47
YO
3498 /* It is counted once already... */
3499 data->pos_min += minnext * (mincount - counted);
3500 data->pos_delta += - counted * deltanext +
3501 (minnext + deltanext) * maxcount - minnext * mincount;
3502 if (mincount != maxcount) {
3503 /* Cannot extend fixed substrings found inside
3504 the group. */
304ee84b 3505 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3506 if (mincount && last_str) {
3507 SV * const sv = data->last_found;
3508 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3509 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3510
3511 if (mg)
3512 mg->mg_len = -1;
3513 sv_setsv(sv, last_str);
3514 data->last_end = data->pos_min;
3515 data->last_start_min =
3516 data->pos_min - CHR_SVLEN(last_str);
3517 data->last_start_max = is_inf
3518 ? I32_MAX
3519 : data->pos_min + data->pos_delta
3520 - CHR_SVLEN(last_str);
3521 }
3522 data->longest = &(data->longest_float);
3523 }
3524 SvREFCNT_dec(last_str);
c277df42 3525 }
8aa23a47
YO
3526 if (data && (fl & SF_HAS_EVAL))
3527 data->flags |= SF_HAS_EVAL;
3528 optimize_curly_tail:
3529 if (OP(oscan) != CURLYX) {
3530 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3531 && NEXT_OFF(next))
3532 NEXT_OFF(oscan) += NEXT_OFF(next);
3533 }
3534 continue;
3535 default: /* REF and CLUMP only? */
3536 if (flags & SCF_DO_SUBSTR) {
304ee84b 3537 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3538 data->longest = &(data->longest_float);
3539 }
3540 is_inf = is_inf_internal = 1;
3541 if (flags & SCF_DO_STCLASS_OR)
3542 cl_anything(pRExC_state, data->start_class);
3543 flags &= ~SCF_DO_STCLASS;
3544 break;
c277df42 3545 }
8aa23a47 3546 }
e1d1eefb
YO
3547 else if (OP(scan) == LNBREAK) {
3548 if (flags & SCF_DO_STCLASS) {
3549 int value = 0;
3550 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3551 if (flags & SCF_DO_STCLASS_AND) {
3552 for (value = 0; value < 256; value++)
e64b1bd1 3553 if (!is_VERTWS_cp(value))
b9a59e08
KW
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3555 }
3556 else {
e1d1eefb 3557 for (value = 0; value < 256; value++)
e64b1bd1 3558 if (is_VERTWS_cp(value))
b9a59e08
KW
3559 ANYOF_BITMAP_SET(data->start_class, value);
3560 }
e1d1eefb
YO
3561 if (flags & SCF_DO_STCLASS_OR)
3562 cl_and(data->start_class, and_withp);
3563 flags &= ~SCF_DO_STCLASS;
3564 }
3565 min += 1;
f9a79580 3566 delta += 1;
e1d1eefb
YO
3567 if (flags & SCF_DO_SUBSTR) {
3568 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3569 data->pos_min += 1;
f9a79580 3570 data->pos_delta += 1;
e1d1eefb
YO
3571 data->longest = &(data->longest_float);
3572 }
e1d1eefb 3573 }
f9a79580 3574 else if (OP(scan) == FOLDCHAR) {
ced7f090 3575 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3576 flags &= ~SCF_DO_STCLASS;
3577 min += 1;
3578 delta += d;
3579 if (flags & SCF_DO_SUBSTR) {
3580 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3581 data->pos_min += 1;
3582 data->pos_delta += d;
3583 data->longest = &(data->longest_float);
3584 }
3585 }
e52fc539 3586 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3587 int value = 0;
653099ff 3588
8aa23a47 3589 if (flags & SCF_DO_SUBSTR) {
304ee84b 3590 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3591 data->pos_min++;
3592 }
3593 min++;
3594 if (flags & SCF_DO_STCLASS) {
3595 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3596
8aa23a47
YO
3597 /* Some of the logic below assumes that switching
3598 locale on will only add false positives. */
3599 switch (PL_regkind[OP(scan)]) {
3600 case SANY:
3601 default:
3602 do_default:
3603 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3604 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3605 cl_anything(pRExC_state, data->start_class);
3606 break;
3607 case REG_ANY:
3608 if (OP(scan) == SANY)
3609 goto do_default;
3610 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3611 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3612 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3613 cl_anything(pRExC_state, data->start_class);
653099ff 3614 }
8aa23a47
YO
3615 if (flags & SCF_DO_STCLASS_AND || !value)
3616 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3617 break;
3618 case ANYOF:
3619 if (flags & SCF_DO_STCLASS_AND)
3620 cl_and(data->start_class,
3621 (struct regnode_charclass_class*)scan);
653099ff 3622 else
8aa23a47
YO
3623 cl_or(pRExC_state, data->start_class,
3624 (struct regnode_charclass_class*)scan);
3625 break;
3626 case ALNUM:
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3630 if (FLAGS(scan) & USE_UNI) {
3631 for (value = 0; value < 256; value++) {
3632 if (!isWORDCHAR_L1(value)) {
3633 ANYOF_BITMAP_CLEAR(data->start_class, value);
3634 }
3635 }
3636 } else {
3637 for (value = 0; value < 256; value++) {
3638 if (!isALNUM(value)) {
3639 ANYOF_BITMAP_CLEAR(data->start_class, value);
3640 }
3641 }
3642 }
8aa23a47 3643 }
653099ff 3644 }
8aa23a47
YO
3645 else {
3646 if (data->start_class->flags & ANYOF_LOCALE)
3647 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3648 else if (FLAGS(scan) & USE_UNI) {
3649 for (value = 0; value < 256; value++) {
3650 if (isWORDCHAR_L1(value)) {
3651 ANYOF_BITMAP_SET(data->start_class, value);
3652 }
3653 }
3654 } else {
3655 for (value = 0; value < 256; value++) {
3656 if (isALNUM(value)) {
3657 ANYOF_BITMAP_SET(data->start_class, value);
3658 }
3659 }
3660 }
8aa23a47
YO
3661 }
3662 break;
3663 case ALNUML:
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 if (data->start_class->flags & ANYOF_LOCALE)
3666 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3667 }
3668 else {
3669 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3670 data->start_class->flags |= ANYOF_LOCALE;
3671 }
3672 break;
3673 case NALNUM:
3674 if (flags & SCF_DO_STCLASS_AND) {
3675 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3677 if (FLAGS(scan) & USE_UNI) {
3678 for (value = 0; value < 256; value++) {
3679 if (isWORDCHAR_L1(value)) {
3680 ANYOF_BITMAP_CLEAR(data->start_class, value);
3681 }
3682 }
3683 } else {
3684 for (value = 0; value < 256; value++) {
3685 if (isALNUM(value)) {
3686 ANYOF_BITMAP_CLEAR(data->start_class, value);
3687 }
3688 }
3689 }
653099ff
GS
3690 }
3691 }
8aa23a47
YO
3692 else {
3693 if (data->start_class->flags & ANYOF_LOCALE)
3694 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3695 else {
3696 for (value = 0; value < 256; value++)
3697 if (!isALNUM(value))
b9a59e08 3698 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3699 }
653099ff 3700 }
8aa23a47
YO
3701 break;
3702 case NALNUML:
3703 if (flags & SCF_DO_STCLASS_AND) {
3704 if (data->start_class->flags & ANYOF_LOCALE)
3705 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3706 }
8aa23a47
YO
3707 else {
3708 data->start_class->flags |= ANYOF_LOCALE;
3709 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3710 }
3711 break;
3712 case SPACE:
3713 if (flags & SCF_DO_STCLASS_AND) {
3714 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3715 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3716 if (FLAGS(scan) & USE_UNI) {
3717 for (value = 0; value < 256; value++) {
3718 if (!isSPACE_L1(value)) {
3719 ANYOF_BITMAP_CLEAR(data->start_class, value);
3720 }
3721 }
3722 } else {
3723 for (value = 0; value < 256; value++) {
3724 if (!isSPACE(value)) {
3725 ANYOF_BITMAP_CLEAR(data->start_class, value);
3726 }
3727 }
3728 }
653099ff
GS
3729 }
3730 }
8aa23a47 3731 else {
a12cf05f 3732 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3733 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3734 }
3735 else if (FLAGS(scan) & USE_UNI) {
3736 for (value = 0; value < 256; value++) {
3737 if (isSPACE_L1(value)) {
3738 ANYOF_BITMAP_SET(data->start_class, value);
3739 }
3740 }
3741 } else {
3742 for (value = 0; value < 256; value++) {
3743 if (isSPACE(value)) {
3744 ANYOF_BITMAP_SET(data->start_class, value);
3745 }
3746 }
8aa23a47 3747 }
653099ff 3748 }
8aa23a47
YO
3749 break;
3750 case SPACEL:
3751 if (flags & SCF_DO_STCLASS_AND) {
3752 if (data->start_class->flags & ANYOF_LOCALE)
3753 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3754 }
3755 else {
3756 data->start_class->flags |= ANYOF_LOCALE;
3757 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3758 }
3759 break;
3760 case NSPACE:
3761 if (flags & SCF_DO_STCLASS_AND) {
3762 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3763 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3764 if (FLAGS(scan) & USE_UNI) {
3765 for (value = 0; value < 256; value++) {
3766 if (isSPACE_L1(value)) {
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3768 }
3769 }
3770 } else {
3771 for (value = 0; value < 256; value++) {
3772 if (isSPACE(value)) {
3773 ANYOF_BITMAP_CLEAR(data->start_class, value);
3774 }
3775 }
3776 }
653099ff 3777 }
8aa23a47
YO
3778 }
3779 else {
3780 if (data->start_class->flags & ANYOF_LOCALE)
3781 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3782 else if (FLAGS(scan) & USE_UNI) {
3783 for (value = 0; value < 256; value++) {
3784 if (!isSPACE_L1(value)) {
3785 ANYOF_BITMAP_SET(data->start_class, value);
3786 }
3787 }
3788 }
3789 else {
3790 for (value = 0; value < 256; value++) {
3791 if (!isSPACE(value)) {
3792 ANYOF_BITMAP_SET(data->start_class, value);
3793 }
3794 }
3795 }
653099ff 3796 }
8aa23a47
YO
3797 break;
3798 case NSPACEL:
3799 if (flags & SCF_DO_STCLASS_AND) {
3800 if (data->start_class->flags & ANYOF_LOCALE) {
3801 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3802 for (value = 0; value < 256; value++)
3803 if (!isSPACE(value))
3804 ANYOF_BITMAP_CLEAR(data->start_class, value);
3805 }
653099ff 3806 }
8aa23a47
YO
3807 else {
3808 data->start_class->flags |= ANYOF_LOCALE;
3809 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3810 }
3811 break;
3812 case DIGIT:
3813 if (flags & SCF_DO_STCLASS_AND) {
3814 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3815 for (value = 0; value < 256; value++)
3816 if (!isDIGIT(value))
3817 ANYOF_BITMAP_CLEAR(data->start_class, value);
3818 }
3819 else {
3820 if (data->start_class->flags & ANYOF_LOCALE)
3821 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3822 else {
3823 for (value = 0; value < 256; value++)
3824 if (isDIGIT(value))
b9a59e08 3825 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3826 }
3827 }
3828 break;
3829 case NDIGIT:
3830 if (flags & SCF_DO_STCLASS_AND) {
3831 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3832 for (value = 0; value < 256; value++)
3833 if (isDIGIT(value))
3834 ANYOF_BITMAP_CLEAR(data->start_class, value);
3835 }
3836 else {
3837 if (data->start_class->flags & ANYOF_LOCALE)
3838 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3839 else {
3840 for (value = 0; value < 256; value++)
3841 if (!isDIGIT(value))
b9a59e08 3842 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3843 }
3844 }
8aa23a47 3845 break;
e1d1eefb
YO
3846 CASE_SYNST_FNC(VERTWS);
3847 CASE_SYNST_FNC(HORIZWS);
3848
8aa23a47
YO
3849 }
3850 if (flags & SCF_DO_STCLASS_OR)
3851 cl_and(data->start_class, and_withp);
3852 flags &= ~SCF_DO_STCLASS;
3853 }
3854 }
3855 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3856 data->flags |= (OP(scan) == MEOL
3857 ? SF_BEFORE_MEOL
3858 : SF_BEFORE_SEOL);
3859 }
3860 else if ( PL_regkind[OP(scan)] == BRANCHJ
3861 /* Lookbehind, or need to calculate parens/evals/stclass: */
3862 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3863 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3864 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3865 || OP(scan) == UNLESSM )
3866 {
3867 /* Negative Lookahead/lookbehind
3868 In this case we can't do fixed string optimisation.
3869 */
1de06328 3870
8aa23a47
YO
3871 I32 deltanext, minnext, fake = 0;
3872 regnode *nscan;
3873 struct regnode_charclass_class intrnl;
3874 int f = 0;
1de06328 3875
8aa23a47
YO
3876 data_fake.flags = 0;
3877 if (data) {
3878 data_fake.whilem_c = data->whilem_c;
3879 data_fake.last_closep = data->last_closep;
c277df42 3880 }
8aa23a47
YO
3881 else
3882 data_fake.last_closep = &fake;
58e23c8d 3883 data_fake.pos_delta = delta;
8aa23a47
YO
3884 if ( flags & SCF_DO_STCLASS && !scan->flags
3885 && OP(scan) == IFMATCH ) { /* Lookahead */
3886 cl_init(pRExC_state, &intrnl);
3887 data_fake.start_class = &intrnl;
3888 f |= SCF_DO_STCLASS_AND;
3889 }
3890 if (flags & SCF_WHILEM_VISITED_POS)
3891 f |= SCF_WHILEM_VISITED_POS;
3892 next = regnext(scan);
3893 nscan = NEXTOPER(NEXTOPER(scan));
3894 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3895 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3896 if (scan->flags) {
3897 if (deltanext) {
58e23c8d 3898 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3899 }
3900 else if (minnext > (I32)U8_MAX) {
58e23c8d 3901 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3902 }
3903 scan->flags = (U8)minnext;
3904 }
3905 if (data) {
3906 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3907 pars++;
3908 if (data_fake.flags & SF_HAS_EVAL)
3909 data->flags |= SF_HAS_EVAL;
3910 data->whilem_c = data_fake.whilem_c;
3911 }
3912 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3913 if (flags & SCF_DO_STCLASS_OR) {
3914 /* OR before, AND after: ideally we would recurse with
3915 * data_fake to get the AND applied by study of the
3916 * remainder of the pattern, and then derecurse;
3917 * *** HACK *** for now just treat as "no information".
3918 * See [perl #56690].
3919 */
3920 cl_init(pRExC_state, data->start_class);
3921 } else {
3922 /* AND before and after: combine and continue */
3923 const int was = (data->start_class->flags & ANYOF_EOS);
3924
3925 cl_and(data->start_class, &intrnl);
3926 if (was)
3927 data->start_class->flags |= ANYOF_EOS;
3928 }
8aa23a47 3929 }
cb434fcc 3930 }
8aa23a47
YO
3931#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3932 else {
3933 /* Positive Lookahead/lookbehind
3934 In this case we can do fixed string optimisation,
3935 but we must be careful about it. Note in the case of
3936 lookbehind the positions will be offset by the minimum
3937 length of the pattern, something we won't know about
3938 until after the recurse.
3939 */
3940 I32 deltanext, fake = 0;
3941 regnode *nscan;
3942 struct regnode_charclass_class intrnl;
3943 int f = 0;
3944 /* We use SAVEFREEPV so that when the full compile
3945 is finished perl will clean up the allocated
3b753521 3946 minlens when it's all done. This way we don't
8aa23a47
YO
3947 have to worry about freeing them when we know
3948 they wont be used, which would be a pain.
3949 */
3950 I32 *minnextp;
3951 Newx( minnextp, 1, I32 );
3952 SAVEFREEPV(minnextp);
3953
3954 if (data) {
3955 StructCopy(data, &data_fake, scan_data_t);
3956 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3957 f |= SCF_DO_SUBSTR;
3958 if (scan->flags)
304ee84b 3959 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3960 data_fake.last_found=newSVsv(data->last_found);
3961 }
3962 }
3963 else
3964 data_fake.last_closep = &fake;
3965 data_fake.flags = 0;
58e23c8d 3966 data_fake.pos_delta = delta;
8aa23a47
YO
3967 if (is_inf)
3968 data_fake.flags |= SF_IS_INF;
3969 if ( flags & SCF_DO_STCLASS && !scan->flags
3970 && OP(scan) == IFMATCH ) { /* Lookahead */
3971 cl_init(pRExC_state, &intrnl);
3972 data_fake.start_class = &intrnl;
3973 f |= SCF_DO_STCLASS_AND;
3974 }
3975 if (flags & SCF_WHILEM_VISITED_POS)
3976 f |= SCF_WHILEM_VISITED_POS;
3977 next = regnext(scan);
3978 nscan = NEXTOPER(NEXTOPER(scan));
3979
3980 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3981 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3982 if (scan->flags) {
3983 if (deltanext) {
58e23c8d 3984 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3985 }
3986 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3987 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3988 }
3989 scan->flags = (U8)*minnextp;
3990 }
3991
3992 *minnextp += min;
3993
3994 if (f & SCF_DO_STCLASS_AND) {
3995 const int was = (data->start_class->flags & ANYOF_EOS);
3996
3997 cl_and(data->start_class, &intrnl);
3998 if (was)
3999 data->start_class->flags |= ANYOF_EOS;
4000 }
4001 if (data) {
4002 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4003 pars++;
4004 if (data_fake.flags & SF_HAS_EVAL)
4005 data->flags |= SF_HAS_EVAL;
4006 data->whilem_c = data_fake.whilem_c;
4007 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4008 if (RExC_rx->minlen<*minnextp)
4009 RExC_rx->minlen=*minnextp;
304ee84b 4010 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4011 SvREFCNT_dec(data_fake.last_found);
4012
4013 if ( data_fake.minlen_fixed != minlenp )
4014 {
4015 data->offset_fixed= data_fake.offset_fixed;
4016 data->minlen_fixed= data_fake.minlen_fixed;
4017 data->lookbehind_fixed+= scan->flags;
4018 }
4019 if ( data_fake.minlen_float != minlenp )
4020 {
4021 data->minlen_float= data_fake.minlen_float;
4022 data->offset_float_min=data_fake.offset_float_min;
4023 data->offset_float_max=data_fake.offset_float_max;
4024 data->lookbehind_float+= scan->flags;
4025 }
4026 }
4027 }
4028
4029
40d049e4 4030 }
8aa23a47
YO
4031#endif
4032 }
4033 else if (OP(scan) == OPEN) {
4034 if (stopparen != (I32)ARG(scan))
4035 pars++;
4036 }
4037 else if (OP(scan) == CLOSE) {
4038 if (stopparen == (I32)ARG(scan)) {
4039 break;
4040 }
4041 if ((I32)ARG(scan) == is_par) {
4042 next = regnext(scan);
b515a41d 4043
8aa23a47
YO
4044 if ( next && (OP(next) != WHILEM) && next < last)
4045 is_par = 0; /* Disable optimization */
40d049e4 4046 }
8aa23a47
YO
4047 if (data)
4048 *(data->last_closep) = ARG(scan);
4049 }
4050 else if (OP(scan) == EVAL) {
c277df42
IZ
4051 if (data)
4052 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4053 }
4054 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4055 if (flags & SCF_DO_SUBSTR) {
304ee84b 4056 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4057 flags &= ~SCF_DO_SUBSTR;
40d049e4 4058 }
8aa23a47
YO
4059 if (data && OP(scan)==ACCEPT) {
4060 data->flags |= SCF_SEEN_ACCEPT;
4061 if (stopmin > min)
4062 stopmin = min;
e2e6a0f1 4063 }
8aa23a47
YO
4064 }
4065 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4066 {
0f5d15d6 4067 if (flags & SCF_DO_SUBSTR) {
304ee84b 4068 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4069 data->longest = &(data->longest_float);
4070 }
4071 is_inf = is_inf_internal = 1;
653099ff 4072 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4073 cl_anything(pRExC_state, data->start_class);
96776eda 4074 flags &= ~SCF_DO_STCLASS;
8aa23a47 4075 }
58e23c8d 4076 else if (OP(scan) == GPOS) {
bbe252da 4077 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4078 !(delta || is_inf || (data && data->pos_delta)))
4079 {
bbe252da
YO
4080 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4081 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4082 if (RExC_rx->gofs < (U32)min)
4083 RExC_rx->gofs = min;
4084 } else {
bbe252da 4085 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4086 RExC_rx->gofs = 0;
4087 }
4088 }
786e8c11 4089#ifdef TRIE_STUDY_OPT
40d049e4 4090#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4091 else if (PL_regkind[OP(scan)] == TRIE) {
4092 /* NOTE - There is similar code to this block above for handling
4093 BRANCH nodes on the initial study. If you change stuff here
4094 check there too. */
4095 regnode *trie_node= scan;
4096 regnode *tail= regnext(scan);
f8fc2ecf 4097 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4098 I32 max1 = 0, min1 = I32_MAX;
4099 struct regnode_charclass_class accum;
4100
4101 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4102 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4103 if (flags & SCF_DO_STCLASS)
4104 cl_init_zero(pRExC_state, &accum);
4105
4106 if (!trie->jump) {
4107 min1= trie->minlen;
4108 max1= trie->maxlen;
4109 } else {
4110 const regnode *nextbranch= NULL;
4111 U32 word;
4112
4113 for ( word=1 ; word <= trie->wordcount ; word++)
4114 {
4115 I32 deltanext=0, minnext=0, f = 0, fake;
4116 struct regnode_charclass_class this_class;
4117
4118 data_fake.flags = 0;
4119 if (data) {
4120 data_fake.whilem_c = data->whilem_c;
4121 data_fake.last_closep = data->last_closep;
4122 }
4123 else
4124 data_fake.last_closep = &fake;
58e23c8d 4125 data_fake.pos_delta = delta;
8aa23a47
YO
4126 if (flags & SCF_DO_STCLASS) {
4127 cl_init(pRExC_state, &this_class);
4128 data_fake.start_class = &this_class;
4129 f = SCF_DO_STCLASS_AND;
4130 }
4131 if (flags & SCF_WHILEM_VISITED_POS)
4132 f |= SCF_WHILEM_VISITED_POS;
4133
4134 if (trie->jump[word]) {
4135 if (!nextbranch)
4136 nextbranch = trie_node + trie->jump[0];
4137 scan= trie_node + trie->jump[word];
4138 /* We go from the jump point to the branch that follows
4139 it. Note this means we need the vestigal unused branches
4140 even though they arent otherwise used.
4141 */
4142 minnext = study_chunk(pRExC_state, &scan, minlenp,
4143 &deltanext, (regnode *)nextbranch, &data_fake,
4144 stopparen, recursed, NULL, f,depth+1);
4145 }
4146 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4147 nextbranch= regnext((regnode*)nextbranch);
4148
4149 if (min1 > (I32)(minnext + trie->minlen))
4150 min1 = minnext + trie->minlen;
4151 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4152 max1 = minnext + deltanext + trie->maxlen;
4153 if (deltanext == I32_MAX)
4154 is_inf = is_inf_internal = 1;
4155
4156 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4157 pars++;
4158 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4159 if ( stopmin > min + min1)
4160 stopmin = min + min1;
4161 flags &= ~SCF_DO_SUBSTR;
4162 if (data)
4163 data->flags |= SCF_SEEN_ACCEPT;
4164 }
4165 if (data) {
4166 if (data_fake.flags & SF_HAS_EVAL)
4167 data->flags |= SF_HAS_EVAL;
4168 data->whilem_c = data_fake.whilem_c;
4169 }
4170 if (flags & SCF_DO_STCLASS)
4171 cl_or(pRExC_state, &accum, &this_class);
4172 }
4173 }
4174 if (flags & SCF_DO_SUBSTR) {
4175 data->pos_min += min1;
4176 data->pos_delta += max1 - min1;
4177 if (max1 != min1 || is_inf)
4178 data->longest = &(data->longest_float);
4179 }
4180 min += min1;
4181 delta += max1 - min1;
4182 if (flags & SCF_DO_STCLASS_OR) {
4183 cl_or(pRExC_state, data->start_class, &accum);
4184 if (min1) {
4185 cl_and(data->start_class, and_withp);
4186 flags &= ~SCF_DO_STCLASS;
4187 }
4188 }
4189 else if (flags & SCF_DO_STCLASS_AND) {
4190 if (min1) {
4191 cl_and(data->start_class, &accum);
4192 flags &= ~SCF_DO_STCLASS;
4193 }
4194 else {
4195 /* Switch to OR mode: cache the old value of
4196 * data->start_class */
4197 INIT_AND_WITHP;
4198 StructCopy(data->start_class, and_withp,
4199 struct regnode_charclass_class);
4200 flags &= ~SCF_DO_STCLASS_AND;
4201 StructCopy(&accum, data->start_class,
4202 struct regnode_charclass_class);
4203 flags |= SCF_DO_STCLASS_OR;
4204 data->start_class->flags |= ANYOF_EOS;
4205 }
4206 }
4207 scan= tail;
4208 continue;
4209 }
786e8c11 4210#else
8aa23a47 4211 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4212 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4213 U8*bang=NULL;
4214
4215 min += trie->minlen;
4216 delta += (trie->maxlen - trie->minlen);
4217 flags &= ~SCF_DO_STCLASS; /* xxx */
4218 if (flags & SCF_DO_SUBSTR) {
304ee84b 4219 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4220 data->pos_min += trie->minlen;
4221 data->pos_delta += (trie->maxlen - trie->minlen);
4222 if (trie->maxlen != trie->minlen)
4223 data->longest = &(data->longest_float);
4224 }
4225 if (trie->jump) /* no more substrings -- for now /grr*/
4226 flags &= ~SCF_DO_SUBSTR;
b515a41d 4227 }
8aa23a47
YO
4228#endif /* old or new */
4229#endif /* TRIE_STUDY_OPT */
e1d1eefb 4230
8aa23a47
YO
4231 /* Else: zero-length, ignore. */
4232 scan = regnext(scan);
4233 }
4234 if (frame) {
4235 last = frame->last;
4236 scan = frame->next;
4237 stopparen = frame->stop;
4238 frame = frame->prev;
4239 goto fake_study_recurse;
c277df42
IZ
4240 }
4241
4242 finish:
8aa23a47 4243 assert(!frame);
304ee84b 4244 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4245
c277df42 4246 *scanp = scan;
aca2d497 4247 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4248 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4249 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4250 if (is_par > (I32)U8_MAX)
c277df42
IZ
4251 is_par = 0;
4252 if (is_par && pars==1 && data) {
4253 data->flags |= SF_IN_PAR;
4254 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4255 }
4256 else if (pars && data) {
c277df42
IZ
4257 data->flags |= SF_HAS_PAR;
4258 data->flags &= ~SF_IN_PAR;
4259 }
653099ff 4260 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4261 cl_and(data->start_class, and_withp);
786e8c11
YO
4262 if (flags & SCF_TRIE_RESTUDY)
4263 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4264
304ee84b 4265 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4266
e2e6a0f1 4267 return min < stopmin ? min : stopmin;
c277df42
IZ
4268}
4269
2eccd3b2
NC
4270STATIC U32
4271S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4272{
4a4e7719
NC
4273 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4274
7918f24d
NC
4275 PERL_ARGS_ASSERT_ADD_DATA;
4276
4a4e7719
NC
4277 Renewc(RExC_rxi->data,
4278 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4279 char, struct reg_data);
4280 if(count)
f8fc2ecf 4281 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4282 else
f8fc2ecf 4283 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4284 RExC_rxi->data->count = count + n;
4285 Copy(s, RExC_rxi->data->what + count, n, U8);
4286 return count;
c277df42
IZ
4287}
4288
f8149455 4289/*XXX: todo make this not included in a non debugging perl */
76234dfb 4290#ifndef PERL_IN_XSUB_RE
d88dccdf 4291void
864dbfa3 4292Perl_reginitcolors(pTHX)
d88dccdf 4293{
97aff369 4294 dVAR;
1df70142 4295 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4296 if (s) {
1df70142
AL
4297 char *t = savepv(s);
4298 int i = 0;
4299 PL_colors[0] = t;
d88dccdf 4300 while (++i < 6) {
1df70142
AL
4301 t = strchr(t, '\t');
4302 if (t) {
4303 *t = '\0';
4304 PL_colors[i] = ++t;
d88dccdf
IZ
4305 }
4306 else
1df70142 4307 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4308 }
4309 } else {
1df70142 4310 int i = 0;
b81d288d 4311 while (i < 6)
06b5626a 4312 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4313 }
4314 PL_colorset = 1;
4315}
76234dfb 4316#endif
8615cb43 4317
07be1b83 4318
786e8c11
YO
4319#ifdef TRIE_STUDY_OPT
4320#define CHECK_RESTUDY_GOTO \
4321 if ( \
4322 (data.flags & SCF_TRIE_RESTUDY) \
4323 && ! restudied++ \
4324 ) goto reStudy
4325#else
4326#define CHECK_RESTUDY_GOTO
4327#endif
f9f4320a 4328
a687059c 4329/*
e50aee73 4330 - pregcomp - compile a regular expression into internal code
a687059c
LW
4331 *
4332 * We can't allocate space until we know how big the compiled form will be,
4333 * but we can't compile it (and thus know how big it is) until we've got a
4334 * place to put the code. So we cheat: we compile it twice, once with code
4335 * generation turned off and size counting turned on, and once "for real".
4336 * This also means that we don't allocate space until we are sure that the
4337 * thing really will compile successfully, and we never have to move the
4338 * code and thus invalidate pointers into it. (Note that it has to be in
4339 * one piece because free() must be able to free it all.) [NB: not true in perl]
4340 *
4341 * Beware that the optimization-preparation code in here knows about some
4342 * of the structure of the compiled regexp. [I'll say.]
4343 */
b9b4dddf
YO
4344
4345
4346
f9f4320a 4347#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4348#define RE_ENGINE_PTR &PL_core_reg_engine
4349#else
f9f4320a
YO
4350extern const struct regexp_engine my_reg_engine;
4351#define RE_ENGINE_PTR &my_reg_engine
4352#endif
6d5c990f
RGS
4353
4354#ifndef PERL_IN_XSUB_RE
3ab4a224 4355REGEXP *
1593ad57 4356Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4357{
97aff369 4358 dVAR;
6d5c990f 4359 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4360
4361 PERL_ARGS_ASSERT_PREGCOMP;
4362
f9f4320a
YO
4363 /* Dispatch a request to compile a regexp to correct
4364 regexp engine. */
f9f4320a
YO
4365 if (table) {
4366 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4367 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4368 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4369 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4370 DEBUG_COMPILE_r({
8d8756e7 4371 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4372 SvIV(*ptr));
4373 });
3ab4a224 4374 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4375 }
b9b4dddf 4376 }
3ab4a224 4377 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4378}
6d5c990f 4379#endif
2a5d9b1d 4380
3ab4a224 4381REGEXP *
29b09c41 4382Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4383{
4384 dVAR;
288b8c02
NC
4385 REGEXP *rx;
4386 struct regexp *r;
f8fc2ecf 4387 register regexp_internal *ri;
3ab4a224 4388 STRLEN plen;
5d51ce98
KW
4389 char *exp;
4390 char* xend;
c277df42 4391 regnode *scan;
a0d0e21e 4392 I32 flags;
a0d0e21e 4393 I32 minlen = 0;
29b09c41 4394 U32 pm_flags;
e7f38d0f
YO
4395
4396 /* these are all flags - maybe they should be turned
4397 * into a single int with different bit masks */
4398 I32 sawlookahead = 0;
a0d0e21e
LW
4399 I32 sawplus = 0;
4400 I32 sawopen = 0;
29b09c41 4401 bool used_setjump = FALSE;
e7f38d0f 4402
bbd61b5f
KW
4403 U8 jump_ret = 0;
4404 dJMPENV;
2c2d71f5 4405 scan_data_t data;
830247a4 4406 RExC_state_t RExC_state;
be8e71aa 4407 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4408#ifdef TRIE_STUDY_OPT
5d51ce98 4409 int restudied;
07be1b83
YO
4410 RExC_state_t copyRExC_state;
4411#endif
2a5d9b1d 4412 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4413
4414 PERL_ARGS_ASSERT_RE_COMPILE;
4415
6d5c990f 4416 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4417
29b09c41 4418 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
7b597bb8 4419
d6bd454d 4420 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4421 /* Longjmp back to here if have to switch in midstream to utf8 */
4422 if (! RExC_orig_utf8) {
4423 JMPENV_PUSH(jump_ret);
29b09c41 4424 used_setjump = TRUE;
bbd61b5f
KW
4425 }
4426
5d51ce98 4427 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4428 exp = SvPV(pattern, plen);
4429 xend = exp + plen;
4430 /* ignore the utf8ness if the pattern is 0 length */
4431 if (plen == 0) {
4432 RExC_utf8 = RExC_orig_utf8 = 0;
4433 }
4434
5d51ce98
KW
4435 DEBUG_COMPILE_r({
4436 SV *dsv= sv_newmortal();
4437 RE_PV_QUOTED_DECL(s, RExC_utf8,
4438 dsv, exp, plen, 60);
4439 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4440 PL_colors[4],PL_colors[5],s);
4441 });
4442 }
4443 else { /* longjumped back */
bbd61b5f
KW
4444 STRLEN len = plen;
4445
5d51ce98
KW
4446 /* If the cause for the longjmp was other than changing to utf8, pop
4447 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4448 if (jump_ret != UTF8_LONGJMP) {
4449 JMPENV_POP;
4450 JMPENV_JUMP(jump_ret);
4451 }
4452
595598ee
KW
4453 GET_RE_DEBUG_FLAGS;
4454
bbd61b5f
KW
4455 /* It's possible to write a regexp in ascii that represents Unicode
4456 codepoints outside of the byte range, such as via \x{100}. If we
4457 detect such a sequence we have to convert the entire pattern to utf8
4458 and then recompile, as our sizing calculation will have been based
4459 on 1 byte == 1 character, but we will need to use utf8 to encode
4460 at least some part of the pattern, and therefore must convert the whole
4461 thing.
4462 -- dmq */
4463 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4464 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4465 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4466 xend = exp + len;
4467 RExC_orig_utf8 = RExC_utf8 = 1;
4468 SAVEFREEPV(exp);
4469 }
4470
5d51ce98
KW
4471#ifdef TRIE_STUDY_OPT
4472 restudied = 0;
4473#endif
4474
29b09c41
KW
4475 /* Set to use unicode semantics if the pattern is in utf8 and has the
4476 * 'dual' charset specified, as it means unicode when utf8 */
4477 pm_flags = orig_pm_flags;
4478 if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4479 pm_flags |= RXf_PMf_UNICODE;
4480 }
4481
02daf0ab 4482 RExC_precomp = exp;
c737faaf 4483 RExC_flags = pm_flags;
830247a4 4484 RExC_sawback = 0;
bbce6d69 4485
830247a4
IZ
4486 RExC_seen = 0;
4487 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4488 RExC_seen_evals = 0;
4489 RExC_extralen = 0;
c277df42 4490
bbce6d69 4491 /* First pass: determine size, legality. */
830247a4 4492 RExC_parse = exp;
fac92740 4493 RExC_start = exp;
830247a4
IZ
4494 RExC_end = xend;
4495 RExC_naughty = 0;
4496 RExC_npar = 1;
e2e6a0f1 4497 RExC_nestroot = 0;
830247a4
IZ
4498 RExC_size = 0L;
4499 RExC_emit = &PL_regdummy;
4500 RExC_whilem_seen = 0;
40d049e4
YO
4501 RExC_open_parens = NULL;
4502 RExC_close_parens = NULL;
4503 RExC_opend = NULL;
81714fb9 4504 RExC_paren_names = NULL;
1f1031fe
YO
4505#ifdef DEBUGGING
4506 RExC_paren_name_list = NULL;
4507#endif
40d049e4
YO
4508 RExC_recurse = NULL;
4509 RExC_recurse_count = 0;
81714fb9 4510
85ddcde9
JH
4511#if 0 /* REGC() is (currently) a NOP at the first pass.
4512 * Clever compilers notice this and complain. --jhi */
830247a4 4513 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4514#endif
3dab1dad
YO
4515 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4516 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4517 RExC_precomp = NULL;
a0d0e21e
LW
4518 return(NULL);
4519 }
bbd61b5f 4520
29b09c41
KW
4521 /* Here, finished first pass. Get rid of any added setjmp */
4522 if (used_setjump) {
bbd61b5f 4523 JMPENV_POP;
02daf0ab 4524 }
07be1b83 4525 DEBUG_PARSE_r({
81714fb9
YO
4526 PerlIO_printf(Perl_debug_log,
4527 "Required size %"IVdf" nodes\n"
4528 "Starting second pass (creation)\n",
4529 (IV)RExC_size);
07be1b83
YO
4530 RExC_lastnum=0;
4531 RExC_lastparse=NULL;
4532 });
c277df42
IZ
4533 /* Small enough for pointer-storage convention?
4534 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4535 if (RExC_size >= 0x10000L && RExC_extralen)
4536 RExC_size += RExC_extralen;
c277df42 4537 else
830247a4
IZ
4538 RExC_extralen = 0;
4539 if (RExC_whilem_seen > 15)
4540 RExC_whilem_seen = 15;
a0d0e21e 4541
f9f4320a
YO
4542 /* Allocate space and zero-initialize. Note, the two step process
4543 of zeroing when in debug mode, thus anything assigned has to
4544 happen after that */
d2f13c59 4545 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4546 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4547 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4548 char, regexp_internal);
4549 if ( r == NULL || ri == NULL )
b45f050a 4550 FAIL("Regexp out of space");
0f79a09d
GS
4551#ifdef DEBUGGING
4552 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4553 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4554#else
f8fc2ecf
YO
4555 /* bulk initialize base fields with 0. */
4556 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4557#endif
58e23c8d
YO
4558
4559 /* non-zero initialization begins here */
f8fc2ecf 4560 RXi_SET( r, ri );
f9f4320a 4561 r->engine= RE_ENGINE_PTR;
c737faaf 4562 r->extflags = pm_flags;
bcdf7404 4563 {
f7819f85 4564 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4565 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4566
4567 /* The caret is output if there are any defaults: if not all the STD
4568 * flags are set, or if no character set specifier is needed */
4569 bool has_default =
4570 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4571 || ! has_charset);
bcdf7404 4572 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4573 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4574 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4575 const char *fptr = STD_PAT_MODS; /*"msix"*/
4576 char *p;
fb85c044 4577 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4578 * on. If more precision is desired, we could do a population count of
4579 * the flags set. This could be done with a small lookup table, or by
4580 * shifting, masking and adding, or even, when available, assembly
4581 * language for a machine-language population count.
4582 * We never output a minus, as all those are defaults, so are
4583 * covered by the caret */
fb85c044 4584 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4585 + has_default /* If needs a caret */
4586 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4587 + (sizeof(STD_PAT_MODS) - 1)
4588 + (sizeof("(?:)") - 1);
4589
c5ea2ffa 4590 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4591 SvPOK_on(rx);
8f6ae13c 4592 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4593 *p++='('; *p++='?';
9de15fec
KW
4594
4595 /* If a default, cover it using the caret */
c5ea2ffa 4596 if (has_default) {
85508812 4597 *p++= DEFAULT_PAT_MOD;
fb85c044 4598 }
c5ea2ffa
KW
4599 if (has_charset) {
4600 if (r->extflags & RXf_PMf_LOCALE) {
4601 *p++ = LOCALE_PAT_MOD;
4602 } else {
4603 *p++ = UNICODE_PAT_MOD;
4604 }
9de15fec 4605 }
f7819f85
A
4606 if (has_p)
4607 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4608 {
bcdf7404 4609 char ch;
bcdf7404
YO
4610 while((ch = *fptr++)) {
4611 if(reganch & 1)
4612 *p++ = ch;
bcdf7404
YO
4613 reganch >>= 1;
4614 }
bcdf7404
YO
4615 }
4616
28d8d7f4 4617 *p++ = ':';
bb661a58 4618 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4619 assert ((RX_WRAPPED(rx) - p) < 16);
4620 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4621 p += plen;
bcdf7404 4622 if (has_runon)
28d8d7f4
YO
4623 *p++ = '\n';
4624 *p++ = ')';
4625 *p = 0;
fb85c044 4626 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4627 }
4628
bbe252da 4629 r->intflags = 0;
830247a4 4630 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4631
6bda09f9 4632 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4633 Newxz(RExC_open_parens, RExC_npar,regnode *);
4634 SAVEFREEPV(RExC_open_parens);
4635 Newxz(RExC_close_parens,RExC_npar,regnode *);
4636 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4637 }
4638
4639 /* Useful during FAIL. */
7122b237
YO
4640#ifdef RE_TRACK_PATTERN_OFFSETS
4641 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4642 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4643 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4644 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4645 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4646#endif
4647 SetProgLen(ri,RExC_size);
288b8c02 4648 RExC_rx_sv = rx;
830247a4 4649 RExC_rx = r;
f8fc2ecf 4650 RExC_rxi = ri;
bbce6d69 4651
4652 /* Second pass: emit code. */
c737faaf 4653 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4654 RExC_parse = exp;
4655 RExC_end = xend;
4656 RExC_naughty = 0;
4657 RExC_npar = 1;
f8fc2ecf
YO
4658 RExC_emit_start = ri->program;
4659 RExC_emit = ri->program;
3b57cd43
YO
4660 RExC_emit_bound = ri->program + RExC_size + 1;
4661
2cd61cdb 4662 /* Store the count of eval-groups for security checks: */
f8149455 4663 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4664 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4665 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4666 ReREFCNT_dec(rx);
a0d0e21e 4667 return(NULL);
80757612 4668 }
07be1b83
YO
4669 /* XXXX To minimize changes to RE engine we always allocate
4670 3-units-long substrs field. */
4671 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4672 if (RExC_recurse_count) {
4673 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4674 SAVEFREEPV(RExC_recurse);
4675 }
a0d0e21e 4676
07be1b83 4677reStudy:
e7f38d0f 4678 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4679 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4680
07be1b83 4681#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4682 if (!restudied) {
4683 StructCopy(&zero_scan_data, &data, scan_data_t);
4684 copyRExC_state = RExC_state;
4685 } else {
5d458dd8 4686 U32 seen=RExC_seen;
07be1b83 4687 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4688
4689 RExC_state = copyRExC_state;
4690 if (seen & REG_TOP_LEVEL_BRANCHES)
4691 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4692 else
4693 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4694 if (data.last_found) {
07be1b83 4695 SvREFCNT_dec(data.longest_fixed);
07be1b83 4696 SvREFCNT_dec(data.longest_float);
07be1b83 4697 SvREFCNT_dec(data.last_found);
1de06328 4698 }
40d049e4 4699 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4700 }
40d049e4
YO
4701#else
4702 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4703#endif
fc8cd66c 4704
a0d0e21e 4705 /* Dig out information for optimizations. */
f7819f85 4706 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4707 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4708
a0ed51b3 4709 if (UTF)
8f6ae13c 4710 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4711 ri->regstclass = NULL;
830247a4 4712 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4713 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4714 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4715
1de06328
YO
4716 /* testing for BRANCH here tells us whether there is "must appear"
4717 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4718 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4719 I32 fake;
c5254dd6 4720 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4721 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4722 int stclass_flag;
07be1b83 4723 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4724 regnode *first= scan;
4725 regnode *first_next= regnext(first);
639081d6
YO
4726 /*
4727 * Skip introductions and multiplicators >= 1
4728 * so that we can extract the 'meat' of the pattern that must
4729 * match in the large if() sequence following.
4730 * NOTE that EXACT is NOT covered here, as it is normally
4731 * picked up by the optimiser separately.
4732 *
4733 * This is unfortunate as the optimiser isnt handling lookahead
4734 * properly currently.
4735 *
4736 */
a0d0e21e 4737 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4738 /* An OR of *one* alternative - should not happen now. */
5339e136 4739 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4740 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4741 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4742 (OP(first) == PLUS) ||
4743 (OP(first) == MINMOD) ||
653099ff 4744 /* An {n,m} with n>0 */
5339e136
YO
4745 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4746 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4747 {
639081d6
YO
4748 /*
4749 * the only op that could be a regnode is PLUS, all the rest
4750 * will be regnode_1 or regnode_2.
4751 *
4752 */
a0d0e21e
LW
4753 if (OP(first) == PLUS)
4754 sawplus = 1;
4755 else
3dab1dad 4756 first += regarglen[OP(first)];
639081d6
YO
4757
4758 first = NEXTOPER(first);
5339e136 4759 first_next= regnext(first);
a687059c
LW
4760 }
4761
a0d0e21e
LW
4762 /* Starting-point info. */
4763 again:
786e8c11 4764 DEBUG_PEEP("first:",first,0);
07be1b83 4765 /* Ignore EXACT as we deal with it later. */
3dab1dad 4766 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4767 if (OP(first) == EXACT)
6f207bd3 4768 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4769 else
f8fc2ecf 4770 ri->regstclass = first;
b3c9acc1 4771 }
07be1b83 4772#ifdef TRIE_STCLASS
786e8c11 4773 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4774 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4775 {
786e8c11 4776 regnode *trie_op;
07be1b83 4777 /* this can happen only on restudy */
786e8c11 4778 if ( OP(first) == TRIE ) {
c944940b 4779 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4780 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4781 StructCopy(first,trieop,struct regnode_1);
4782 trie_op=(regnode *)trieop;
4783 } else {
c944940b 4784 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4785 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4786 StructCopy(first,trieop,struct regnode_charclass);
4787 trie_op=(regnode *)trieop;
4788 }
1de06328 4789 OP(trie_op)+=2;
786e8c11 4790 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4791 ri->regstclass = trie_op;
07be1b83
YO
4792 }
4793#endif
e52fc539 4794 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4795 ri->regstclass = first;
3dab1dad
YO
4796 else if (PL_regkind[OP(first)] == BOUND ||
4797 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4798 ri->regstclass = first;
3dab1dad 4799 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4800 r->extflags |= (OP(first) == MBOL
4801 ? RXf_ANCH_MBOL
cad2e5aa 4802 : (OP(first) == SBOL
bbe252da
YO
4803 ? RXf_ANCH_SBOL
4804 : RXf_ANCH_BOL));
a0d0e21e 4805 first = NEXTOPER(first);
774d564b 4806 goto again;
4807 }
4808 else if (OP(first) == GPOS) {
bbe252da 4809 r->extflags |= RXf_ANCH_GPOS;
774d564b 4810 first = NEXTOPER(first);
4811 goto again;
a0d0e21e 4812 }
cf2a2b69
YO
4813 else if ((!sawopen || !RExC_sawback) &&
4814 (OP(first) == STAR &&
3dab1dad 4815 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4816 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4817 {
4818 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4819 const int type =
4820 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4821 ? RXf_ANCH_MBOL
4822 : RXf_ANCH_SBOL;
4823 r->extflags |= type;
4824 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4825 first = NEXTOPER(first);
774d564b 4826 goto again;
a0d0e21e 4827 }
e7f38d0f 4828 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4829 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4830 /* x+ must match at the 1st pos of run of x's */
bbe252da 4831 r->intflags |= PREGf_SKIP;
a0d0e21e 4832
c277df42 4833 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4834#ifdef TRIE_STUDY_OPT
81714fb9 4835 DEBUG_PARSE_r(
be8e71aa
YO
4836 if (!restudied)
4837 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4838 (IV)(first - scan + 1))
4839 );
4840#else
81714fb9 4841 DEBUG_PARSE_r(
be8e71aa
YO
4842 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4843 (IV)(first - scan + 1))
4844 );
4845#endif
4846
4847
a0d0e21e
LW
4848 /*
4849 * If there's something expensive in the r.e., find the
4850 * longest literal string that must appear and make it the
4851 * regmust. Resolve ties in favor of later strings, since
4852 * the regstart check works with the beginning of the r.e.
4853 * and avoiding duplication strengthens checking. Not a
4854 * strong reason, but sufficient in the absence of others.
4855 * [Now we resolve ties in favor of the earlier string if
c277df42 4856 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4857 * earlier string may buy us something the later one won't.]
4858 */
de8c5301 4859
396482e1
GA
4860 data.longest_fixed = newSVpvs("");
4861 data.longest_float = newSVpvs("");
4862 data.last_found = newSVpvs("");
c277df42
IZ
4863 data.longest = &(data.longest_fixed);
4864 first = scan;
f8fc2ecf 4865 if (!ri->regstclass) {
830247a4 4866 cl_init(pRExC_state, &ch_class);
653099ff
GS
4867 data.start_class = &ch_class;
4868 stclass_flag = SCF_DO_STCLASS_AND;
4869 } else /* XXXX Check for BOUND? */
4870 stclass_flag = 0;
cb434fcc 4871 data.last_closep = &last_close;
de8c5301 4872
1de06328 4873 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4874 &data, -1, NULL, NULL,
4875 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4876
07be1b83 4877
786e8c11
YO
4878 CHECK_RESTUDY_GOTO;
4879
4880
830247a4 4881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4882 && data.last_start_min == 0 && data.last_end > 0
830247a4 4883 && !RExC_seen_zerolen
2bf803e2 4884 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4885 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4886 r->extflags |= RXf_CHECK_ALL;
304ee84b 4887 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4888 SvREFCNT_dec(data.last_found);
4889
1de06328
YO
4890 /* Note that code very similar to this but for anchored string
4891 follows immediately below, changes may need to be made to both.
4892 Be careful.
4893 */
a0ed51b3 4894 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4895 if (longest_float_length
c277df42
IZ
4896 || (data.flags & SF_FL_BEFORE_EOL
4897 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4898 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4899 {
1182767e 4900 I32 t,ml;
cf93c79d 4901
1de06328 4902 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4903 && data.offset_fixed == data.offset_float_min
4904 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4905 goto remove_float; /* As in (a)+. */
4906
1de06328
YO
4907 /* copy the information about the longest float from the reg_scan_data
4908 over to the program. */
33b8afdf
JH
4909 if (SvUTF8(data.longest_float)) {
4910 r->float_utf8 = data.longest_float;
c445ea15 4911 r->float_substr = NULL;
33b8afdf
JH
4912 } else {
4913 r->float_substr = data.longest_float;
c445ea15 4914 r->float_utf8 = NULL;
33b8afdf 4915 }
1de06328
YO
4916 /* float_end_shift is how many chars that must be matched that
4917 follow this item. We calculate it ahead of time as once the
4918 lookbehind offset is added in we lose the ability to correctly
4919 calculate it.*/
4920 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4921 : (I32)longest_float_length;
1de06328
YO
4922 r->float_end_shift = ml - data.offset_float_min
4923 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4924 + data.lookbehind_float;
4925 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4926 r->float_max_offset = data.offset_float_max;
1182767e 4927 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4928 r->float_max_offset -= data.lookbehind_float;
4929
cf93c79d
IZ
4930 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4931 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4932 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4933 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4934 }
4935 else {
aca2d497 4936 remove_float:
c445ea15 4937 r->float_substr = r->float_utf8 = NULL;
c277df42 4938 SvREFCNT_dec(data.longest_float);
c5254dd6 4939 longest_float_length = 0;
a0d0e21e 4940 }
c277df42 4941
1de06328
YO
4942 /* Note that code very similar to this but for floating string
4943 is immediately above, changes may need to be made to both.
4944 Be careful.
4945 */
a0ed51b3 4946 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4947 if (longest_fixed_length
c277df42
IZ
4948 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4949 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4950 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4951 {
1182767e 4952 I32 t,ml;
cf93c79d 4953
1de06328
YO
4954 /* copy the information about the longest fixed
4955 from the reg_scan_data over to the program. */
33b8afdf
JH
4956 if (SvUTF8(data.longest_fixed)) {
4957 r->anchored_utf8 = data.longest_fixed;
c445ea15 4958 r->anchored_substr = NULL;
33b8afdf
JH
4959 } else {
4960 r->anchored_substr = data.longest_fixed;
c445ea15 4961 r->anchored_utf8 = NULL;
33b8afdf 4962 }
1de06328
YO
4963 /* fixed_end_shift is how many chars that must be matched that
4964 follow this item. We calculate it ahead of time as once the
4965 lookbehind offset is added in we lose the ability to correctly
4966 calculate it.*/
4967 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4968 : (I32)longest_fixed_length;
1de06328
YO
4969 r->anchored_end_shift = ml - data.offset_fixed
4970 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4971 + data.lookbehind_fixed;
4972 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4973
cf93c79d
IZ
4974 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4975 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4976 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4977 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4978 }
4979 else {
c445ea15 4980 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4981 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4982 longest_fixed_length = 0;
a0d0e21e 4983 }
f8fc2ecf
YO
4984 if (ri->regstclass
4985 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4986 ri->regstclass = NULL;
33b8afdf
JH
4987 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4988 && stclass_flag
653099ff 4989 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4990 && !cl_is_anything(data.start_class))
4991 {
2eccd3b2 4992 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4993
f8fc2ecf 4994 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4995 struct regnode_charclass_class);
4996 StructCopy(data.start_class,
f8fc2ecf 4997 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4998 struct regnode_charclass_class);
f8fc2ecf 4999 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5000 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5001 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5002 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5003 PerlIO_printf(Perl_debug_log,
a0288114 5004 "synthetic stclass \"%s\".\n",
3f7c398e 5005 SvPVX_const(sv));});
653099ff 5006 }
c277df42
IZ
5007
5008 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5009 if (longest_fixed_length > longest_float_length) {
1de06328 5010 r->check_end_shift = r->anchored_end_shift;
c277df42 5011 r->check_substr = r->anchored_substr;
33b8afdf 5012 r->check_utf8 = r->anchored_utf8;
c277df42 5013 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5014 if (r->extflags & RXf_ANCH_SINGLE)
5015 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5016 }
5017 else {
1de06328 5018 r->check_end_shift = r->float_end_shift;
c277df42 5019 r->check_substr = r->float_substr;
33b8afdf 5020 r->check_utf8 = r->float_utf8;
1de06328
YO
5021 r->check_offset_min = r->float_min_offset;
5022 r->check_offset_max = r->float_max_offset;
a0d0e21e 5023 }
30382c73
IZ
5024 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5025 This should be changed ASAP! */
bbe252da
YO
5026 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5027 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5028 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5029 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5030 }
1de06328
YO
5031 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5032 if ( (STRLEN)minlen < longest_float_length )
5033 minlen= longest_float_length;
5034 if ( (STRLEN)minlen < longest_fixed_length )
5035 minlen= longest_fixed_length;
5036 */
a0ed51b3
LW
5037 }
5038 else {
c277df42
IZ
5039 /* Several toplevels. Best we can is to set minlen. */
5040 I32 fake;
653099ff 5041 struct regnode_charclass_class ch_class;
cb434fcc 5042 I32 last_close = 0;
c277df42 5043
5d458dd8 5044 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5045
f8fc2ecf 5046 scan = ri->program + 1;
830247a4 5047 cl_init(pRExC_state, &ch_class);
653099ff 5048 data.start_class = &ch_class;
cb434fcc 5049 data.last_closep = &last_close;
07be1b83 5050
de8c5301 5051
1de06328 5052 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5053 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5054
786e8c11 5055 CHECK_RESTUDY_GOTO;
07be1b83 5056
33b8afdf 5057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5058 = r->float_substr = r->float_utf8 = NULL;
653099ff 5059 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5060 && !cl_is_anything(data.start_class))
5061 {
2eccd3b2 5062 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5063
f8fc2ecf 5064 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5065 struct regnode_charclass_class);
5066 StructCopy(data.start_class,
f8fc2ecf 5067 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5068 struct regnode_charclass_class);
f8fc2ecf 5069 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5070 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5071 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5072 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5073 PerlIO_printf(Perl_debug_log,
a0288114 5074 "synthetic stclass \"%s\".\n",
3f7c398e 5075 SvPVX_const(sv));});
653099ff 5076 }
a0d0e21e
LW
5077 }
5078
1de06328
YO
5079 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5080 the "real" pattern. */
cf9788e3
RGS
5081 DEBUG_OPTIMISE_r({
5082 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5083 (IV)minlen, (IV)r->minlen);
cf9788e3 5084 });
de8c5301 5085 r->minlenret = minlen;
1de06328
YO
5086 if (r->minlen < minlen)
5087 r->minlen = minlen;
5088
b81d288d 5089 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5090 r->extflags |= RXf_GPOS_SEEN;
830247a4 5091 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5092 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5093 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5094 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5095 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5096 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5097 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5098 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5099 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5100 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5101 if (RExC_paren_names)
85fbaab2 5102 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5103 else
5daac39c 5104 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5105
7bd1e614 5106#ifdef STUPID_PATTERN_CHECKS
5509d87a 5107 if (RX_PRELEN(rx) == 0)
640f820d 5108 r->extflags |= RXf_NULL;
5509d87a 5109 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5110 /* XXX: this should happen BEFORE we compile */
5111 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5112 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5113 r->extflags |= RXf_WHITE;
5509d87a 5114 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5115 r->extflags |= RXf_START_ONLY;
f1b875a0 5116#else
5509d87a 5117 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5118 /* XXX: this should happen BEFORE we compile */
5119 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5120 else {
5121 regnode *first = ri->program + 1;
39aa8307
JH
5122 U8 fop = OP(first);
5123 U8 nop = OP(NEXTOPER(first));
7bd1e614 5124
640f820d
AB
5125 if (PL_regkind[fop] == NOTHING && nop == END)
5126 r->extflags |= RXf_NULL;
5127 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5128 r->extflags |= RXf_START_ONLY;
5129 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5130 r->extflags |= RXf_WHITE;
5131 }
f1b875a0 5132#endif
1f1031fe
YO
5133#ifdef DEBUGGING
5134 if (RExC_paren_names) {
af534a04 5135 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5136 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5137 } else
1f1031fe 5138#endif
cde0cee5 5139 ri->name_list_idx = 0;
1f1031fe 5140
40d049e4
YO
5141 if (RExC_recurse_count) {
5142 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5143 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5144 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5145 }
5146 }
f0ab9afb 5147 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5148 /* assume we don't need to swap parens around before we match */
5149
be8e71aa
YO
5150 DEBUG_DUMP_r({
5151 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5152 regdump(r);
5153 });
7122b237
YO
5154#ifdef RE_TRACK_PATTERN_OFFSETS
5155 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5156 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5157 U32 i;
5158 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5159 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5160 for (i = 1; i <= len; i++) {
7122b237 5161 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5162 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5163 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5164 }
5165 PerlIO_printf(Perl_debug_log, "\n");
5166 });
7122b237 5167#endif
288b8c02 5168 return rx;
a687059c
LW
5169}
5170
f9f4320a 5171#undef RE_ENGINE_PTR
3dab1dad 5172
93b32b6d 5173
81714fb9 5174SV*
192b9cd1
AB
5175Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5176 const U32 flags)
5177{
7918f24d
NC
5178 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5179
192b9cd1
AB
5180 PERL_UNUSED_ARG(value);
5181
f1b875a0 5182 if (flags & RXapif_FETCH) {
192b9cd1 5183 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5184 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5185 Perl_croak_no_modify(aTHX);
192b9cd1 5186 return NULL;
f1b875a0 5187 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5188 return reg_named_buff_exists(rx, key, flags)
5189 ? &PL_sv_yes
5190 : &PL_sv_no;
f1b875a0 5191 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5192 return reg_named_buff_all(rx, flags);
f1b875a0 5193 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5194 return reg_named_buff_scalar(rx, flags);
5195 } else {
5196 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5197 return NULL;
5198 }
5199}
5200
5201SV*
5202Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5203 const U32 flags)
5204{
7918f24d 5205 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5206 PERL_UNUSED_ARG(lastkey);
5207
f1b875a0 5208 if (flags & RXapif_FIRSTKEY)
192b9cd1 5209 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5210 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5211 return reg_named_buff_nextkey(rx, flags);
5212 else {
5213 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5214 return NULL;
5215 }
5216}
5217
5218SV*
288b8c02
NC
5219Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5220 const U32 flags)
81714fb9 5221{
44a2ac75
YO
5222 AV *retarray = NULL;
5223 SV *ret;
288b8c02 5224 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5225
5226 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5227
f1b875a0 5228 if (flags & RXapif_ALL)
44a2ac75 5229 retarray=newAV();
93b32b6d 5230
5daac39c
NC
5231 if (rx && RXp_PAREN_NAMES(rx)) {
5232 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5233 if (he_str) {
5234 IV i;
5235 SV* sv_dat=HeVAL(he_str);
5236 I32 *nums=(I32*)SvPVX(sv_dat);
5237 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5238 if ((I32)(rx->nparens) >= nums[i]
5239 && rx->offs[nums[i]].start != -1
5240 && rx->offs[nums[i]].end != -1)
93b32b6d 5241 {
49d7dfbc 5242 ret = newSVpvs("");
288b8c02 5243 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5244 if (!retarray)
5245 return ret;
5246 } else {
5247 ret = newSVsv(&PL_sv_undef);
5248 }
ec83ea38 5249 if (retarray)
93b32b6d 5250 av_push(retarray, ret);
81714fb9 5251 }
93b32b6d 5252 if (retarray)
ad64d0ec 5253 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5254 }
5255 }
5256 return NULL;
5257}
5258
5259bool
288b8c02 5260Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5261 const U32 flags)
5262{
288b8c02 5263 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5264
5265 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5266
5daac39c 5267 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5268 if (flags & RXapif_ALL) {
5daac39c 5269 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5270 } else {
288b8c02 5271 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5272 if (sv) {
5273 SvREFCNT_dec(sv);
192b9cd1
AB
5274 return TRUE;
5275 } else {
5276 return FALSE;
5277 }
5278 }
5279 } else {
5280 return FALSE;
5281 }
5282}
5283
5284SV*
288b8c02 5285Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5286{
288b8c02 5287 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5288
5289 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5290
5daac39c
NC
5291 if ( rx && RXp_PAREN_NAMES(rx) ) {
5292 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5293
288b8c02 5294 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5295 } else {
5296 return FALSE;
5297 }
192b9cd1
AB
5298}
5299
5300SV*
288b8c02 5301Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5302{
288b8c02 5303 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5304 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5305
5306 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5307
5daac39c
NC
5308 if (rx && RXp_PAREN_NAMES(rx)) {
5309 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5310 HE *temphe;
5311 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5312 IV i;
5313 IV parno = 0;
5314 SV* sv_dat = HeVAL(temphe);
5315 I32 *nums = (I32*)SvPVX(sv_dat);
5316 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5317 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5318 rx->offs[nums[i]].start != -1 &&
5319 rx->offs[nums[i]].end != -1)
5320 {
5321 parno = nums[i];
5322 break;
5323 }
5324 }
f1b875a0 5325 if (parno || flags & RXapif_ALL) {
a663657d 5326 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5327 }
81714fb9
YO
5328 }
5329 }
44a2ac75
YO
5330 return NULL;
5331}
5332
192b9cd1 5333SV*
288b8c02 5334Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5335{
5336 SV *ret;
5337 AV *av;
5338 I32 length;
288b8c02 5339 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5340
7918f24d
NC
5341 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5342
5daac39c 5343 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5344 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5345 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5346 } else if (flags & RXapif_ONE) {
288b8c02 5347 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5348 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5349 length = av_len(av);
ec83ea38 5350 SvREFCNT_dec(ret);
192b9cd1
AB
5351 return newSViv(length + 1);
5352 } else {
5353 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5354 return NULL;
5355 }
5356 }
5357 return &PL_sv_undef;
5358}
5359
5360SV*
288b8c02 5361Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5362{
288b8c02 5363 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5364 AV *av = newAV();
5365
7918f24d
NC
5366 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5367
5daac39c
NC
5368 if (rx && RXp_PAREN_NAMES(rx)) {
5369 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5370 HE *temphe;
5371 (void)hv_iterinit(hv);
5372 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5373 IV i;
5374 IV parno = 0;
5375 SV* sv_dat = HeVAL(temphe);
5376 I32 *nums = (I32*)SvPVX(sv_dat);
5377 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5378 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5379 rx->offs[nums[i]].start != -1 &&
5380 rx->offs[nums[i]].end != -1)
5381 {
5382 parno = nums[i];
5383 break;
5384 }
5385 }
f1b875a0 5386 if (parno || flags & RXapif_ALL) {
a663657d 5387 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5388 }
5389 }
5390 }
5391
ad64d0ec 5392 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5393}
5394
49d7dfbc 5395void
288b8c02
NC
5396Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5397 SV * const sv)
44a2ac75 5398{
288b8c02 5399 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5400 char *s = NULL;
a9d504c3 5401 I32 i = 0;
44a2ac75 5402 I32 s1, t1;
7918f24d
NC
5403
5404 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5405
cde0cee5
YO
5406 if (!rx->subbeg) {
5407 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5408 return;
cde0cee5
YO
5409 }
5410 else
f1b875a0 5411 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5412 /* $` */
f0ab9afb 5413 i = rx->offs[0].start;
cde0cee5 5414 s = rx->subbeg;
44a2ac75
YO
5415 }
5416 else
f1b875a0 5417 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5418 /* $' */
f0ab9afb
NC
5419 s = rx->subbeg + rx->offs[0].end;
5420 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5421 }
5422 else
5423 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5424 (s1 = rx->offs[paren].start) != -1 &&
5425 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5426 {
5427 /* $& $1 ... */
5428 i = t1 - s1;
5429 s = rx->subbeg + s1;
cde0cee5
YO
5430 } else {
5431 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5432 return;
cde0cee5
YO
5433 }
5434 assert(rx->sublen >= (s - rx->subbeg) + i );
5435 if (i >= 0) {
5436 const int oldtainted = PL_tainted;
5437 TAINT_NOT;
5438 sv_setpvn(sv, s, i);
5439 PL_tainted = oldtainted;
5440 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5441 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5442 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5443 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5444 {
5445 SvUTF8_on(sv);
5446 }
5447 else
5448 SvUTF8_off(sv);
5449 if (PL_tainting) {
07bc277f 5450 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5451 if (SvTYPE(sv) >= SVt_PVMG) {
5452 MAGIC* const mg = SvMAGIC(sv);
5453 MAGIC* mgt;
5454 PL_tainted = 1;
5455 SvMAGIC_set(sv, mg->mg_moremagic);
5456 SvTAINT(sv);
5457 if ((mgt = SvMAGIC(sv))) {
5458 mg->mg_moremagic = mgt;
5459 SvMAGIC_set(sv, mg);
44a2ac75 5460 }
cde0cee5
YO
5461 } else {
5462 PL_tainted = 1;
5463 SvTAINT(sv);
5464 }
5465 } else
5466 SvTAINTED_off(sv);
44a2ac75 5467 }
81714fb9 5468 } else {
44a2ac75 5469 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5470 return;
81714fb9
YO
5471 }
5472}
93b32b6d 5473
2fdbfb4d
AB
5474void
5475Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5476 SV const * const value)
5477{
7918f24d
NC
5478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5479
2fdbfb4d
AB
5480 PERL_UNUSED_ARG(rx);
5481 PERL_UNUSED_ARG(paren);
5482 PERL_UNUSED_ARG(value);
5483
5484 if (!PL_localizing)
6ad8f254 5485 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5486}
5487
5488I32
288b8c02 5489Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5490 const I32 paren)
5491{
288b8c02 5492 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5493 I32 i;
5494 I32 s1, t1;
5495
7918f24d
NC
5496 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5497
2fdbfb4d
AB
5498 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5499 switch (paren) {
192b9cd1 5500 /* $` / ${^PREMATCH} */
f1b875a0 5501 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5502 if (rx->offs[0].start != -1) {
5503 i = rx->offs[0].start;
5504 if (i > 0) {
5505 s1 = 0;
5506 t1 = i;
5507 goto getlen;
5508 }
5509 }
5510 return 0;
192b9cd1 5511 /* $' / ${^POSTMATCH} */
f1b875a0 5512 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5513 if (rx->offs[0].end != -1) {
5514 i = rx->sublen - rx->offs[0].end;
5515 if (i > 0) {
5516 s1 = rx->offs[0].end;
5517 t1 = rx->sublen;
5518 goto getlen;
5519 }
5520 }
5521 return 0;
192b9cd1
AB
5522 /* $& / ${^MATCH}, $1, $2, ... */
5523 default:
2fdbfb4d
AB
5524 if (paren <= (I32)rx->nparens &&
5525 (s1 = rx->offs[paren].start) != -1 &&
5526 (t1 = rx->offs[paren].end) != -1)
5527 {
5528 i = t1 - s1;
5529 goto getlen;
5530 } else {
5531 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5532 report_uninit((const SV *)sv);
2fdbfb4d
AB
5533 return 0;
5534 }
5535 }
5536 getlen:
07bc277f 5537 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5538 const char * const s = rx->subbeg + s1;
5539 const U8 *ep;
5540 STRLEN el;
5541
5542 i = t1 - s1;
5543 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5544 i = el;
5545 }
5546 return i;
5547}
5548
fe578d7f 5549SV*
49d7dfbc 5550Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5551{
7918f24d 5552 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5553 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5554 if (0)
5555 return NULL;
5556 else
5557 return newSVpvs("Regexp");
fe578d7f 5558}
0a4db386 5559
894be9b7 5560/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5561 * If flags is REG_RSN_RETURN_NULL returns null.
5562 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5563 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5564 * to the parsed name as looked up in the RExC_paren_names hash.
5565 * If there is an error throws a vFAIL().. type exception.
894be9b7 5566 */
0a4db386
YO
5567
5568#define REG_RSN_RETURN_NULL 0
5569#define REG_RSN_RETURN_NAME 1
5570#define REG_RSN_RETURN_DATA 2
5571
894be9b7 5572STATIC SV*
7918f24d
NC
5573S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5574{
894be9b7 5575 char *name_start = RExC_parse;
1f1031fe 5576
7918f24d
NC
5577 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5578
1f1031fe
YO
5579 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5580 /* skip IDFIRST by using do...while */
5581 if (UTF)
5582 do {
5583 RExC_parse += UTF8SKIP(RExC_parse);
5584 } while (isALNUM_utf8((U8*)RExC_parse));
5585 else
5586 do {
5587 RExC_parse++;
5588 } while (isALNUM(*RExC_parse));
894be9b7 5589 }
1f1031fe 5590
0a4db386 5591 if ( flags ) {
59cd0e26
NC
5592 SV* sv_name
5593 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5594 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5595 if ( flags == REG_RSN_RETURN_NAME)
5596 return sv_name;
5597 else if (flags==REG_RSN_RETURN_DATA) {
5598 HE *he_str = NULL;
5599 SV *sv_dat = NULL;
5600 if ( ! sv_name ) /* should not happen*/
5601 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5602 if (RExC_paren_names)
5603 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5604 if ( he_str )
5605 sv_dat = HeVAL(he_str);
5606 if ( ! sv_dat )
5607 vFAIL("Reference to nonexistent named group");
5608 return sv_dat;
5609 }
5610 else {
5611 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5612 }
5613 /* NOT REACHED */
894be9b7 5614 }
0a4db386 5615 return NULL;
894be9b7
YO
5616}
5617
3dab1dad
YO
5618#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5619 int rem=(int)(RExC_end - RExC_parse); \
5620 int cut; \
5621 int num; \
5622 int iscut=0; \
5623 if (rem>10) { \
5624 rem=10; \
5625 iscut=1; \
5626 } \
5627 cut=10-rem; \
5628 if (RExC_lastparse!=RExC_parse) \
5629 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5630 rem, RExC_parse, \
5631 cut + 4, \
5632 iscut ? "..." : "<" \
5633 ); \
5634 else \
5635 PerlIO_printf(Perl_debug_log,"%16s",""); \
5636 \
5637 if (SIZE_ONLY) \
3b57cd43 5638 num = RExC_size + 1; \
3dab1dad
YO
5639 else \
5640 num=REG_NODE_NUM(RExC_emit); \
5641 if (RExC_lastnum!=num) \
0a4db386 5642 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5643 else \
0a4db386 5644 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5645 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5646 (int)((depth*2)), "", \
3dab1dad
YO
5647 (funcname) \
5648 ); \
5649 RExC_lastnum=num; \
5650 RExC_lastparse=RExC_parse; \
5651})
5652
07be1b83
YO
5653
5654
3dab1dad
YO
5655#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5656 DEBUG_PARSE_MSG((funcname)); \
5657 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5658})
6bda09f9
YO
5659#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5660 DEBUG_PARSE_MSG((funcname)); \
5661 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5662})
a687059c
LW
5663/*
5664 - reg - regular expression, i.e. main body or parenthesized thing
5665 *
5666 * Caller must absorb opening parenthesis.
5667 *
5668 * Combining parenthesis handling with the base level of regular expression
5669 * is a trifle forced, but the need to tie the tails of the branches to what
5670 * follows makes it hard to avoid.
5671 */
07be1b83
YO
5672#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5673#ifdef DEBUGGING
5674#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5675#else
5676#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5677#endif
3dab1dad 5678
76e3520e 5679STATIC regnode *
3dab1dad 5680S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5681 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5682{
27da23d5 5683 dVAR;
c277df42
IZ
5684 register regnode *ret; /* Will be the head of the group. */
5685 register regnode *br;
5686 register regnode *lastbr;
cbbf8932 5687 register regnode *ender = NULL;
a0d0e21e 5688 register I32 parno = 0;
cbbf8932 5689 I32 flags;
f7819f85 5690 U32 oregflags = RExC_flags;
6136c704
AL
5691 bool have_branch = 0;
5692 bool is_open = 0;
594d7033
YO
5693 I32 freeze_paren = 0;
5694 I32 after_freeze = 0;
9d1d55b5
JP
5695
5696 /* for (?g), (?gc), and (?o) warnings; warning
5697 about (?c) will warn about (?g) -- japhy */
5698
6136c704
AL
5699#define WASTED_O 0x01
5700#define WASTED_G 0x02
5701#define WASTED_C 0x04
5702#define WASTED_GC (0x02|0x04)
cbbf8932 5703 I32 wastedflags = 0x00;
9d1d55b5 5704
fac92740 5705 char * parse_start = RExC_parse; /* MJD */
a28509cc 5706 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5707
3dab1dad 5708 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5709
5710 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5711 DEBUG_PARSE("reg ");
5712
821b33a5 5713 *flagp = 0; /* Tentatively. */
a0d0e21e 5714
9d1d55b5 5715
a0d0e21e
LW
5716 /* Make an OPEN node, if parenthesized. */
5717 if (paren) {
e2e6a0f1
YO
5718 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5719 char *start_verb = RExC_parse;
5720 STRLEN verb_len = 0;
5721 char *start_arg = NULL;
5722 unsigned char op = 0;
5723 int argok = 1;
5724 int internal_argval = 0; /* internal_argval is only useful if !argok */
5725 while ( *RExC_parse && *RExC_parse != ')' ) {
5726 if ( *RExC_parse == ':' ) {
5727 start_arg = RExC_parse + 1;
5728 break;
5729 }
5730 RExC_parse++;
5731 }
5732 ++start_verb;
5733 verb_len = RExC_parse - start_verb;
5734 if ( start_arg ) {
5735 RExC_parse++;
5736 while ( *RExC_parse && *RExC_parse != ')' )
5737 RExC_parse++;
5738 if ( *RExC_parse != ')' )
5739 vFAIL("Unterminated verb pattern argument");
5740 if ( RExC_parse == start_arg )
5741 start_arg = NULL;
5742 } else {
5743 if ( *RExC_parse != ')' )
5744 vFAIL("Unterminated verb pattern");
5745 }
5d458dd8 5746
e2e6a0f1
YO
5747 switch ( *start_verb ) {
5748 case 'A': /* (*ACCEPT) */
568a785a 5749 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5750 op = ACCEPT;
5751 internal_argval = RExC_nestroot;
5752 }
5753 break;
5754 case 'C': /* (*COMMIT) */
568a785a 5755 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5756 op = COMMIT;
e2e6a0f1
YO
5757 break;
5758 case 'F': /* (*FAIL) */
568a785a 5759 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5760 op = OPFAIL;
5761 argok = 0;
5762 }
5763 break;
5d458dd8
YO
5764 case ':': /* (*:NAME) */
5765 case 'M': /* (*MARK:NAME) */
568a785a 5766 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5767 op = MARKPOINT;
5d458dd8
YO
5768 argok = -1;
5769 }
5770 break;
5771 case 'P': /* (*PRUNE) */
568a785a 5772 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5773 op = PRUNE;
e2e6a0f1 5774 break;
5d458dd8 5775 case 'S': /* (*SKIP) */
568a785a 5776 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5777 op = SKIP;
5778 break;
5779 case 'T': /* (*THEN) */
5780 /* [19:06] <TimToady> :: is then */
568a785a 5781 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5782 op = CUTGROUP;
5783 RExC_seen |= REG_SEEN_CUTGROUP;
5784 }
e2e6a0f1
YO
5785 break;
5786 }
5787 if ( ! op ) {
5788 RExC_parse++;
5789 vFAIL3("Unknown verb pattern '%.*s'",
5790 verb_len, start_verb);
5791 }
5792 if ( argok ) {
5793 if ( start_arg && internal_argval ) {
5794 vFAIL3("Verb pattern '%.*s' may not have an argument",
5795 verb_len, start_verb);
5796 } else if ( argok < 0 && !start_arg ) {
5797 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5798 verb_len, start_verb);
5799 } else {
5800 ret = reganode(pRExC_state, op, internal_argval);
5801 if ( ! internal_argval && ! SIZE_ONLY ) {
5802 if (start_arg) {
5803 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5804 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5805 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5806 ret->flags = 0;
5807 } else {
5808 ret->flags = 1;
5809 }
5810 }
5811 }
5812 if (!internal_argval)
5813 RExC_seen |= REG_SEEN_VERBARG;
5814 } else if ( start_arg ) {
5815 vFAIL3("Verb pattern '%.*s' may not have an argument",
5816 verb_len, start_verb);
5817 } else {
5818 ret = reg_node(pRExC_state, op);
5819 }
5820 nextchar(pRExC_state);
5821 return ret;
5822 } else
fac92740 5823 if (*RExC_parse == '?') { /* (?...) */
6136c704 5824 bool is_logical = 0;
a28509cc 5825 const char * const seqstart = RExC_parse;
fb85c044 5826 bool has_use_defaults = FALSE;
ca9dfc88 5827
830247a4
IZ
5828 RExC_parse++;
5829 paren = *RExC_parse++;
c277df42 5830 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5831 switch (paren) {
894be9b7 5832
1f1031fe
YO
5833 case 'P': /* (?P...) variants for those used to PCRE/Python */
5834 paren = *RExC_parse++;
5835 if ( paren == '<') /* (?P<...>) named capture */
5836 goto named_capture;
5837 else if (paren == '>') { /* (?P>name) named recursion */
5838 goto named_recursion;
5839 }
5840 else if (paren == '=') { /* (?P=...) named backref */
5841 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5842 you change this make sure you change that */
5843 char* name_start = RExC_parse;
5844 U32 num = 0;
5845 SV *sv_dat = reg_scan_name(pRExC_state,
5846 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847 if (RExC_parse == name_start || *RExC_parse != ')')
5848 vFAIL2("Sequence %.3s... not terminated",parse_start);
5849
5850 if (!SIZE_ONLY) {
5851 num = add_data( pRExC_state, 1, "S" );
5852 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5853 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5854 }
5855 RExC_sawback = 1;
4444fd9f
KW
5856 ret = reganode(pRExC_state,
5857 ((! FOLD)
5858 ? NREF
5859 : (UNI_SEMANTICS)
5860 ? NREFFU
5861 : (LOC)
5862 ? NREFFL
5863 : NREFF),
5864 num);
1f1031fe
YO
5865 *flagp |= HASWIDTH;
5866
5867 Set_Node_Offset(ret, parse_start+1);
5868 Set_Node_Cur_Length(ret); /* MJD */
5869
5870 nextchar(pRExC_state);
5871 return ret;
5872 }
57b84237
YO
5873 RExC_parse++;
5874 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5875 /*NOTREACHED*/
5876 case '<': /* (?<...) */
b81d288d 5877 if (*RExC_parse == '!')
c277df42 5878 paren = ',';
0a4db386 5879 else if (*RExC_parse != '=')
1f1031fe 5880 named_capture:
0a4db386 5881 { /* (?<...>) */
81714fb9 5882 char *name_start;
894be9b7 5883 SV *svname;
81714fb9
YO
5884 paren= '>';
5885 case '\'': /* (?'...') */
5886 name_start= RExC_parse;
0a4db386
YO
5887 svname = reg_scan_name(pRExC_state,
5888 SIZE_ONLY ? /* reverse test from the others */
5889 REG_RSN_RETURN_NAME :
5890 REG_RSN_RETURN_NULL);
57b84237
YO
5891 if (RExC_parse == name_start) {
5892 RExC_parse++;
5893 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5894 /*NOTREACHED*/
5895 }
81714fb9
YO
5896 if (*RExC_parse != paren)
5897 vFAIL2("Sequence (?%c... not terminated",
5898 paren=='>' ? '<' : paren);
5899 if (SIZE_ONLY) {
e62cc96a
YO
5900 HE *he_str;
5901 SV *sv_dat = NULL;
894be9b7
YO
5902 if (!svname) /* shouldnt happen */
5903 Perl_croak(aTHX_
5904 "panic: reg_scan_name returned NULL");
81714fb9
YO
5905 if (!RExC_paren_names) {
5906 RExC_paren_names= newHV();
ad64d0ec 5907 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5908#ifdef DEBUGGING
5909 RExC_paren_name_list= newAV();
ad64d0ec 5910 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5911#endif
81714fb9
YO
5912 }
5913 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5914 if ( he_str )
81714fb9 5915 sv_dat = HeVAL(he_str);
e62cc96a 5916 if ( ! sv_dat ) {
81714fb9 5917 /* croak baby croak */
e62cc96a
YO
5918 Perl_croak(aTHX_
5919 "panic: paren_name hash element allocation failed");
5920 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5921 /* (?|...) can mean we have dupes so scan to check
5922 its already been stored. Maybe a flag indicating
5923 we are inside such a construct would be useful,
5924 but the arrays are likely to be quite small, so
5925 for now we punt -- dmq */
5926 IV count = SvIV(sv_dat);
5927 I32 *pv = (I32*)SvPVX(sv_dat);
5928 IV i;
5929 for ( i = 0 ; i < count ; i++ ) {
5930 if ( pv[i] == RExC_npar ) {
5931 count = 0;
5932 break;
5933 }
5934 }
5935 if ( count ) {
5936 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5937 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5938 pv[count] = RExC_npar;
3a92e6ae 5939 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5940 }
81714fb9
YO
5941 } else {
5942 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5943 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5944 SvIOK_on(sv_dat);
3ec35e0f 5945 SvIV_set(sv_dat, 1);
e62cc96a 5946 }
1f1031fe
YO
5947#ifdef DEBUGGING
5948 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5949 SvREFCNT_dec(svname);
5950#endif
e62cc96a 5951
81714fb9
YO
5952 /*sv_dump(sv_dat);*/
5953 }
5954 nextchar(pRExC_state);
5955 paren = 1;
5956 goto capturing_parens;
5957 }
5958 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5959 RExC_parse++;
fac92740 5960 case '=': /* (?=...) */
89c6a13e 5961 RExC_seen_zerolen++;
5c3fa2e7 5962 break;
fac92740 5963 case '!': /* (?!...) */
830247a4 5964 RExC_seen_zerolen++;
e2e6a0f1
YO
5965 if (*RExC_parse == ')') {
5966 ret=reg_node(pRExC_state, OPFAIL);
5967 nextchar(pRExC_state);
5968 return ret;
5969 }
594d7033
YO
5970 break;
5971 case '|': /* (?|...) */
5972 /* branch reset, behave like a (?:...) except that
5973 buffers in alternations share the same numbers */
5974 paren = ':';
5975 after_freeze = freeze_paren = RExC_npar;
5976 break;
fac92740
MJD
5977 case ':': /* (?:...) */
5978 case '>': /* (?>...) */
a0d0e21e 5979 break;
fac92740
MJD
5980 case '$': /* (?$...) */
5981 case '@': /* (?@...) */
8615cb43 5982 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5983 break;
fac92740 5984 case '#': /* (?#...) */
830247a4
IZ
5985 while (*RExC_parse && *RExC_parse != ')')
5986 RExC_parse++;
5987 if (*RExC_parse != ')')
c277df42 5988 FAIL("Sequence (?#... not terminated");
830247a4 5989 nextchar(pRExC_state);
a0d0e21e
LW
5990 *flagp = TRYAGAIN;
5991 return NULL;
894be9b7
YO
5992 case '0' : /* (?0) */
5993 case 'R' : /* (?R) */
5994 if (*RExC_parse != ')')
6bda09f9 5995 FAIL("Sequence (?R) not terminated");
1a147d38 5996 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5997 *flagp |= POSTPONED;
7f69552c
YO
5998 nextchar(pRExC_state);
5999 return ret;
6000 /*notreached*/
894be9b7
YO
6001 { /* named and numeric backreferences */
6002 I32 num;
894be9b7
YO
6003 case '&': /* (?&NAME) */
6004 parse_start = RExC_parse - 1;
1f1031fe 6005 named_recursion:
894be9b7 6006 {
0a4db386
YO
6007 SV *sv_dat = reg_scan_name(pRExC_state,
6008 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6009 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6010 }
6011 goto gen_recurse_regop;
6012 /* NOT REACHED */
542fa716
YO
6013 case '+':
6014 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6015 RExC_parse++;
6016 vFAIL("Illegal pattern");
6017 }
6018 goto parse_recursion;
6019 /* NOT REACHED*/
6020 case '-': /* (?-1) */
6021 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6022 RExC_parse--; /* rewind to let it be handled later */
6023 goto parse_flags;
6024 }
6025 /*FALLTHROUGH */
6bda09f9
YO
6026 case '1': case '2': case '3': case '4': /* (?1) */
6027 case '5': case '6': case '7': case '8': case '9':
6028 RExC_parse--;
542fa716 6029 parse_recursion:
894be9b7
YO
6030 num = atoi(RExC_parse);
6031 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6032 if (*RExC_parse == '-')
6033 RExC_parse++;
6bda09f9
YO
6034 while (isDIGIT(*RExC_parse))
6035 RExC_parse++;
6036 if (*RExC_parse!=')')
6037 vFAIL("Expecting close bracket");
894be9b7
YO
6038
6039 gen_recurse_regop:
542fa716
YO
6040 if ( paren == '-' ) {
6041 /*
6042 Diagram of capture buffer numbering.
6043 Top line is the normal capture buffer numbers
3b753521 6044 Bottom line is the negative indexing as from
542fa716
YO
6045 the X (the (?-2))
6046
6047 + 1 2 3 4 5 X 6 7
6048 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6049 - 5 4 3 2 1 X x x
6050
6051 */
6052 num = RExC_npar + num;
6053 if (num < 1) {
6054 RExC_parse++;
6055 vFAIL("Reference to nonexistent group");
6056 }
6057 } else if ( paren == '+' ) {
6058 num = RExC_npar + num - 1;
6059 }
6060
1a147d38 6061 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6062 if (!SIZE_ONLY) {
6063 if (num > (I32)RExC_rx->nparens) {
6064 RExC_parse++;
6065 vFAIL("Reference to nonexistent group");
6066 }
40d049e4 6067 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6068 RExC_emit++;
226de585 6069 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6070 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6071 } else {
6bda09f9 6072 RExC_size++;
6bda09f9 6073 }
0a4db386 6074 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6075 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6076 Set_Node_Offset(ret, parse_start); /* MJD */
6077
a3b492c3 6078 *flagp |= POSTPONED;
6bda09f9
YO
6079 nextchar(pRExC_state);
6080 return ret;
894be9b7
YO
6081 } /* named and numeric backreferences */
6082 /* NOT REACHED */
6083
fac92740 6084 case '?': /* (??...) */
6136c704 6085 is_logical = 1;
57b84237
YO
6086 if (*RExC_parse != '{') {
6087 RExC_parse++;
6088 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6089 /*NOTREACHED*/
6090 }
a3b492c3 6091 *flagp |= POSTPONED;
830247a4 6092 paren = *RExC_parse++;
0f5d15d6 6093 /* FALL THROUGH */
fac92740 6094 case '{': /* (?{...}) */
c277df42 6095 {
2eccd3b2
NC
6096 I32 count = 1;
6097 U32 n = 0;
c277df42 6098 char c;
830247a4 6099 char *s = RExC_parse;
c277df42 6100
830247a4
IZ
6101 RExC_seen_zerolen++;
6102 RExC_seen |= REG_SEEN_EVAL;
6103 while (count && (c = *RExC_parse)) {
6136c704
AL
6104 if (c == '\\') {
6105 if (RExC_parse[1])
6106 RExC_parse++;
6107 }
b81d288d 6108 else if (c == '{')
c277df42 6109 count++;
b81d288d 6110 else if (c == '}')
c277df42 6111 count--;
830247a4 6112 RExC_parse++;
c277df42 6113 }
6136c704 6114 if (*RExC_parse != ')') {
b81d288d 6115 RExC_parse = s;
b45f050a
JF
6116 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6117 }
c277df42 6118 if (!SIZE_ONLY) {
f3548bdc 6119 PAD *pad;
6136c704
AL
6120 OP_4tree *sop, *rop;
6121 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6122
569233ed
SB
6123 ENTER;
6124 Perl_save_re_context(aTHX);
d59a8b3e 6125 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6126 sop->op_private |= OPpREFCOUNTED;
6127 /* re_dup will OpREFCNT_inc */
6128 OpREFCNT_set(sop, 1);
569233ed 6129 LEAVE;
c277df42 6130
830247a4 6131 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6132 RExC_rxi->data->data[n] = (void*)rop;
6133 RExC_rxi->data->data[n+1] = (void*)sop;
6134 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6135 SvREFCNT_dec(sv);
a0ed51b3 6136 }
e24b16f9 6137 else { /* First pass */
830247a4 6138 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6139 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6140 /* No compiled RE interpolated, has runtime
6141 components ===> unsafe. */
6142 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6143 if (PL_tainting && PL_tainted)
cc6b7395 6144 FAIL("Eval-group in insecure regular expression");
54df2634 6145#if PERL_VERSION > 8
923e4eb5 6146 if (IN_PERL_COMPILETIME)
b5c19bd7 6147 PL_cv_has_eval = 1;
54df2634 6148#endif
c277df42 6149 }
b5c19bd7 6150
830247a4 6151 nextchar(pRExC_state);
6136c704 6152 if (is_logical) {
830247a4 6153 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6154 if (!SIZE_ONLY)
6155 ret->flags = 2;
3dab1dad 6156 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6157 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6158 return ret;
6159 }
ccb2c380
MP
6160 ret = reganode(pRExC_state, EVAL, n);
6161 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6162 Set_Node_Offset(ret, parse_start);
6163 return ret;
c277df42 6164 }
fac92740 6165 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6166 {
0a4db386 6167 int is_define= 0;
fac92740 6168 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6169 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6170 || RExC_parse[1] == '<'
830247a4 6171 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6172 I32 flag;
6173
830247a4 6174 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6175 if (!SIZE_ONLY)
6176 ret->flags = 1;
3dab1dad 6177 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6178 goto insert_if;
b81d288d 6179 }
a0ed51b3 6180 }
0a4db386
YO
6181 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6182 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6183 {
6184 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6185 char *name_start= RExC_parse++;
2eccd3b2 6186 U32 num = 0;
0a4db386
YO
6187 SV *sv_dat=reg_scan_name(pRExC_state,
6188 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6189 if (RExC_parse == name_start || *RExC_parse != ch)
6190 vFAIL2("Sequence (?(%c... not terminated",
6191 (ch == '>' ? '<' : ch));
6192 RExC_parse++;
6193 if (!SIZE_ONLY) {
6194 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6195 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6196 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6197 }
6198 ret = reganode(pRExC_state,NGROUPP,num);
6199 goto insert_if_check_paren;
6200 }
6201 else if (RExC_parse[0] == 'D' &&
6202 RExC_parse[1] == 'E' &&
6203 RExC_parse[2] == 'F' &&
6204 RExC_parse[3] == 'I' &&
6205 RExC_parse[4] == 'N' &&
6206 RExC_parse[5] == 'E')
6207 {
6208 ret = reganode(pRExC_state,DEFINEP,0);
6209 RExC_parse +=6 ;
6210 is_define = 1;
6211 goto insert_if_check_paren;
6212 }
6213 else if (RExC_parse[0] == 'R') {
6214 RExC_parse++;
6215 parno = 0;
6216 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6217 parno = atoi(RExC_parse++);
6218 while (isDIGIT(*RExC_parse))
6219 RExC_parse++;
6220 } else if (RExC_parse[0] == '&') {
6221 SV *sv_dat;
6222 RExC_parse++;
6223 sv_dat = reg_scan_name(pRExC_state,
6224 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6225 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6226 }
1a147d38 6227 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6228 goto insert_if_check_paren;
6229 }
830247a4 6230 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6231 /* (?(1)...) */
6136c704 6232 char c;
830247a4 6233 parno = atoi(RExC_parse++);
c277df42 6234
830247a4
IZ
6235 while (isDIGIT(*RExC_parse))
6236 RExC_parse++;
fac92740 6237 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6238
0a4db386 6239 insert_if_check_paren:
830247a4 6240 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6241 vFAIL("Switch condition not recognized");
c277df42 6242 insert_if:
3dab1dad
YO
6243 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6244 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6245 if (br == NULL)
830247a4 6246 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6247 else
3dab1dad 6248 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6249 c = *nextchar(pRExC_state);
d1b80229
IZ
6250 if (flags&HASWIDTH)
6251 *flagp |= HASWIDTH;
c277df42 6252 if (c == '|') {
0a4db386
YO
6253 if (is_define)
6254 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6255 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6256 regbranch(pRExC_state, &flags, 1,depth+1);
6257 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6258 if (flags&HASWIDTH)
6259 *flagp |= HASWIDTH;
830247a4 6260 c = *nextchar(pRExC_state);
a0ed51b3
LW
6261 }
6262 else
c277df42
IZ
6263 lastbr = NULL;
6264 if (c != ')')
8615cb43 6265 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6266 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6267 REGTAIL(pRExC_state, br, ender);
c277df42 6268 if (lastbr) {
3dab1dad
YO
6269 REGTAIL(pRExC_state, lastbr, ender);
6270 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6271 }
6272 else
3dab1dad 6273 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6274 RExC_size++; /* XXX WHY do we need this?!!
6275 For large programs it seems to be required
6276 but I can't figure out why. -- dmq*/
c277df42 6277 return ret;
a0ed51b3
LW
6278 }
6279 else {
830247a4 6280 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6281 }
6282 }
1b1626e4 6283 case 0:
830247a4 6284 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6285 vFAIL("Sequence (? incomplete");
1b1626e4 6286 break;
85508812
KW
6287 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6288 that follow */
fb85c044
KW
6289 has_use_defaults = TRUE;
6290 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7b98bc43
KW
6291 if (RExC_utf8) { /* But the default for a utf8 pattern is
6292 unicode semantics */
6293 RExC_flags |= RXf_PMf_UNICODE;
6294 }
fb85c044 6295 goto parse_flags;
a0d0e21e 6296 default:
cde0cee5
YO
6297 --RExC_parse;
6298 parse_flags: /* (?i) */
6299 {
6300 U32 posflags = 0, negflags = 0;
6301 U32 *flagsp = &posflags;
9de15fec 6302 bool has_charset_modifier = 0;
cde0cee5
YO
6303
6304 while (*RExC_parse) {
6305 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6306 /* (?g), (?gc) and (?o) are useless here
6307 and must be globally applied -- japhy */
cde0cee5
YO
6308 switch (*RExC_parse) {
6309 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6310 case LOCALE_PAT_MOD:
6311 if (has_charset_modifier || flagsp == &negflags) {
6312 goto fail_modifiers;
6313 }
8cc86590
KW
6314 posflags |= RXf_PMf_LOCALE;
6315 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6316 has_charset_modifier = 1;
6317 break;
6318 case UNICODE_PAT_MOD:
6319 if (has_charset_modifier || flagsp == &negflags) {
6320 goto fail_modifiers;
6321 }
8cc86590
KW
6322 posflags |= RXf_PMf_UNICODE;
6323 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6324 has_charset_modifier = 1;
6325 break;
6326 case DUAL_PAT_MOD:
6327 if (has_use_defaults
6328 || has_charset_modifier
6329 || flagsp == &negflags)
6330 {
6331 goto fail_modifiers;
6332 }
7b98bc43
KW
6333
6334 /* The dual charset means unicode semantics if the
6335 * pattern (or target, not known until runtime) are
6336 * utf8 */
6337 if (RExC_utf8) {
6338 posflags |= RXf_PMf_UNICODE;
6339 negflags |= RXf_PMf_LOCALE;
6340 }
6341 else {
6342 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6343 }
9de15fec
KW
6344 has_charset_modifier = 1;
6345 break;
f7819f85
A
6346 case ONCE_PAT_MOD: /* 'o' */
6347 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6349 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6350 if (! (wastedflags & wflagbit) ) {
6351 wastedflags |= wflagbit;
6352 vWARN5(
6353 RExC_parse + 1,
6354 "Useless (%s%c) - %suse /%c modifier",
6355 flagsp == &negflags ? "?-" : "?",
6356 *RExC_parse,
6357 flagsp == &negflags ? "don't " : "",
6358 *RExC_parse
6359 );
6360 }
6361 }
cde0cee5
YO
6362 break;
6363
f7819f85 6364 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6365 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6366 if (! (wastedflags & WASTED_C) ) {
6367 wastedflags |= WASTED_GC;
9d1d55b5
JP
6368 vWARN3(
6369 RExC_parse + 1,
6370 "Useless (%sc) - %suse /gc modifier",
6371 flagsp == &negflags ? "?-" : "?",
6372 flagsp == &negflags ? "don't " : ""
6373 );
6374 }
6375 }
cde0cee5 6376 break;
f7819f85 6377 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6378 if (flagsp == &negflags) {
668c081a
NC
6379 if (SIZE_ONLY)
6380 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6381 } else {
6382 *flagsp |= RXf_PMf_KEEPCOPY;
6383 }
6384 break;
6385 case '-':
3b753521 6386 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6387 * if there is a minus, it means will be trying to
6388 * re-specify a default which is an error */
6389 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6390 fail_modifiers:
57b84237
YO
6391 RExC_parse++;
6392 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6393 /*NOTREACHED*/
6394 }
cde0cee5
YO
6395 flagsp = &negflags;
6396 wastedflags = 0; /* reset so (?g-c) warns twice */
6397 break;
6398 case ':':
6399 paren = ':';
6400 /*FALLTHROUGH*/
6401 case ')':
6402 RExC_flags |= posflags;
6403 RExC_flags &= ~negflags;
f7819f85
A
6404 if (paren != ':') {
6405 oregflags |= posflags;
6406 oregflags &= ~negflags;
6407 }
cde0cee5
YO
6408 nextchar(pRExC_state);
6409 if (paren != ':') {
6410 *flagp = TRYAGAIN;
6411 return NULL;
6412 } else {
6413 ret = NULL;
6414 goto parse_rest;
6415 }
6416 /*NOTREACHED*/
6417 default:
cde0cee5
YO
6418 RExC_parse++;
6419 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6420 /*NOTREACHED*/
6421 }
830247a4 6422 ++RExC_parse;
48c036b1 6423 }
cde0cee5 6424 }} /* one for the default block, one for the switch */
a0d0e21e 6425 }
fac92740 6426 else { /* (...) */
81714fb9 6427 capturing_parens:
830247a4
IZ
6428 parno = RExC_npar;
6429 RExC_npar++;
e2e6a0f1 6430
830247a4 6431 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6432 if (!SIZE_ONLY ){
6433 if (!RExC_nestroot)
6434 RExC_nestroot = parno;
c009da3d
YO
6435 if (RExC_seen & REG_SEEN_RECURSE
6436 && !RExC_open_parens[parno-1])
6437 {
e2e6a0f1 6438 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6439 "Setting open paren #%"IVdf" to %d\n",
6440 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6441 RExC_open_parens[parno-1]= ret;
6442 }
6bda09f9 6443 }
fac92740
MJD
6444 Set_Node_Length(ret, 1); /* MJD */
6445 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6446 is_open = 1;
a0d0e21e 6447 }
a0ed51b3 6448 }
fac92740 6449 else /* ! paren */
a0d0e21e 6450 ret = NULL;
cde0cee5
YO
6451
6452 parse_rest:
a0d0e21e 6453 /* Pick up the branches, linking them together. */
fac92740 6454 parse_start = RExC_parse; /* MJD */
3dab1dad 6455 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6456
6457 if (freeze_paren) {
6458 if (RExC_npar > after_freeze)
6459 after_freeze = RExC_npar;
6460 RExC_npar = freeze_paren;
6461 }
6462
fac92740 6463 /* branch_len = (paren != 0); */
2af232bd 6464
a0d0e21e
LW
6465 if (br == NULL)
6466 return(NULL);
830247a4
IZ
6467 if (*RExC_parse == '|') {
6468 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6469 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6470 }
fac92740 6471 else { /* MJD */
6bda09f9 6472 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6473 Set_Node_Length(br, paren != 0);
6474 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6475 }
c277df42
IZ
6476 have_branch = 1;
6477 if (SIZE_ONLY)
830247a4 6478 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6479 }
6480 else if (paren == ':') {
c277df42
IZ
6481 *flagp |= flags&SIMPLE;
6482 }
6136c704 6483 if (is_open) { /* Starts with OPEN. */
3dab1dad 6484 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6485 }
6486 else if (paren != '?') /* Not Conditional */
a0d0e21e 6487 ret = br;
8ae10a67 6488 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6489 lastbr = br;
830247a4
IZ
6490 while (*RExC_parse == '|') {
6491 if (!SIZE_ONLY && RExC_extralen) {
6492 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6493 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6494 }
6495 if (SIZE_ONLY)
830247a4
IZ
6496 RExC_extralen += 2; /* Account for LONGJMP. */
6497 nextchar(pRExC_state);
594d7033
YO
6498 if (freeze_paren) {
6499 if (RExC_npar > after_freeze)
6500 after_freeze = RExC_npar;
6501 RExC_npar = freeze_paren;
6502 }
3dab1dad 6503 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6504
a687059c 6505 if (br == NULL)
a0d0e21e 6506 return(NULL);
3dab1dad 6507 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6508 lastbr = br;
8ae10a67 6509 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6510 }
6511
c277df42
IZ
6512 if (have_branch || paren != ':') {
6513 /* Make a closing node, and hook it on the end. */
6514 switch (paren) {
6515 case ':':
830247a4 6516 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6517 break;
6518 case 1:
830247a4 6519 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6520 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6521 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6522 "Setting close paren #%"IVdf" to %d\n",
6523 (IV)parno, REG_NODE_NUM(ender)));
6524 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6525 if (RExC_nestroot == parno)
6526 RExC_nestroot = 0;
40d049e4 6527 }
fac92740
MJD
6528 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6529 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6530 break;
6531 case '<':
c277df42
IZ
6532 case ',':
6533 case '=':
6534 case '!':
c277df42 6535 *flagp &= ~HASWIDTH;
821b33a5
IZ
6536 /* FALL THROUGH */
6537 case '>':
830247a4 6538 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6539 break;
6540 case 0:
830247a4 6541 ender = reg_node(pRExC_state, END);
40d049e4
YO
6542 if (!SIZE_ONLY) {
6543 assert(!RExC_opend); /* there can only be one! */
6544 RExC_opend = ender;
6545 }
c277df42
IZ
6546 break;
6547 }
eaf3ca90 6548 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6549
9674d46a 6550 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6551 if (depth==1)
6552 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6553
c277df42 6554 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6555 for (br = ret; br; br = regnext(br)) {
6556 const U8 op = PL_regkind[OP(br)];
6557 if (op == BRANCH) {
07be1b83 6558 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6559 }
6560 else if (op == BRANCHJ) {
07be1b83 6561 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6562 }
c277df42
IZ
6563 }
6564 }
a0d0e21e 6565 }
c277df42
IZ
6566
6567 {
e1ec3a88
AL
6568 const char *p;
6569 static const char parens[] = "=!<,>";
c277df42
IZ
6570
6571 if (paren && (p = strchr(parens, paren))) {
eb160463 6572 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6573 int flag = (p - parens) > 1;
6574
6575 if (paren == '>')
6576 node = SUSPEND, flag = 0;
6bda09f9 6577 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6578 Set_Node_Cur_Length(ret);
6579 Set_Node_Offset(ret, parse_start + 1);
c277df42 6580 ret->flags = flag;
07be1b83 6581 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6582 }
a0d0e21e
LW
6583 }
6584
6585 /* Check for proper termination. */
ce3e6498 6586 if (paren) {
e2509266 6587 RExC_flags = oregflags;
830247a4
IZ
6588 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6589 RExC_parse = oregcomp_parse;
380a0633 6590 vFAIL("Unmatched (");
ce3e6498 6591 }
a0ed51b3 6592 }
830247a4
IZ
6593 else if (!paren && RExC_parse < RExC_end) {
6594 if (*RExC_parse == ')') {
6595 RExC_parse++;
380a0633 6596 vFAIL("Unmatched )");
a0ed51b3
LW
6597 }
6598 else
b45f050a 6599 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6600 /* NOTREACHED */
6601 }
594d7033
YO
6602 if (after_freeze)
6603 RExC_npar = after_freeze;
a0d0e21e 6604 return(ret);
a687059c
LW
6605}
6606
6607/*
6608 - regbranch - one alternative of an | operator
6609 *
6610 * Implements the concatenation operator.
6611 */
76e3520e 6612STATIC regnode *
3dab1dad 6613S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6614{
97aff369 6615 dVAR;
c277df42
IZ
6616 register regnode *ret;
6617 register regnode *chain = NULL;
6618 register regnode *latest;
6619 I32 flags = 0, c = 0;
3dab1dad 6620 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6621
6622 PERL_ARGS_ASSERT_REGBRANCH;
6623
3dab1dad 6624 DEBUG_PARSE("brnc");
02daf0ab 6625
b81d288d 6626 if (first)
c277df42
IZ
6627 ret = NULL;
6628 else {
b81d288d 6629 if (!SIZE_ONLY && RExC_extralen)
830247a4 6630 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6631 else {
830247a4 6632 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6633 Set_Node_Length(ret, 1);
6634 }
c277df42
IZ
6635 }
6636
b81d288d 6637 if (!first && SIZE_ONLY)
830247a4 6638 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6639
c277df42 6640 *flagp = WORST; /* Tentatively. */
a0d0e21e 6641
830247a4
IZ
6642 RExC_parse--;
6643 nextchar(pRExC_state);
6644 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6645 flags &= ~TRYAGAIN;
3dab1dad 6646 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6647 if (latest == NULL) {
6648 if (flags & TRYAGAIN)
6649 continue;
6650 return(NULL);
a0ed51b3
LW
6651 }
6652 else if (ret == NULL)
c277df42 6653 ret = latest;
8ae10a67 6654 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6655 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6656 *flagp |= flags&SPSTART;
6657 else {
830247a4 6658 RExC_naughty++;
3dab1dad 6659 REGTAIL(pRExC_state, chain, latest);
a687059c 6660 }
a0d0e21e 6661 chain = latest;
c277df42
IZ
6662 c++;
6663 }
6664 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6665 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6666 if (ret == NULL)
6667 ret = chain;
6668 }
6669 if (c == 1) {
6670 *flagp |= flags&SIMPLE;
a0d0e21e 6671 }
a687059c 6672
d4c19fe8 6673 return ret;
a687059c
LW
6674}
6675
6676/*
6677 - regpiece - something followed by possible [*+?]
6678 *
6679 * Note that the branching code sequences used for ? and the general cases
6680 * of * and + are somewhat optimized: they use the same NOTHING node as
6681 * both the endmarker for their branch list and the body of the last branch.
6682 * It might seem that this node could be dispensed with entirely, but the
6683 * endmarker role is not redundant.
6684 */
76e3520e 6685STATIC regnode *
3dab1dad 6686S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6687{
97aff369 6688 dVAR;
c277df42 6689 register regnode *ret;
a0d0e21e
LW
6690 register char op;
6691 register char *next;
6692 I32 flags;
1df70142 6693 const char * const origparse = RExC_parse;
a0d0e21e 6694 I32 min;
c277df42 6695 I32 max = REG_INFTY;
fac92740 6696 char *parse_start;
10edeb5d 6697 const char *maxpos = NULL;
3dab1dad 6698 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6699
6700 PERL_ARGS_ASSERT_REGPIECE;
6701
3dab1dad 6702 DEBUG_PARSE("piec");
a0d0e21e 6703
3dab1dad 6704 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6705 if (ret == NULL) {
6706 if (flags & TRYAGAIN)
6707 *flagp |= TRYAGAIN;
6708 return(NULL);
6709 }
6710
830247a4 6711 op = *RExC_parse;
a0d0e21e 6712
830247a4 6713 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6714 maxpos = NULL;
fac92740 6715 parse_start = RExC_parse; /* MJD */
830247a4 6716 next = RExC_parse + 1;
a0d0e21e
LW
6717 while (isDIGIT(*next) || *next == ',') {
6718 if (*next == ',') {
6719 if (maxpos)
6720 break;
6721 else
6722 maxpos = next;
a687059c 6723 }
a0d0e21e
LW
6724 next++;
6725 }
6726 if (*next == '}') { /* got one */
6727 if (!maxpos)
6728 maxpos = next;
830247a4
IZ
6729 RExC_parse++;
6730 min = atoi(RExC_parse);
a0d0e21e
LW
6731 if (*maxpos == ',')
6732 maxpos++;
6733 else
830247a4 6734 maxpos = RExC_parse;
a0d0e21e
LW
6735 max = atoi(maxpos);
6736 if (!max && *maxpos != '0')
c277df42
IZ
6737 max = REG_INFTY; /* meaning "infinity" */
6738 else if (max >= REG_INFTY)
8615cb43 6739 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6740 RExC_parse = next;
6741 nextchar(pRExC_state);
a0d0e21e
LW
6742
6743 do_curly:
6744 if ((flags&SIMPLE)) {
830247a4 6745 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6746 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6747 Set_Node_Offset(ret, parse_start+1); /* MJD */
6748 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6749 }
6750 else {
3dab1dad 6751 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6752
6753 w->flags = 0;
3dab1dad 6754 REGTAIL(pRExC_state, ret, w);
830247a4 6755 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6756 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6757 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6758 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6759 }
6bda09f9 6760 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6761 /* MJD hk */
6762 Set_Node_Offset(ret, parse_start+1);
2af232bd 6763 Set_Node_Length(ret,
fac92740 6764 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6765
830247a4 6766 if (!SIZE_ONLY && RExC_extralen)
c277df42 6767 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6768 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6769 if (SIZE_ONLY)
830247a4
IZ
6770 RExC_whilem_seen++, RExC_extralen += 3;
6771 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6772 }
c277df42 6773 ret->flags = 0;
a0d0e21e
LW
6774
6775 if (min > 0)
821b33a5
IZ
6776 *flagp = WORST;
6777 if (max > 0)
6778 *flagp |= HASWIDTH;
8fa23287 6779 if (max < min)
8615cb43 6780 vFAIL("Can't do {n,m} with n > m");
c277df42 6781 if (!SIZE_ONLY) {
eb160463
GS
6782 ARG1_SET(ret, (U16)min);
6783 ARG2_SET(ret, (U16)max);
a687059c 6784 }
a687059c 6785
a0d0e21e 6786 goto nest_check;
a687059c 6787 }
a0d0e21e 6788 }
a687059c 6789
a0d0e21e
LW
6790 if (!ISMULT1(op)) {
6791 *flagp = flags;
a687059c 6792 return(ret);
a0d0e21e 6793 }
bb20fd44 6794
c277df42 6795#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6796
6797 /* if this is reinstated, don't forget to put this back into perldiag:
6798
6799 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6800
6801 (F) The part of the regexp subject to either the * or + quantifier
6802 could match an empty string. The {#} shows in the regular
6803 expression about where the problem was discovered.
6804
6805 */
6806
bb20fd44 6807 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6808 vFAIL("Regexp *+ operand could be empty");
b81d288d 6809#endif
bb20fd44 6810
fac92740 6811 parse_start = RExC_parse;
830247a4 6812 nextchar(pRExC_state);
a0d0e21e 6813
821b33a5 6814 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6815
6816 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6817 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6818 ret->flags = 0;
830247a4 6819 RExC_naughty += 4;
a0d0e21e
LW
6820 }
6821 else if (op == '*') {
6822 min = 0;
6823 goto do_curly;
a0ed51b3
LW
6824 }
6825 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6826 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6827 ret->flags = 0;
830247a4 6828 RExC_naughty += 3;
a0d0e21e
LW
6829 }
6830 else if (op == '+') {
6831 min = 1;
6832 goto do_curly;
a0ed51b3
LW
6833 }
6834 else if (op == '?') {
a0d0e21e
LW
6835 min = 0; max = 1;
6836 goto do_curly;
6837 }
6838 nest_check:
668c081a
NC
6839 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6840 ckWARN3reg(RExC_parse,
6841 "%.*s matches null string many times",
6842 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6843 origparse);
a0d0e21e
LW
6844 }
6845
b9b4dddf 6846 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6847 nextchar(pRExC_state);
6bda09f9 6848 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6849 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6850 }
b9b4dddf
YO
6851#ifndef REG_ALLOW_MINMOD_SUSPEND
6852 else
6853#endif
6854 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6855 regnode *ender;
6856 nextchar(pRExC_state);
6857 ender = reg_node(pRExC_state, SUCCEED);
6858 REGTAIL(pRExC_state, ret, ender);
6859 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6860 ret->flags = 0;
6861 ender = reg_node(pRExC_state, TAIL);
6862 REGTAIL(pRExC_state, ret, ender);
6863 /*ret= ender;*/
6864 }
6865
6866 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6867 RExC_parse++;
b45f050a
JF
6868 vFAIL("Nested quantifiers");
6869 }
a0d0e21e
LW
6870
6871 return(ret);
a687059c
LW
6872}
6873
fc8cd66c
YO
6874
6875/* reg_namedseq(pRExC_state,UVp)
6876
6877 This is expected to be called by a parser routine that has
afefe6bf 6878 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6879 expected to point at the first char following the N at the time
6880 of the call.
ff3f963a
KW
6881
6882 The \N may be inside (indicated by valuep not being NULL) or outside a
6883 character class.
6884
6885 \N may begin either a named sequence, or if outside a character class, mean
6886 to match a non-newline. For non single-quoted regexes, the tokenizer has
6887 attempted to decide which, and in the case of a named sequence converted it
6888 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6889 where c1... are the characters in the sequence. For single-quoted regexes,
6890 the tokenizer passes the \N sequence through unchanged; this code will not
6891 attempt to determine this nor expand those. The net effect is that if the
6892 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6893 signals that this \N occurrence means to match a non-newline.
6894
6895 Only the \N{U+...} form should occur in a character class, for the same
6896 reason that '.' inside a character class means to just match a period: it
6897 just doesn't make sense.
fc8cd66c
YO
6898
6899 If valuep is non-null then it is assumed that we are parsing inside
6900 of a charclass definition and the first codepoint in the resolved
6901 string is returned via *valuep and the routine will return NULL.
6902 In this mode if a multichar string is returned from the charnames
ff3f963a 6903 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6904 sequence will be examined. If the string returned is zero length
6905 then the value of *valuep is undefined and NON-NULL will
6906 be returned to indicate failure. (This will NOT be a valid pointer
6907 to a regnode.)
6908
ff3f963a
KW
6909 If valuep is null then it is assumed that we are parsing normal text and a
6910 new EXACT node is inserted into the program containing the resolved string,
6911 and a pointer to the new node is returned. But if the string is zero length
6912 a NOTHING node is emitted instead.
afefe6bf 6913
fc8cd66c 6914 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6915 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6916 */
6917STATIC regnode *
afefe6bf 6918S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6919{
c3c41406 6920 char * endbrace; /* '}' following the name */
fc8cd66c 6921 regnode *ret = NULL;
ff3f963a
KW
6922#ifdef DEBUGGING
6923 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6924#endif
c3c41406 6925 char* p;
ff3f963a
KW
6926
6927 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6928
6929 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6930
6931 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6932
6933 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6934 * modifier. The other meaning does not */
6935 p = (RExC_flags & RXf_PMf_EXTENDED)
6936 ? regwhite( pRExC_state, RExC_parse )
6937 : RExC_parse;
7918f24d 6938
ff3f963a 6939 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6940 * [^\n]. The former is assumed when it can't be the latter. */
6941 if (*p != '{' || regcurly(p)) {
6942 RExC_parse = p;
ff3f963a 6943 if (valuep) {
afefe6bf 6944 /* no bare \N in a charclass */
ff3f963a
KW
6945 vFAIL("\\N in a character class must be a named character: \\N{...}");
6946 }
afefe6bf
RGS
6947 nextchar(pRExC_state);
6948 ret = reg_node(pRExC_state, REG_ANY);
6949 *flagp |= HASWIDTH|SIMPLE;
6950 RExC_naughty++;
6951 RExC_parse--;
6952 Set_Node_Length(ret, 1); /* MJD */
6953 return ret;
fc8cd66c 6954 }
a4893424 6955
c3c41406
KW
6956 /* Here, we have decided it should be a named sequence */
6957
6958 /* The test above made sure that the next real character is a '{', but
6959 * under the /x modifier, it could be separated by space (or a comment and
6960 * \n) and this is not allowed (for consistency with \x{...} and the
6961 * tokenizer handling of \N{NAME}). */
6962 if (*RExC_parse != '{') {
6963 vFAIL("Missing braces on \\N{}");
6964 }
6965
ff3f963a 6966 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6967
6968 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6969 || ! (endbrace == RExC_parse /* nothing between the {} */
6970 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6971 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6972 {
6973 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6974 vFAIL("\\N{NAME} must be resolved by the lexer");
6975 }
6976
ff3f963a
KW
6977 if (endbrace == RExC_parse) { /* empty: \N{} */
6978 if (! valuep) {
6979 RExC_parse = endbrace + 1;
6980 return reg_node(pRExC_state,NOTHING);
a4893424 6981 }
fc8cd66c 6982
ff3f963a
KW
6983 if (SIZE_ONLY) {
6984 ckWARNreg(RExC_parse,
6985 "Ignoring zero length \\N{} in character class"
6986 );
6987 RExC_parse = endbrace + 1;
6988 }
6989 *valuep = 0;
6990 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6991 }
ff3f963a 6992
62fed28b 6993 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6994 RExC_parse += 2; /* Skip past the 'U+' */
6995
6996 if (valuep) { /* In a bracketed char class */
6997 /* We only pay attention to the first char of
6998 multichar strings being returned. I kinda wonder
6999 if this makes sense as it does change the behaviour
7000 from earlier versions, OTOH that behaviour was broken
7001 as well. XXX Solution is to recharacterize as
7002 [rest-of-class]|multi1|multi2... */
7003
7004 STRLEN length_of_hex;
7005 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7006 | PERL_SCAN_DISALLOW_PREFIX
7007 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7008
37820adc
KW
7009 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7010 if (endchar < endbrace) {
ff3f963a
KW
7011 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7012 }
ff3f963a
KW
7013
7014 length_of_hex = (STRLEN)(endchar - RExC_parse);
7015 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7016
7017 /* The tokenizer should have guaranteed validity, but it's possible to
7018 * bypass it by using single quoting, so check */
c3c41406
KW
7019 if (length_of_hex == 0
7020 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7021 {
7022 RExC_parse += length_of_hex; /* Includes all the valid */
7023 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7024 ? UTF8SKIP(RExC_parse)
7025 : 1;
7026 /* Guard against malformed utf8 */
7027 if (RExC_parse >= endchar) RExC_parse = endchar;
7028 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7029 }
7030
7031 RExC_parse = endbrace + 1;
7032 if (endchar == endbrace) return NULL;
7033
7034 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7035 }
ff3f963a
KW
7036 else { /* Not a char class */
7037 char *s; /* String to put in generated EXACT node */
fda99bee 7038 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7039 char *endchar; /* Points to '.' or '}' ending cur char in the input
7040 stream */
7041
2c2b7f86
KW
7042 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7043 : (LOC)
7044 ? EXACTFL
7045 : UNI_SEMANTICS
7046 ? EXACTFU
7047 : EXACTF));
ff3f963a
KW
7048 s= STRING(ret);
7049
7050 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7051 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7052 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7053 * limit are dropped. The limit could be relaxed should it become
7054 * desirable by reparsing this as (?:\N{NAME}), so could generate
7055 * multiple EXACT nodes, as is done for just regular input. But this
7056 * is primarily a named character, and not intended to be a huge long
7057 * string, so 255 bytes should be good enough */
7058 while (1) {
c3c41406 7059 STRLEN length_of_hex;
ff3f963a
KW
7060 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7061 | PERL_SCAN_DISALLOW_PREFIX
7062 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7063 UV cp; /* Ord of current character */
7064
7065 /* Code points are separated by dots. If none, there is only one
7066 * code point, and is terminated by the brace */
37820adc 7067 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7068
7069 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7070 length_of_hex = (STRLEN)(endchar - RExC_parse);
7071 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7072 if ( length_of_hex == 0
7073 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7074 {
c3c41406
KW
7075 RExC_parse += length_of_hex; /* Includes all the valid */
7076 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7077 ? UTF8SKIP(RExC_parse)
7078 : 1;
7079 /* Guard against malformed utf8 */
7080 if (RExC_parse >= endchar) RExC_parse = endchar;
7081 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7082 }
7083
7084 if (! FOLD) { /* Not folding, just append to the string */
7085 STRLEN unilen;
7086
7087 /* Quit before adding this character if would exceed limit */
7088 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7089
ff3f963a
KW
7090 unilen = reguni(pRExC_state, cp, s);
7091 if (unilen > 0) {
7092 s += unilen;
7093 len += unilen;
7094 }
7095 } else { /* Folding, output the folded equivalent */
7096 STRLEN foldlen,numlen;
7097 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7098 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7099
7100 /* Quit before exceeding size limit */
7101 if (len + foldlen > U8_MAX) break;
7102
7103 for (foldbuf = tmpbuf;
7104 foldlen;
7105 foldlen -= numlen)
7106 {
7107 cp = utf8_to_uvchr(foldbuf, &numlen);
7108 if (numlen > 0) {
7109 const STRLEN unilen = reguni(pRExC_state, cp, s);
7110 s += unilen;
7111 len += unilen;
7112 /* In EBCDIC the numlen and unilen can differ. */
7113 foldbuf += numlen;
7114 if (numlen >= foldlen)
7115 break;
7116 }
7117 else
7118 break; /* "Can't happen." */
7119 }
7120 }
7121
7122 /* Point to the beginning of the next character in the sequence. */
7123 RExC_parse = endchar + 1;
7124
7125 /* Quit if no more characters */
7126 if (RExC_parse >= endbrace) break;
7127 }
7128
7129
7130 if (SIZE_ONLY) {
7131 if (RExC_parse < endbrace) {
7132 ckWARNreg(RExC_parse - 1,
7133 "Using just the first characters returned by \\N{}");
7134 }
7135
7136 RExC_size += STR_SZ(len);
7137 } else {
7138 STR_LEN(ret) = len;
7139 RExC_emit += STR_SZ(len);
7140 }
7141
7142 RExC_parse = endbrace + 1;
7143
7144 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7145 with malformed in t/re/pat_advanced.t */
7146 RExC_parse --;
7147 Set_Node_Cur_Length(ret); /* MJD */
7148 nextchar(pRExC_state);
7149 }
7150
7151 return ret;
fc8cd66c
YO
7152}
7153
7154
9e08bc66
TS
7155/*
7156 * reg_recode
7157 *
7158 * It returns the code point in utf8 for the value in *encp.
7159 * value: a code value in the source encoding
7160 * encp: a pointer to an Encode object
7161 *
7162 * If the result from Encode is not a single character,
7163 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7164 */
7165STATIC UV
7166S_reg_recode(pTHX_ const char value, SV **encp)
7167{
7168 STRLEN numlen = 1;
59cd0e26 7169 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7170 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7171 const STRLEN newlen = SvCUR(sv);
7172 UV uv = UNICODE_REPLACEMENT;
7173
7918f24d
NC
7174 PERL_ARGS_ASSERT_REG_RECODE;
7175
9e08bc66
TS
7176 if (newlen)
7177 uv = SvUTF8(sv)
7178 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7179 : *(U8*)s;
7180
7181 if (!newlen || numlen != newlen) {
7182 uv = UNICODE_REPLACEMENT;
c86f7df5 7183 *encp = NULL;
9e08bc66
TS
7184 }
7185 return uv;
7186}
7187
fc8cd66c 7188
a687059c
LW
7189/*
7190 - regatom - the lowest level
ee9b8eae
YO
7191
7192 Try to identify anything special at the start of the pattern. If there
7193 is, then handle it as required. This may involve generating a single regop,
7194 such as for an assertion; or it may involve recursing, such as to
7195 handle a () structure.
7196
7197 If the string doesn't start with something special then we gobble up
7198 as much literal text as we can.
7199
7200 Once we have been able to handle whatever type of thing started the
7201 sequence, we return.
7202
7203 Note: we have to be careful with escapes, as they can be both literal
7204 and special, and in the case of \10 and friends can either, depending
7205 on context. Specifically there are two seperate switches for handling
7206 escape sequences, with the one for handling literal escapes requiring
7207 a dummy entry for all of the special escapes that are actually handled
7208 by the other.
7209*/
7210
76e3520e 7211STATIC regnode *
3dab1dad 7212S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7213{
97aff369 7214 dVAR;
cbbf8932 7215 register regnode *ret = NULL;
a0d0e21e 7216 I32 flags;
45948336 7217 char *parse_start = RExC_parse;
3dab1dad
YO
7218 GET_RE_DEBUG_FLAGS_DECL;
7219 DEBUG_PARSE("atom");
a0d0e21e
LW
7220 *flagp = WORST; /* Tentatively. */
7221
7918f24d 7222 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7223
a0d0e21e 7224tryagain:
f9a79580 7225 switch ((U8)*RExC_parse) {
a0d0e21e 7226 case '^':
830247a4
IZ
7227 RExC_seen_zerolen++;
7228 nextchar(pRExC_state);
bbe252da 7229 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7230 ret = reg_node(pRExC_state, MBOL);
bbe252da 7231 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7232 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7233 else
830247a4 7234 ret = reg_node(pRExC_state, BOL);
fac92740 7235 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7236 break;
7237 case '$':
830247a4 7238 nextchar(pRExC_state);
b81d288d 7239 if (*RExC_parse)
830247a4 7240 RExC_seen_zerolen++;
bbe252da 7241 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7242 ret = reg_node(pRExC_state, MEOL);
bbe252da 7243 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7244 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7245 else
830247a4 7246 ret = reg_node(pRExC_state, EOL);
fac92740 7247 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7248 break;
7249 case '.':
830247a4 7250 nextchar(pRExC_state);
bbe252da 7251 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7252 ret = reg_node(pRExC_state, SANY);
7253 else
7254 ret = reg_node(pRExC_state, REG_ANY);
7255 *flagp |= HASWIDTH|SIMPLE;
830247a4 7256 RExC_naughty++;
fac92740 7257 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7258 break;
7259 case '[':
b45f050a 7260 {
3dab1dad
YO
7261 char * const oregcomp_parse = ++RExC_parse;
7262 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7263 if (*RExC_parse != ']') {
7264 RExC_parse = oregcomp_parse;
b45f050a
JF
7265 vFAIL("Unmatched [");
7266 }
830247a4 7267 nextchar(pRExC_state);
a0d0e21e 7268 *flagp |= HASWIDTH|SIMPLE;
fac92740 7269 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7270 break;
b45f050a 7271 }
a0d0e21e 7272 case '(':
830247a4 7273 nextchar(pRExC_state);
3dab1dad 7274 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7275 if (ret == NULL) {
bf93d4cc 7276 if (flags & TRYAGAIN) {
830247a4 7277 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7278 /* Make parent create an empty node if needed. */
7279 *flagp |= TRYAGAIN;
7280 return(NULL);
7281 }
a0d0e21e 7282 goto tryagain;
bf93d4cc 7283 }
a0d0e21e
LW
7284 return(NULL);
7285 }
a3b492c3 7286 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7287 break;
7288 case '|':
7289 case ')':
7290 if (flags & TRYAGAIN) {
7291 *flagp |= TRYAGAIN;
7292 return NULL;
7293 }
b45f050a 7294 vFAIL("Internal urp");
a0d0e21e
LW
7295 /* Supposed to be caught earlier. */
7296 break;
85afd4ae 7297 case '{':
830247a4
IZ
7298 if (!regcurly(RExC_parse)) {
7299 RExC_parse++;
85afd4ae
CS
7300 goto defchar;
7301 }
7302 /* FALL THROUGH */
a0d0e21e
LW
7303 case '?':
7304 case '+':
7305 case '*':
830247a4 7306 RExC_parse++;
b45f050a 7307 vFAIL("Quantifier follows nothing");
a0d0e21e 7308 break;
ced7f090
KW
7309 case LATIN_SMALL_LETTER_SHARP_S:
7310 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7311 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7312#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7313#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7314 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7315#endif
a0a388a1 7316 do_foldchar:
56d400ed 7317 if (!LOC && FOLD) {
e64b1bd1 7318 U32 len,cp;
7cf3a6a3 7319 len=0; /* silence a spurious compiler warning */
56d400ed 7320 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7321 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7322 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7323 ret = reganode(pRExC_state, FOLDCHAR, cp);
7324 Set_Node_Length(ret, 1); /* MJD */
7325 nextchar(pRExC_state); /* kill whitespace under /x */
7326 return ret;
7327 }
7328 }
7329 goto outer_default;
a0d0e21e 7330 case '\\':
ee9b8eae
YO
7331 /* Special Escapes
7332
7333 This switch handles escape sequences that resolve to some kind
7334 of special regop and not to literal text. Escape sequnces that
7335 resolve to literal text are handled below in the switch marked
7336 "Literal Escapes".
7337
7338 Every entry in this switch *must* have a corresponding entry
7339 in the literal escape switch. However, the opposite is not
7340 required, as the default for this switch is to jump to the
7341 literal text handling code.
7342 */
a0a388a1 7343 switch ((U8)*++RExC_parse) {
ced7f090
KW
7344 case LATIN_SMALL_LETTER_SHARP_S:
7345 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7346 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 7347 goto do_foldchar;
ee9b8eae 7348 /* Special Escapes */
a0d0e21e 7349 case 'A':
830247a4
IZ
7350 RExC_seen_zerolen++;
7351 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7352 *flagp |= SIMPLE;
ee9b8eae 7353 goto finish_meta_pat;
a0d0e21e 7354 case 'G':
830247a4
IZ
7355 ret = reg_node(pRExC_state, GPOS);
7356 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7357 *flagp |= SIMPLE;
ee9b8eae
YO
7358 goto finish_meta_pat;
7359 case 'K':
7360 RExC_seen_zerolen++;
7361 ret = reg_node(pRExC_state, KEEPS);
7362 *flagp |= SIMPLE;
37923168
RGS
7363 /* XXX:dmq : disabling in-place substitution seems to
7364 * be necessary here to avoid cases of memory corruption, as
7365 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7366 */
7367 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7368 goto finish_meta_pat;
a0d0e21e 7369 case 'Z':
830247a4 7370 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7371 *flagp |= SIMPLE;
a1917ab9 7372 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7373 goto finish_meta_pat;
b85d18e9 7374 case 'z':
830247a4 7375 ret = reg_node(pRExC_state, EOS);
b85d18e9 7376 *flagp |= SIMPLE;
830247a4 7377 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7378 goto finish_meta_pat;
4a2d328f 7379 case 'C':
f33976b4
DB
7380 ret = reg_node(pRExC_state, CANY);
7381 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7382 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7383 goto finish_meta_pat;
a0ed51b3 7384 case 'X':
830247a4 7385 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7386 *flagp |= HASWIDTH;
ee9b8eae 7387 goto finish_meta_pat;
a0d0e21e 7388 case 'w':
a12cf05f
KW
7389 if (LOC) {
7390 ret = reg_node(pRExC_state, (U8)(ALNUML));
7391 } else {
7392 ret = reg_node(pRExC_state, (U8)(ALNUM));
7393 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7394 }
a0d0e21e 7395 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7396 goto finish_meta_pat;
a0d0e21e 7397 case 'W':
a12cf05f
KW
7398 if (LOC) {
7399 ret = reg_node(pRExC_state, (U8)(NALNUML));
7400 } else {
7401 ret = reg_node(pRExC_state, (U8)(NALNUM));
7402 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7403 }
a0d0e21e 7404 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7405 goto finish_meta_pat;
a0d0e21e 7406 case 'b':
830247a4
IZ
7407 RExC_seen_zerolen++;
7408 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7409 if (LOC) {
7410 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7411 } else {
7412 ret = reg_node(pRExC_state, (U8)(BOUND));
7413 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7414 }
a0d0e21e 7415 *flagp |= SIMPLE;
ee9b8eae 7416 goto finish_meta_pat;
a0d0e21e 7417 case 'B':
830247a4
IZ
7418 RExC_seen_zerolen++;
7419 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7420 if (LOC) {
7421 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7422 } else {
7423 ret = reg_node(pRExC_state, (U8)(NBOUND));
7424 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7425 }
a0d0e21e 7426 *flagp |= SIMPLE;
ee9b8eae 7427 goto finish_meta_pat;
a0d0e21e 7428 case 's':
a12cf05f
KW
7429 if (LOC) {
7430 ret = reg_node(pRExC_state, (U8)(SPACEL));
7431 } else {
7432 ret = reg_node(pRExC_state, (U8)(SPACE));
7433 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7434 }
a0d0e21e 7435 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7436 goto finish_meta_pat;
a0d0e21e 7437 case 'S':
a12cf05f
KW
7438 if (LOC) {
7439 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7440 } else {
7441 ret = reg_node(pRExC_state, (U8)(NSPACE));
7442 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7443 }
a0d0e21e 7444 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7445 goto finish_meta_pat;
a0d0e21e 7446 case 'd':
6ab9ea91
KW
7447 if (LOC) {
7448 ret = reg_node(pRExC_state, (U8)(DIGITL));
7449 } else {
7450 ret = reg_node(pRExC_state, (U8)(DIGIT));
7451 }
a0d0e21e 7452 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7453 goto finish_meta_pat;
a0d0e21e 7454 case 'D':
6ab9ea91
KW
7455 if (LOC) {
7456 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7457 } else {
7458 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7459 }
a0d0e21e 7460 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7461 goto finish_meta_pat;
e1d1eefb
YO
7462 case 'R':
7463 ret = reg_node(pRExC_state, LNBREAK);
7464 *flagp |= HASWIDTH|SIMPLE;
7465 goto finish_meta_pat;
7466 case 'h':
7467 ret = reg_node(pRExC_state, HORIZWS);
7468 *flagp |= HASWIDTH|SIMPLE;
7469 goto finish_meta_pat;
7470 case 'H':
7471 ret = reg_node(pRExC_state, NHORIZWS);
7472 *flagp |= HASWIDTH|SIMPLE;
7473 goto finish_meta_pat;
ee9b8eae 7474 case 'v':
e1d1eefb
YO
7475 ret = reg_node(pRExC_state, VERTWS);
7476 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7477 goto finish_meta_pat;
7478 case 'V':
e1d1eefb
YO
7479 ret = reg_node(pRExC_state, NVERTWS);
7480 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7481 finish_meta_pat:
830247a4 7482 nextchar(pRExC_state);
fac92740 7483 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7484 break;
a14b48bc
LW
7485 case 'p':
7486 case 'P':
3568d838 7487 {
3dab1dad 7488 char* const oldregxend = RExC_end;
d008bc60 7489#ifdef DEBUGGING
ccb2c380 7490 char* parse_start = RExC_parse - 2;
d008bc60 7491#endif
a14b48bc 7492
830247a4 7493 if (RExC_parse[1] == '{') {
3568d838 7494 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7495 RExC_end = strchr(RExC_parse, '}');
7496 if (!RExC_end) {
3dab1dad 7497 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7498 RExC_parse += 2;
7499 RExC_end = oldregxend;
0da60cf5 7500 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7501 }
830247a4 7502 RExC_end++;
a14b48bc 7503 }
af6f566e 7504 else {
830247a4 7505 RExC_end = RExC_parse + 2;
af6f566e
HS
7506 if (RExC_end > oldregxend)
7507 RExC_end = oldregxend;
7508 }
830247a4 7509 RExC_parse--;
a14b48bc 7510
3dab1dad 7511 ret = regclass(pRExC_state,depth+1);
a14b48bc 7512
830247a4
IZ
7513 RExC_end = oldregxend;
7514 RExC_parse--;
ccb2c380
MP
7515
7516 Set_Node_Offset(ret, parse_start + 2);
7517 Set_Node_Cur_Length(ret);
830247a4 7518 nextchar(pRExC_state);
a14b48bc
LW
7519 *flagp |= HASWIDTH|SIMPLE;
7520 }
7521 break;
fc8cd66c 7522 case 'N':
afefe6bf 7523 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7524 multicharacter. join_exact() will join them up later on.
7525 Also this makes sure that things like /\N{BLAH}+/ and
7526 \N{BLAH} being multi char Just Happen. dmq*/
7527 ++RExC_parse;
afefe6bf 7528 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7529 break;
0a4db386 7530 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7531 parse_named_seq:
81714fb9
YO
7532 {
7533 char ch= RExC_parse[1];
1f1031fe
YO
7534 if (ch != '<' && ch != '\'' && ch != '{') {
7535 RExC_parse++;
7536 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7537 } else {
1f1031fe
YO
7538 /* this pretty much dupes the code for (?P=...) in reg(), if
7539 you change this make sure you change that */
81714fb9 7540 char* name_start = (RExC_parse += 2);
2eccd3b2 7541 U32 num = 0;
0a4db386
YO
7542 SV *sv_dat = reg_scan_name(pRExC_state,
7543 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7544 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7545 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7546 vFAIL2("Sequence %.3s... not terminated",parse_start);
7547
7548 if (!SIZE_ONLY) {
7549 num = add_data( pRExC_state, 1, "S" );
7550 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7551 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7552 }
7553
81714fb9
YO
7554 RExC_sawback = 1;
7555 ret = reganode(pRExC_state,
4444fd9f
KW
7556 ((! FOLD)
7557 ? NREF
7558 : (UNI_SEMANTICS)
7559 ? NREFFU
7560 : (LOC)
7561 ? NREFFL
7562 : NREFF),
7563 num);
81714fb9 7564 *flagp |= HASWIDTH;
1f1031fe 7565
81714fb9
YO
7566 /* override incorrect value set in reganode MJD */
7567 Set_Node_Offset(ret, parse_start+1);
7568 Set_Node_Cur_Length(ret); /* MJD */
7569 nextchar(pRExC_state);
1f1031fe 7570
81714fb9
YO
7571 }
7572 break;
1f1031fe 7573 }
2bf803e2 7574 case 'g':
a0d0e21e
LW
7575 case '1': case '2': case '3': case '4':
7576 case '5': case '6': case '7': case '8': case '9':
7577 {
c74340f9 7578 I32 num;
2bf803e2
YO
7579 bool isg = *RExC_parse == 'g';
7580 bool isrel = 0;
7581 bool hasbrace = 0;
7582 if (isg) {
c74340f9 7583 RExC_parse++;
2bf803e2
YO
7584 if (*RExC_parse == '{') {
7585 RExC_parse++;
7586 hasbrace = 1;
7587 }
7588 if (*RExC_parse == '-') {
7589 RExC_parse++;
7590 isrel = 1;
7591 }
1f1031fe
YO
7592 if (hasbrace && !isDIGIT(*RExC_parse)) {
7593 if (isrel) RExC_parse--;
7594 RExC_parse -= 2;
7595 goto parse_named_seq;
7596 } }
c74340f9 7597 num = atoi(RExC_parse);
b72d83b2
RGS
7598 if (isg && num == 0)
7599 vFAIL("Reference to invalid group 0");
c74340f9 7600 if (isrel) {
5624f11d 7601 num = RExC_npar - num;
c74340f9
YO
7602 if (num < 1)
7603 vFAIL("Reference to nonexistent or unclosed group");
7604 }
2bf803e2 7605 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7606 goto defchar;
7607 else {
3dab1dad 7608 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7609 while (isDIGIT(*RExC_parse))
7610 RExC_parse++;
1f1031fe
YO
7611 if (parse_start == RExC_parse - 1)
7612 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7613 if (hasbrace) {
7614 if (*RExC_parse != '}')
7615 vFAIL("Unterminated \\g{...} pattern");
7616 RExC_parse++;
7617 }
c74340f9
YO
7618 if (!SIZE_ONLY) {
7619 if (num > (I32)RExC_rx->nparens)
7620 vFAIL("Reference to nonexistent group");
c74340f9 7621 }
830247a4 7622 RExC_sawback = 1;
eb160463 7623 ret = reganode(pRExC_state,
4444fd9f
KW
7624 ((! FOLD)
7625 ? REF
7626 : (UNI_SEMANTICS)
7627 ? REFFU
7628 : (LOC)
7629 ? REFFL
7630 : REFF),
7631 num);
a0d0e21e 7632 *flagp |= HASWIDTH;
2af232bd 7633
fac92740 7634 /* override incorrect value set in reganode MJD */
2af232bd 7635 Set_Node_Offset(ret, parse_start+1);
fac92740 7636 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7637 RExC_parse--;
7638 nextchar(pRExC_state);
a0d0e21e
LW
7639 }
7640 }
7641 break;
7642 case '\0':
830247a4 7643 if (RExC_parse >= RExC_end)
b45f050a 7644 FAIL("Trailing \\");
a0d0e21e
LW
7645 /* FALL THROUGH */
7646 default:
a0288114 7647 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7648 back into the quick-grab loop below */
45948336 7649 parse_start--;
a0d0e21e
LW
7650 goto defchar;
7651 }
7652 break;
4633a7c4
LW
7653
7654 case '#':
bbe252da 7655 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7656 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7657 goto tryagain;
7658 }
7659 /* FALL THROUGH */
7660
f9a79580
RGS
7661 default:
7662 outer_default:{
ba210ebe 7663 register STRLEN len;
58ae7d3f 7664 register UV ender;
a0d0e21e 7665 register char *p;
3dab1dad 7666 char *s;
80aecb99 7667 STRLEN foldlen;
89ebb4a3 7668 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7669
7670 parse_start = RExC_parse - 1;
a0d0e21e 7671
830247a4 7672 RExC_parse++;
a0d0e21e
LW
7673
7674 defchar:
58ae7d3f 7675 ender = 0;
eb160463 7676 ret = reg_node(pRExC_state,
2c2b7f86
KW
7677 (U8) ((! FOLD) ? EXACT
7678 : (LOC)
7679 ? EXACTFL
7680 : (UNI_SEMANTICS)
7681 ? EXACTFU
7682 : EXACTF)
7683 );
cd439c50 7684 s = STRING(ret);
830247a4
IZ
7685 for (len = 0, p = RExC_parse - 1;
7686 len < 127 && p < RExC_end;
a0d0e21e
LW
7687 len++)
7688 {
3dab1dad 7689 char * const oldp = p;
5b5a24f7 7690
bbe252da 7691 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7692 p = regwhite( pRExC_state, p );
f9a79580 7693 switch ((U8)*p) {
ced7f090
KW
7694 case LATIN_SMALL_LETTER_SHARP_S:
7695 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7696 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 7697 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7698 goto normal_default;
a0d0e21e
LW
7699 case '^':
7700 case '$':
7701 case '.':
7702 case '[':
7703 case '(':
7704 case ')':
7705 case '|':
7706 goto loopdone;
7707 case '\\':
ee9b8eae
YO
7708 /* Literal Escapes Switch
7709
7710 This switch is meant to handle escape sequences that
7711 resolve to a literal character.
7712
7713 Every escape sequence that represents something
7714 else, like an assertion or a char class, is handled
7715 in the switch marked 'Special Escapes' above in this
7716 routine, but also has an entry here as anything that
7717 isn't explicitly mentioned here will be treated as
7718 an unescaped equivalent literal.
7719 */
7720
a0a388a1 7721 switch ((U8)*++p) {
ee9b8eae 7722 /* These are all the special escapes. */
ced7f090
KW
7723 case LATIN_SMALL_LETTER_SHARP_S:
7724 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7725 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
7726 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7727 goto normal_default;
ee9b8eae
YO
7728 case 'A': /* Start assertion */
7729 case 'b': case 'B': /* Word-boundary assertion*/
7730 case 'C': /* Single char !DANGEROUS! */
7731 case 'd': case 'D': /* digit class */
7732 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7733 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7734 case 'k': case 'K': /* named backref, keep marker */
7735 case 'N': /* named char sequence */
38a44b82 7736 case 'p': case 'P': /* Unicode property */
e1d1eefb 7737 case 'R': /* LNBREAK */
ee9b8eae 7738 case 's': case 'S': /* space class */
e1d1eefb 7739 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7740 case 'w': case 'W': /* word class */
7741 case 'X': /* eXtended Unicode "combining character sequence" */
7742 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7743 --p;
7744 goto loopdone;
ee9b8eae
YO
7745
7746 /* Anything after here is an escape that resolves to a
7747 literal. (Except digits, which may or may not)
7748 */
a0d0e21e
LW
7749 case 'n':
7750 ender = '\n';
7751 p++;
a687059c 7752 break;
a0d0e21e
LW
7753 case 'r':
7754 ender = '\r';
7755 p++;
a687059c 7756 break;
a0d0e21e
LW
7757 case 't':
7758 ender = '\t';
7759 p++;
a687059c 7760 break;
a0d0e21e
LW
7761 case 'f':
7762 ender = '\f';
7763 p++;
a687059c 7764 break;
a0d0e21e 7765 case 'e':
c7f1f016 7766 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7767 p++;
a687059c 7768 break;
a0d0e21e 7769 case 'a':
c7f1f016 7770 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7771 p++;
a687059c 7772 break;
f0a2b745
KW
7773 case 'o':
7774 {
7775 STRLEN brace_len = len;
00c0cb6d 7776 UV result;
454155d9
KW
7777 const char* error_msg;
7778
7779 bool valid = grok_bslash_o(p,
7780 &result,
7781 &brace_len,
7782 &error_msg,
7783 1);
7784 p += brace_len;
7785 if (! valid) {
7786 RExC_parse = p; /* going to die anyway; point
7787 to exact spot of failure */
f0a2b745
KW
7788 vFAIL(error_msg);
7789 }
00c0cb6d
DG
7790 else
7791 {
7792 ender = result;
7793 }
f0a2b745
KW
7794 if (PL_encoding && ender < 0x100) {
7795 goto recode_encoding;
7796 }
7797 if (ender > 0xff) {
62fed28b 7798 REQUIRE_UTF8;
f0a2b745
KW
7799 }
7800 break;
7801 }
a0d0e21e 7802 case 'x':
a0ed51b3 7803 if (*++p == '{') {
1df70142 7804 char* const e = strchr(p, '}');
b81d288d 7805
b45f050a 7806 if (!e) {
830247a4 7807 RExC_parse = p + 1;
b45f050a
JF
7808 vFAIL("Missing right brace on \\x{}");
7809 }
de5f0749 7810 else {
a4c04bdc
NC
7811 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7812 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7813 STRLEN numlen = e - p - 1;
53305cf1 7814 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7815 if (ender > 0xff)
62fed28b 7816 REQUIRE_UTF8;
a0ed51b3
LW
7817 p = e + 1;
7818 }
a0ed51b3
LW
7819 }
7820 else {
a4c04bdc 7821 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7822 STRLEN numlen = 2;
53305cf1 7823 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7824 p += numlen;
7825 }
9e08bc66
TS
7826 if (PL_encoding && ender < 0x100)
7827 goto recode_encoding;
a687059c 7828 break;
a0d0e21e
LW
7829 case 'c':
7830 p++;
f9d13529 7831 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7832 break;
a0d0e21e
LW
7833 case '0': case '1': case '2': case '3':case '4':
7834 case '5': case '6': case '7': case '8':case '9':
7835 if (*p == '0' ||
ca67da41 7836 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7837 {
7838 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7839 STRLEN numlen = 3;
53305cf1 7840 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7841 if (ender > 0xff) {
62fed28b 7842 REQUIRE_UTF8;
609122bd 7843 }
a0d0e21e
LW
7844 p += numlen;
7845 }
7846 else {
7847 --p;
7848 goto loopdone;
a687059c 7849 }
9e08bc66
TS
7850 if (PL_encoding && ender < 0x100)
7851 goto recode_encoding;
7852 break;
7853 recode_encoding:
7854 {
7855 SV* enc = PL_encoding;
7856 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7857 if (!enc && SIZE_ONLY)
7858 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7859 REQUIRE_UTF8;
9e08bc66 7860 }
a687059c 7861 break;
a0d0e21e 7862 case '\0':
830247a4 7863 if (p >= RExC_end)
b45f050a 7864 FAIL("Trailing \\");
a687059c 7865 /* FALL THROUGH */
a0d0e21e 7866 default:
668c081a
NC
7867 if (!SIZE_ONLY&& isALPHA(*p))
7868 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7869 goto normal_default;
a0d0e21e
LW
7870 }
7871 break;
a687059c 7872 default:
a0ed51b3 7873 normal_default:
fd400ab9 7874 if (UTF8_IS_START(*p) && UTF) {
1df70142 7875 STRLEN numlen;
5e12f4fb 7876 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7877 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7878 p += numlen;
7879 }
7880 else
7881 ender = *p++;
a0d0e21e 7882 break;
a687059c 7883 }
bcdf7404
YO
7884 if ( RExC_flags & RXf_PMf_EXTENDED)
7885 p = regwhite( pRExC_state, p );
60a8b682
JH
7886 if (UTF && FOLD) {
7887 /* Prime the casefolded buffer. */
ac7e0132 7888 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7889 }
bcdf7404 7890 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7891 if (len)
7892 p = oldp;
16ea2a2e 7893 else if (UTF) {
80aecb99 7894 if (FOLD) {
60a8b682 7895 /* Emit all the Unicode characters. */
1df70142 7896 STRLEN numlen;
80aecb99
JH
7897 for (foldbuf = tmpbuf;
7898 foldlen;
7899 foldlen -= numlen) {
7900 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7901 if (numlen > 0) {
71207a34 7902 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7903 s += unilen;
7904 len += unilen;
7905 /* In EBCDIC the numlen
7906 * and unilen can differ. */
9dc45d57 7907 foldbuf += numlen;
47654450
JH
7908 if (numlen >= foldlen)
7909 break;
9dc45d57
JH
7910 }
7911 else
7912 break; /* "Can't happen." */
80aecb99
JH
7913 }
7914 }
7915 else {
71207a34 7916 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7917 if (unilen > 0) {
0ebc6274
JH
7918 s += unilen;
7919 len += unilen;
9dc45d57 7920 }
80aecb99 7921 }
a0ed51b3 7922 }
a0d0e21e
LW
7923 else {
7924 len++;
eb160463 7925 REGC((char)ender, s++);
a0d0e21e
LW
7926 }
7927 break;
a687059c 7928 }
16ea2a2e 7929 if (UTF) {
80aecb99 7930 if (FOLD) {
60a8b682 7931 /* Emit all the Unicode characters. */
1df70142 7932 STRLEN numlen;
80aecb99
JH
7933 for (foldbuf = tmpbuf;
7934 foldlen;
7935 foldlen -= numlen) {
7936 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7937 if (numlen > 0) {
71207a34 7938 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7939 len += unilen;
7940 s += unilen;
7941 /* In EBCDIC the numlen
7942 * and unilen can differ. */
9dc45d57 7943 foldbuf += numlen;
47654450
JH
7944 if (numlen >= foldlen)
7945 break;
9dc45d57
JH
7946 }
7947 else
7948 break;
80aecb99
JH
7949 }
7950 }
7951 else {
71207a34 7952 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7953 if (unilen > 0) {
0ebc6274
JH
7954 s += unilen;
7955 len += unilen;
9dc45d57 7956 }
80aecb99
JH
7957 }
7958 len--;
a0ed51b3
LW
7959 }
7960 else
eb160463 7961 REGC((char)ender, s++);
a0d0e21e
LW
7962 }
7963 loopdone:
830247a4 7964 RExC_parse = p - 1;
fac92740 7965 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7966 nextchar(pRExC_state);
793db0cb
JH
7967 {
7968 /* len is STRLEN which is unsigned, need to copy to signed */
7969 IV iv = len;
7970 if (iv < 0)
7971 vFAIL("Internal disaster");
7972 }
a0d0e21e
LW
7973 if (len > 0)
7974 *flagp |= HASWIDTH;
090f7165 7975 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7976 *flagp |= SIMPLE;
3dab1dad 7977
cd439c50 7978 if (SIZE_ONLY)
830247a4 7979 RExC_size += STR_SZ(len);
3dab1dad
YO
7980 else {
7981 STR_LEN(ret) = len;
830247a4 7982 RExC_emit += STR_SZ(len);
07be1b83 7983 }
3dab1dad 7984 }
a0d0e21e
LW
7985 break;
7986 }
a687059c 7987
a0d0e21e 7988 return(ret);
a687059c
LW
7989}
7990
873ef191 7991STATIC char *
bcdf7404 7992S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7993{
bcdf7404 7994 const char *e = RExC_end;
7918f24d
NC
7995
7996 PERL_ARGS_ASSERT_REGWHITE;
7997
5b5a24f7
CS
7998 while (p < e) {
7999 if (isSPACE(*p))
8000 ++p;
8001 else if (*p == '#') {
bcdf7404 8002 bool ended = 0;
5b5a24f7 8003 do {
bcdf7404
YO
8004 if (*p++ == '\n') {
8005 ended = 1;
8006 break;
8007 }
8008 } while (p < e);
8009 if (!ended)
8010 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8011 }
8012 else
8013 break;
8014 }
8015 return p;
8016}
8017
b8c5462f
JH
8018/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8019 Character classes ([:foo:]) can also be negated ([:^foo:]).
8020 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8021 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8022 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8023
8024#define POSIXCC_DONE(c) ((c) == ':')
8025#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8026#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8027
b8c5462f 8028STATIC I32
830247a4 8029S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8030{
97aff369 8031 dVAR;
936ed897 8032 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8033
7918f24d
NC
8034 PERL_ARGS_ASSERT_REGPPOSIXCC;
8035
830247a4 8036 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8037 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8038 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8039 const char c = UCHARAT(RExC_parse);
097eb12c 8040 char* const s = RExC_parse++;
b81d288d 8041
9a86a77b 8042 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8043 RExC_parse++;
8044 if (RExC_parse == RExC_end)
620e46c5 8045 /* Grandfather lone [:, [=, [. */
830247a4 8046 RExC_parse = s;
620e46c5 8047 else {
3dab1dad 8048 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
8049 assert(*t == c);
8050
9a86a77b 8051 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 8052 const char *posixcc = s + 1;
830247a4 8053 RExC_parse++; /* skip over the ending ] */
3dab1dad 8054
b8c5462f 8055 if (*s == ':') {
1df70142
AL
8056 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8057 const I32 skip = t - posixcc;
80916619
NC
8058
8059 /* Initially switch on the length of the name. */
8060 switch (skip) {
8061 case 4:
3dab1dad
YO
8062 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8063 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 8064 break;
80916619
NC
8065 case 5:
8066 /* Names all of length 5. */
8067 /* alnum alpha ascii blank cntrl digit graph lower
8068 print punct space upper */
8069 /* Offset 4 gives the best switch position. */
8070 switch (posixcc[4]) {
8071 case 'a':
3dab1dad
YO
8072 if (memEQ(posixcc, "alph", 4)) /* alpha */
8073 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
8074 break;
8075 case 'e':
3dab1dad
YO
8076 if (memEQ(posixcc, "spac", 4)) /* space */
8077 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
8078 break;
8079 case 'h':
3dab1dad
YO
8080 if (memEQ(posixcc, "grap", 4)) /* graph */
8081 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
8082 break;
8083 case 'i':
3dab1dad
YO
8084 if (memEQ(posixcc, "asci", 4)) /* ascii */
8085 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
8086 break;
8087 case 'k':
3dab1dad
YO
8088 if (memEQ(posixcc, "blan", 4)) /* blank */
8089 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
8090 break;
8091 case 'l':
3dab1dad
YO
8092 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8093 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
8094 break;
8095 case 'm':
3dab1dad
YO
8096 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8097 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8098 break;
8099 case 'r':
3dab1dad
YO
8100 if (memEQ(posixcc, "lowe", 4)) /* lower */
8101 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8102 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8103 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8104 break;
8105 case 't':
3dab1dad
YO
8106 if (memEQ(posixcc, "digi", 4)) /* digit */
8107 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8108 else if (memEQ(posixcc, "prin", 4)) /* print */
8109 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8110 else if (memEQ(posixcc, "punc", 4)) /* punct */
8111 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8112 break;
b8c5462f
JH
8113 }
8114 break;
80916619 8115 case 6:
3dab1dad
YO
8116 if (memEQ(posixcc, "xdigit", 6))
8117 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8118 break;
8119 }
80916619
NC
8120
8121 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8122 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8123 t - s - 1, s + 1);
80916619
NC
8124 assert (posixcc[skip] == ':');
8125 assert (posixcc[skip+1] == ']');
b45f050a 8126 } else if (!SIZE_ONLY) {
b8c5462f 8127 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8128
830247a4 8129 /* adjust RExC_parse so the warning shows after
b45f050a 8130 the class closes */
9a86a77b 8131 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8132 RExC_parse++;
b45f050a
JF
8133 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8134 }
b8c5462f
JH
8135 } else {
8136 /* Maternal grandfather:
8137 * "[:" ending in ":" but not in ":]" */
830247a4 8138 RExC_parse = s;
767d463e 8139 }
620e46c5
JH
8140 }
8141 }
8142
b8c5462f
JH
8143 return namedclass;
8144}
8145
8146STATIC void
830247a4 8147S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8148{
97aff369 8149 dVAR;
7918f24d
NC
8150
8151 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8152
3dab1dad 8153 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8154 const char *s = RExC_parse;
8155 const char c = *s++;
b8c5462f 8156
3dab1dad 8157 while (isALNUM(*s))
b8c5462f
JH
8158 s++;
8159 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8160 ckWARN3reg(s+2,
8161 "POSIX syntax [%c %c] belongs inside character classes",
8162 c, c);
b45f050a
JF
8163
8164 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8165 if (POSIXCC_NOTYET(c)) {
830247a4 8166 /* adjust RExC_parse so the error shows after
b45f050a 8167 the class closes */
9a86a77b 8168 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8169 NOOP;
b45f050a
JF
8170 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8171 }
b8c5462f
JH
8172 }
8173 }
620e46c5
JH
8174}
8175
0399b215 8176/* No locale test */
e1d1eefb
YO
8177#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8178ANYOF_##NAME: \
8179 for (value = 0; value < 256; value++) \
8180 if (TEST) \
2283d326 8181 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8182 yesno = '+'; \
8183 what = WORD; \
8184 break; \
8185case ANYOF_N##NAME: \
8186 for (value = 0; value < 256; value++) \
8187 if (!TEST) \
2283d326 8188 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8189 yesno = '!'; \
8190 what = WORD; \
8191 break
89836f1f 8192
a12cf05f
KW
8193/* Like the above, but there are differences if we are in uni-8-bit or not, so
8194 * there are two tests passed in, to use depending on that. There aren't any
8195 * cases where the label is different from the name, so no need for that
8196 * parameter */
0399b215 8197#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
a12cf05f
KW
8198ANYOF_##NAME: \
8199 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8200 else if (UNI_SEMANTICS) { \
8201 for (value = 0; value < 256; value++) { \
2283d326
KW
8202 if (TEST_8) stored += \
8203 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8204 } \
8205 } \
8206 else { \
3c8eea2b 8207 for (value = 0; value < 128; value++) { \
2283d326 8208 if (TEST_7) stored += \
3c8eea2b 8209 S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
a12cf05f
KW
8210 } \
8211 } \
8212 yesno = '+'; \
8213 what = WORD; \
8214 break; \
8215case ANYOF_N##NAME: \
8216 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8217 else if (UNI_SEMANTICS) { \
8218 for (value = 0; value < 256; value++) { \
2283d326
KW
8219 if (! TEST_8) stored += \
8220 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8221 } \
8222 } \
8223 else { \
3c8eea2b 8224 for (value = 0; value < 128; value++) { \
2283d326
KW
8225 if (! TEST_7) stored += \
8226 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f 8227 } \
3c8eea2b
KW
8228 for (value = 128; value < 256; value++) { \
8229 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8230 } \
a12cf05f
KW
8231 } \
8232 yesno = '!'; \
8233 what = WORD; \
8234 break
8235
da7fcca4
YO
8236/*
8237 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8238 so that it is possible to override the option here without having to
8239 rebuild the entire core. as we are required to do if we change regcomp.h
8240 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8241*/
8242#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8243#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8244#endif
8245
8246#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8247#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8248#else
8249#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8250#endif
8251
2283d326
KW
8252STATIC U8
8253S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8254{
8255
8256 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8257 * Locale folding is done at run-time, so this function should not be
8258 * called for nodes that are for locales.
8259 *
8260 * This function simply sets the bit corresponding to the fold of the input
8261 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8262 * 'F' is 'f'.
8263 *
8264 * It also sets any necessary flags, and returns the number of bits that
8265 * actually changed from 0 to 1 */
8266
8267 U8 stored = 0;
2283d326
KW
8268 U8 fold;
8269
8270 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8271 : PL_fold[value];
8272
8273 /* It assumes the bit for 'value' has already been set */
8274 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8275 ANYOF_BITMAP_SET(node, fold);
8276 stored++;
8277 }
8278
8279 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8280 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8281 * semantics are called for */
8282 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8283 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8284 }
e2e75538 8285 else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
2283d326
KW
8286 || (! UNI_SEMANTICS
8287 && ! isASCII(value)
8288 && PL_fold_latin1[value] != value))
8289 { /* A character that has a fold outside of Latin1 matches outside the
8290 bitmap, but only when the target string is utf8. Similarly when we
8291 don't have unicode semantics for the above ASCII Latin-1 characters,
8292 and they have a fold, they should match if the target is utf8, and
8293 not otherwise */
8294 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8295 }
8296
8297 return stored;
8298}
8299
8300
8301PERL_STATIC_INLINE U8
41b0f1c1 8302S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
2283d326
KW
8303{
8304 /* This inline function sets a bit in the bitmap if not already set, and if
8305 * appropriate, its fold, returning the number of bits that actually
8306 * changed from 0 to 1 */
8307
8308 U8 stored;
8309
8310 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8311 return 0;
8312 }
8313
8314 ANYOF_BITMAP_SET(node, value);
8315 stored = 1;
8316
8317 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8318 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8319 }
8320
8321 return stored;
8322}
8323
7f6f358c
YO
8324/*
8325 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8326 matches the pattern or if the pattern matches a single char only and
8327 that char is < 256 and we are case insensitive then we produce an
8328 EXACT node instead.
7f6f358c 8329*/
89836f1f 8330
76e3520e 8331STATIC regnode *
3dab1dad 8332S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8333{
97aff369 8334 dVAR;
9a86a77b 8335 register UV nextvalue;
3568d838 8336 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8337 register IV range = 0;
e1d1eefb 8338 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8339 register regnode *ret;
ba210ebe 8340 STRLEN numlen;
ffc61ed2 8341 IV namedclass;
cbbf8932 8342 char *rangebegin = NULL;
936ed897 8343 bool need_class = 0;
c445ea15 8344 SV *listsv = NULL;
ffc61ed2 8345 UV n;
cbbf8932 8346 AV* unicode_alternate = NULL;
1b2d223b
JH
8347#ifdef EBCDIC
8348 UV literal_endpoint = 0;
8349#endif
ffc130aa 8350 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 8351
3dab1dad 8352 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8353 case we need to change the emitted regop to an EXACT. */
07be1b83 8354 const char * orig_parse = RExC_parse;
72f13be8 8355 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8356
8357 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8358#ifndef DEBUGGING
8359 PERL_UNUSED_ARG(depth);
8360#endif
72f13be8 8361
3dab1dad 8362 DEBUG_PARSE("clas");
7f6f358c
YO
8363
8364 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8365 ret = reganode(pRExC_state, ANYOF, 0);
8366
8367 if (!SIZE_ONLY)
8368 ANYOF_FLAGS(ret) = 0;
8369
9a86a77b 8370 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8371 RExC_naughty++;
8372 RExC_parse++;
8373 if (!SIZE_ONLY)
8374 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8375 }
a0d0e21e 8376
73060fc4 8377 if (SIZE_ONLY) {
830247a4 8378 RExC_size += ANYOF_SKIP;
3a15e693
KW
8379#ifdef ANYOF_ADD_LOC_SKIP
8380 if (LOC) {
8381 RExC_size += ANYOF_ADD_LOC_SKIP;
8382 }
8383#endif
73060fc4
JH
8384 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8385 }
936ed897 8386 else {
830247a4 8387 RExC_emit += ANYOF_SKIP;
3a15e693 8388 if (LOC) {
936ed897 8389 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693
KW
8390#ifdef ANYOF_ADD_LOC_SKIP
8391 RExC_emit += ANYOF_ADD_LOC_SKIP;
8392#endif
8393 }
ffc61ed2 8394 ANYOF_BITMAP_ZERO(ret);
396482e1 8395 listsv = newSVpvs("# comment\n");
a0d0e21e 8396 }
b8c5462f 8397
9a86a77b
JH
8398 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8399
b938889d 8400 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8401 checkposixcc(pRExC_state);
b8c5462f 8402
f064b6ad
HS
8403 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8404 if (UCHARAT(RExC_parse) == ']')
8405 goto charclassloop;
ffc61ed2 8406
fc8cd66c 8407parseit:
9a86a77b 8408 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8409
8410 charclassloop:
8411
8412 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8413
73b437c8 8414 if (!range)
830247a4 8415 rangebegin = RExC_parse;
ffc61ed2 8416 if (UTF) {
5e12f4fb 8417 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8418 RExC_end - RExC_parse,
9f7f3913 8419 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8420 RExC_parse += numlen;
8421 }
8422 else
8423 value = UCHARAT(RExC_parse++);
7f6f358c 8424
9a86a77b
JH
8425 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8426 if (value == '[' && POSIXCC(nextvalue))
830247a4 8427 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8428 else if (value == '\\') {
ffc61ed2 8429 if (UTF) {
5e12f4fb 8430 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8431 RExC_end - RExC_parse,
9f7f3913 8432 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8433 RExC_parse += numlen;
8434 }
8435 else
8436 value = UCHARAT(RExC_parse++);
470c3474 8437 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8438 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8439 * be a problem later if we want switch on Unicode.
8440 * A similar issue a little bit later when switching on
8441 * namedclass. --jhi */
ffc61ed2 8442 switch ((I32)value) {
b8c5462f
JH
8443 case 'w': namedclass = ANYOF_ALNUM; break;
8444 case 'W': namedclass = ANYOF_NALNUM; break;
8445 case 's': namedclass = ANYOF_SPACE; break;
8446 case 'S': namedclass = ANYOF_NSPACE; break;
8447 case 'd': namedclass = ANYOF_DIGIT; break;
8448 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8449 case 'v': namedclass = ANYOF_VERTWS; break;
8450 case 'V': namedclass = ANYOF_NVERTWS; break;
8451 case 'h': namedclass = ANYOF_HORIZWS; break;
8452 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8453 case 'N': /* Handle \N{NAME} in class */
8454 {
8455 /* We only pay attention to the first char of
8456 multichar strings being returned. I kinda wonder
8457 if this makes sense as it does change the behaviour
8458 from earlier versions, OTOH that behaviour was broken
8459 as well. */
8460 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8461 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8462 goto parseit;
8463 }
8464 value= v;
8465 }
8466 break;
ffc61ed2
JH
8467 case 'p':
8468 case 'P':
3dab1dad
YO
8469 {
8470 char *e;
af6f566e 8471 if (RExC_parse >= RExC_end)
2a4859cd 8472 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8473 if (*RExC_parse == '{') {
1df70142 8474 const U8 c = (U8)value;
ffc61ed2
JH
8475 e = strchr(RExC_parse++, '}');
8476 if (!e)
0da60cf5 8477 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8478 while (isSPACE(UCHARAT(RExC_parse)))
8479 RExC_parse++;
8480 if (e == RExC_parse)
0da60cf5 8481 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8482 n = e - RExC_parse;
ab13f0c7
JH
8483 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8484 n--;
ffc61ed2
JH
8485 }
8486 else {
8487 e = RExC_parse;
8488 n = 1;
8489 }
8490 if (!SIZE_ONLY) {
ab13f0c7
JH
8491 if (UCHARAT(RExC_parse) == '^') {
8492 RExC_parse++;
8493 n--;
8494 value = value == 'p' ? 'P' : 'p'; /* toggle */
8495 while (isSPACE(UCHARAT(RExC_parse))) {
8496 RExC_parse++;
8497 n--;
8498 }
8499 }
097eb12c
AL
8500 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8501 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8502 }
8503 RExC_parse = e + 1;
08fc12dd
KW
8504
8505 /* The \p could match something in the Latin1 range, hence
8506 * something that isn't utf8 */
3ff7ceb3 8507 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f81125e2 8508 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8509 }
f81125e2 8510 break;
b8c5462f
JH
8511 case 'n': value = '\n'; break;
8512 case 'r': value = '\r'; break;
8513 case 't': value = '\t'; break;
8514 case 'f': value = '\f'; break;
8515 case 'b': value = '\b'; break;
c7f1f016
NIS
8516 case 'e': value = ASCII_TO_NATIVE('\033');break;
8517 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8518 case 'o':
8519 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8520 {
8521 const char* error_msg;
8522 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8523 &value,
8524 &numlen,
454155d9
KW
8525 &error_msg,
8526 SIZE_ONLY);
8527 RExC_parse += numlen;
8528 if (! valid) {
8529 vFAIL(error_msg);
8530 }
f0a2b745 8531 }
f0a2b745
KW
8532 if (PL_encoding && value < 0x100) {
8533 goto recode_encoding;
8534 }
8535 break;
b8c5462f 8536 case 'x':
ffc61ed2 8537 if (*RExC_parse == '{') {
a4c04bdc
NC
8538 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8539 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8540 char * const e = strchr(RExC_parse++, '}');
b81d288d 8541 if (!e)
ffc61ed2 8542 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8543
8544 numlen = e - RExC_parse;
8545 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8546 RExC_parse = e + 1;
8547 }
8548 else {
a4c04bdc 8549 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8550 numlen = 2;
8551 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8552 RExC_parse += numlen;
8553 }
9e08bc66
TS
8554 if (PL_encoding && value < 0x100)
8555 goto recode_encoding;
b8c5462f
JH
8556 break;
8557 case 'c':
f9d13529 8558 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8559 break;
8560 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8561 case '5': case '6': case '7':
9e08bc66 8562 {
c99e91e9
KW
8563 /* Take 1-3 octal digits */
8564 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8565 numlen = 3;
8566 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8567 RExC_parse += numlen;
8568 if (PL_encoding && value < 0x100)
8569 goto recode_encoding;
8570 break;
8571 }
8572 recode_encoding:
8573 {
8574 SV* enc = PL_encoding;
8575 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8576 if (!enc && SIZE_ONLY)
8577 ckWARNreg(RExC_parse,
8578 "Invalid escape in the specified encoding");
9e08bc66
TS
8579 break;
8580 }
1028017a 8581 default:
c99e91e9
KW
8582 /* Allow \_ to not give an error */
8583 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8584 ckWARN2reg(RExC_parse,
8585 "Unrecognized escape \\%c in character class passed through",
8586 (int)value);
c99e91e9 8587 }
1028017a 8588 break;
b8c5462f 8589 }
ffc61ed2 8590 } /* end of \blah */
1b2d223b
JH
8591#ifdef EBCDIC
8592 else
8593 literal_endpoint++;
8594#endif
ffc61ed2
JH
8595
8596 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8597
2c63ecad
KW
8598 /* What matches in a locale is not known until runtime, so need to
8599 * (one time per class) allocate extra space to pass to regexec.
8600 * The space will contain a bit for each named class that is to be
8601 * matched against. This isn't needed for \p{} and pseudo-classes,
8602 * as they are not affected by locale, and hence are dealt with
8603 * separately */
8604 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8605 need_class = 1;
8606 if (SIZE_ONLY) {
3a15e693 8607#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8608 RExC_size += ANYOF_CLASS_ADD_SKIP;
3a15e693 8609#endif
2c63ecad
KW
8610 }
8611 else {
3a15e693 8612#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8613 RExC_emit += ANYOF_CLASS_ADD_SKIP;
3a15e693 8614#endif
2c63ecad
KW
8615 ANYOF_CLASS_ZERO(ret);
8616 }
9051cfd9 8617 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 8618 }
ffc61ed2
JH
8619
8620 /* a bad range like a-\d, a-[:digit:] ? */
8621 if (range) {
73b437c8 8622 if (!SIZE_ONLY) {
668c081a
NC
8623 const int w =
8624 RExC_parse >= rangebegin ?
8625 RExC_parse - rangebegin : 0;
8626 ckWARN4reg(RExC_parse,
b45f050a 8627 "False [] range \"%*.*s\"",
097eb12c 8628 w, w, rangebegin);
668c081a 8629
3568d838 8630 if (prevvalue < 256) {
2283d326
KW
8631 stored +=
8632 S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8633 stored +=
8634 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
ffc61ed2
JH
8635 }
8636 else {
ef87b810 8637 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 8638 Perl_sv_catpvf(aTHX_ listsv,
a1f3213b 8639 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8640 }
b8c5462f 8641 }
ffc61ed2
JH
8642
8643 range = 0; /* this was not a true range */
73b437c8 8644 }
ffc61ed2 8645
89836f1f
YO
8646
8647
73b437c8 8648 if (!SIZE_ONLY) {
c49a72a9
NC
8649 const char *what = NULL;
8650 char yesno = 0;
8651
e2962f66
JH
8652 /* Possible truncation here but in some 64-bit environments
8653 * the compiler gets heartburn about switch on 64-bit values.
8654 * A similar issue a little earlier when switching on value.
98f323fa 8655 * --jhi */
e2962f66 8656 switch ((I32)namedclass) {
da7fcca4 8657
0399b215
KW
8658 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8659 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8660 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8661 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8662 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8663 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8664 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8665 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8666 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8667 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8668#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8669 /* \s, \w match all unicode if utf8. */
0399b215
KW
8670 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8671 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8672#else
a12cf05f 8673 /* \s, \w match ascii and locale only */
0399b215
KW
8674 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8675 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8676#endif
0399b215 8677 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8678 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8679 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8680 case ANYOF_ASCII:
8681 if (LOC)
936ed897 8682 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8683 else {
1ba5c669 8684 for (value = 0; value < 128; value++)
2283d326 8685 stored +=
3c8eea2b 8686 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
73b437c8 8687 }
c49a72a9 8688 yesno = '+';
ce1c68b2
KW
8689 what = NULL; /* Doesn't match outside ascii, so
8690 don't want to add +utf8:: */
73b437c8
JH
8691 break;
8692 case ANYOF_NASCII:
8693 if (LOC)
936ed897 8694 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8695 else {
1ba5c669 8696 for (value = 128; value < 256; value++)
2283d326 8697 stored +=
3c8eea2b 8698 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
73b437c8 8699 }
c49a72a9
NC
8700 yesno = '!';
8701 what = "ASCII";
89836f1f 8702 break;
ffc61ed2
JH
8703 case ANYOF_DIGIT:
8704 if (LOC)
8705 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8706 else {
8707 /* consecutive digits assumed */
8708 for (value = '0'; value <= '9'; value++)
2283d326
KW
8709 stored +=
8710 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8711 }
c49a72a9 8712 yesno = '+';
da7fcca4 8713 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8714 break;
8715 case ANYOF_NDIGIT:
8716 if (LOC)
8717 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8718 else {
8719 /* consecutive digits assumed */
8720 for (value = 0; value < '0'; value++)
2283d326
KW
8721 stored +=
8722 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8723 for (value = '9' + 1; value < 256; value++)
2283d326
KW
8724 stored +=
8725 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8726 }
c49a72a9 8727 yesno = '!';
da7fcca4 8728 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8729 break;
f81125e2
JP
8730 case ANYOF_MAX:
8731 /* this is to handle \p and \P */
8732 break;
73b437c8 8733 default:
b45f050a 8734 vFAIL("Invalid [::] class");
73b437c8 8735 break;
b8c5462f 8736 }
c49a72a9
NC
8737 if (what) {
8738 /* Strings such as "+utf8::isWord\n" */
8739 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810
KW
8740 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8741 }
ce1c68b2 8742
73b437c8 8743 continue;
a0d0e21e 8744 }
ffc61ed2
JH
8745 } /* end of namedclass \blah */
8746
a0d0e21e 8747 if (range) {
eb160463 8748 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8749 const int w = RExC_parse - rangebegin;
8750 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8751 range = 0; /* not a valid range */
73b437c8 8752 }
a0d0e21e
LW
8753 }
8754 else {
3568d838 8755 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8756 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8757 RExC_parse[1] != ']') {
8758 RExC_parse++;
ffc61ed2
JH
8759
8760 /* a bad range like \w-, [:word:]- ? */
8761 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8762 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8763 const int w =
afd78fd5
JH
8764 RExC_parse >= rangebegin ?
8765 RExC_parse - rangebegin : 0;
830247a4 8766 vWARN4(RExC_parse,
b45f050a 8767 "False [] range \"%*.*s\"",
097eb12c 8768 w, w, rangebegin);
afd78fd5 8769 }
73b437c8 8770 if (!SIZE_ONLY)
2283d326
KW
8771 stored +=
8772 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
73b437c8 8773 } else
ffc61ed2
JH
8774 range = 1; /* yeah, it's a range! */
8775 continue; /* but do it the next time */
a0d0e21e 8776 }
a687059c 8777 }
ffc61ed2 8778
93733859 8779 /* now is the next time */
ae5c130c 8780 if (!SIZE_ONLY) {
3568d838 8781 if (prevvalue < 256) {
1df70142 8782 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8783 IV i;
3568d838 8784#ifdef EBCDIC
1b2d223b
JH
8785 /* In EBCDIC [\x89-\x91] should include
8786 * the \x8e but [i-j] should not. */
8787 if (literal_endpoint == 2 &&
8788 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8789 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8790 {
3568d838
JH
8791 if (isLOWER(prevvalue)) {
8792 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8793 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8794 stored +=
8795 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8796 }
ffc61ed2 8797 } else {
3568d838 8798 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8799 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8800 stored +=
8801 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8802 }
ffc61ed2 8803 }
8ada0baa 8804 }
ffc61ed2 8805 else
8ada0baa 8806#endif
07be1b83 8807 for (i = prevvalue; i <= ceilvalue; i++) {
79e7df45 8808 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
07be1b83 8809 }
3568d838 8810 }
a5961de5 8811 if (value > 255 || UTF) {
1df70142
AL
8812 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8813 const UV natvalue = NATIVE_TO_UNI(value);
ef87b810
KW
8814
8815 /* If the code point requires utf8 to represent, and we are not
8816 * folding, it can't match unless the target is in utf8. Only
8817 * a few code points above 255 fold to below it, so XXX an
8818 * optimization would be to know which ones and set the flag
8819 * appropriately. */
8820 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8821 ? ANYOF_NONBITMAP
8822 : ANYOF_UTF8;
b08decb7 8823 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8824 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8825 prevnatvalue, natvalue);
8826 }
8827 else if (prevnatvalue == natvalue) {
8828 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8829 if (FOLD) {
89ebb4a3 8830 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8831 STRLEN foldlen;
1df70142 8832 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8833
e294cc5d
JH
8834#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8835 if (RExC_precomp[0] == ':' &&
8836 RExC_precomp[1] == '[' &&
8837 (f == 0xDF || f == 0x92)) {
8838 f = NATIVE_TO_UNI(f);
8839 }
8840#endif
c840d2a2
JH
8841 /* If folding and foldable and a single
8842 * character, insert also the folded version
8843 * to the charclass. */
9e55ce06 8844 if (f != value) {
e294cc5d
JH
8845#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8846 if ((RExC_precomp[0] == ':' &&
8847 RExC_precomp[1] == '[' &&
8848 (f == 0xA2 &&
8849 (value == 0xFB05 || value == 0xFB06))) ?
8850 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8851 foldlen == (STRLEN)UNISKIP(f) )
8852#else
eb160463 8853 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8854#endif
9e55ce06
JH
8855 Perl_sv_catpvf(aTHX_ listsv,
8856 "%04"UVxf"\n", f);
8857 else {
8858 /* Any multicharacter foldings
8859 * require the following transform:
8860 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8861 * where E folds into "pq" and F folds
8862 * into "rst", all other characters
8863 * fold to single characters. We save
8864 * away these multicharacter foldings,
8865 * to be later saved as part of the
8866 * additional "s" data. */
8867 SV *sv;
8868
8869 if (!unicode_alternate)
8870 unicode_alternate = newAV();
740cce10
NC
8871 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8872 TRUE);
9e55ce06
JH
8873 av_push(unicode_alternate, sv);
8874 }
8875 }
254ba52a 8876
60a8b682
JH
8877 /* If folding and the value is one of the Greek
8878 * sigmas insert a few more sigmas to make the
8879 * folding rules of the sigmas to work right.
8880 * Note that not all the possible combinations
8881 * are handled here: some of them are handled
9e55ce06
JH
8882 * by the standard folding rules, and some of
8883 * them (literal or EXACTF cases) are handled
8884 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8885 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8887 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8888 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8889 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8890 }
8891 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8892 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8893 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8894 }
8895 }
ffc61ed2 8896 }
1b2d223b
JH
8897#ifdef EBCDIC
8898 literal_endpoint = 0;
8899#endif
8ada0baa 8900 }
ffc61ed2
JH
8901
8902 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8903 }
ffc61ed2 8904
ffc61ed2 8905
7f6f358c
YO
8906
8907 if (SIZE_ONLY)
8908 return ret;
8909 /****** !SIZE_ONLY AFTER HERE *********/
8910
b8e7c664
KW
8911 /* Folding in the bitmap is taken care of above, but not for locale, for
8912 * which we have to wait to see what folding is in effect at runtime, and
8913 * for things not in the bitmap */
8914 if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8915 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8916 }
40c78556
KW
8917
8918 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't
8919 * optimize locale. Doing so perhaps could be done as long as there is
8920 * nothing like \w in it; some thought also would have to be given to the
8921 * interaction with above 0x100 chars */
8922 if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8923 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8924 ANYOF_BITMAP(ret)[value] ^= 0xFF;
8925 stored = 256 - stored;
8926
8927 /* The inversion means that everything above 255 is matched */
8928 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8929 }
8930
2786be71
KW
8931 /* A single character class can be "optimized" into an EXACTish node.
8932 * Note that since we don't currently count how many characters there are
8933 * outside the bitmap, we are XXX missing optimization possibilities for
8934 * them. This optimization can't happen unless this is a truly single
8935 * character class, which means that it can't be an inversion into a
8936 * many-character class, and there must be no possibility of there being
8937 * things outside the bitmap. 'stored' (only) for locales doesn't include
8938 * \w, etc, so have to make a special test that they aren't present */
8939 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8940 && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8941 || (! ANYOF_CLASS_TEST_ANY_SET(ret))))))
8942 {
8943 /* Note that the information needed to decide to do this optimization
8944 * is not currently available until the 2nd pass, and that the actually
8945 * used EXACT node takes less space than the calculated ANYOF node, and
8946 * hence the amount of space calculated in the first pass is larger
8947 * than actually used, so this optimization doesn't gain us any space.
8948 * But an EXACT node is faster than an ANYOF node, and can be combined
8949 * with any adjacent EXACT nodes later by the optimizer for further
8950 * gains. */
8951
07be1b83
YO
8952 const char * cur_parse= RExC_parse;
8953 RExC_emit = (regnode *)orig_emit;
8954 RExC_parse = (char *)orig_parse;
2786be71
KW
8955
8956 /* (A locale node can have 1 point and be folded; all the other folds
8957 * will include the fold, hence will have 2 points, so we won't get
929fffa4
KW
8958 * here with ANYOF_FOLD set unless it is also locale) */
8959 ret = reg_node(pRExC_state, (U8) (! (ANYOF_FLAGS(ret) & ANYOF_FOLD))
2786be71
KW
8960 ? EXACT
8961 : EXACTFL
8962 );
07be1b83 8963 RExC_parse = (char *)cur_parse;
2786be71
KW
8964 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
8965 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
8966 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
8967 STR_LEN(ret)= 2;
8968 RExC_emit += STR_SZ(2);
8969 }
8970 else {
8971 *STRING(ret)= (char)value;
8972 STR_LEN(ret)= 1;
8973 RExC_emit += STR_SZ(1);
8974 }
ef8d46e8 8975 SvREFCNT_dec(listsv);
7f6f358c 8976 return ret;
2786be71
KW
8977
8978 /* (A 2-character class of the very special form like [aA] could be
8979 * optimized into an EXACTFish node, but only for non-locales, and for
8980 * characters which only have the two folds; so things like 'fF' and
8981 * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'.
8982 * Since we don't have that information currently conveniently
8983 * available, skip the optimization) */
7f6f358c 8984 }
ffc61ed2 8985
7f6f358c 8986 {
097eb12c 8987 AV * const av = newAV();
ffc61ed2 8988 SV *rv;
9e55ce06 8989 /* The 0th element stores the character class description
6a0407ee 8990 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8991 * to initialize the appropriate swash (which gets stored in
8992 * the 1st element), and also useful for dumping the regnode.
8993 * The 2nd element stores the multicharacter foldings,
6a0407ee 8994 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8995 av_store(av, 0, listsv);
8996 av_store(av, 1, NULL);
ad64d0ec
NC
8997 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8998 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8999 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 9000 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 9001 ARG_SET(ret, n);
a0ed51b3 9002 }
a0ed51b3
LW
9003 return ret;
9004}
89836f1f
YO
9005#undef _C_C_T_
9006
a0ed51b3 9007
bcdf7404
YO
9008/* reg_skipcomment()
9009
9010 Absorbs an /x style # comments from the input stream.
9011 Returns true if there is more text remaining in the stream.
9012 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9013 terminates the pattern without including a newline.
9014
9015 Note its the callers responsibility to ensure that we are
9016 actually in /x mode
9017
9018*/
9019
9020STATIC bool
9021S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9022{
9023 bool ended = 0;
7918f24d
NC
9024
9025 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9026
bcdf7404
YO
9027 while (RExC_parse < RExC_end)
9028 if (*RExC_parse++ == '\n') {
9029 ended = 1;
9030 break;
9031 }
9032 if (!ended) {
9033 /* we ran off the end of the pattern without ending
9034 the comment, so we have to add an \n when wrapping */
9035 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9036 return 0;
9037 } else
9038 return 1;
9039}
9040
9041/* nextchar()
9042
3b753521 9043 Advances the parse position, and optionally absorbs
bcdf7404
YO
9044 "whitespace" from the inputstream.
9045
9046 Without /x "whitespace" means (?#...) style comments only,
9047 with /x this means (?#...) and # comments and whitespace proper.
9048
9049 Returns the RExC_parse point from BEFORE the scan occurs.
9050
9051 This is the /x friendly way of saying RExC_parse++.
9052*/
9053
76e3520e 9054STATIC char*
830247a4 9055S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 9056{
097eb12c 9057 char* const retval = RExC_parse++;
a0d0e21e 9058
7918f24d
NC
9059 PERL_ARGS_ASSERT_NEXTCHAR;
9060
4633a7c4 9061 for (;;) {
830247a4
IZ
9062 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9063 RExC_parse[2] == '#') {
e994fd66
AE
9064 while (*RExC_parse != ')') {
9065 if (RExC_parse == RExC_end)
9066 FAIL("Sequence (?#... not terminated");
830247a4 9067 RExC_parse++;
e994fd66 9068 }
830247a4 9069 RExC_parse++;
4633a7c4
LW
9070 continue;
9071 }
bbe252da 9072 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
9073 if (isSPACE(*RExC_parse)) {
9074 RExC_parse++;
748a9306
LW
9075 continue;
9076 }
830247a4 9077 else if (*RExC_parse == '#') {
bcdf7404
YO
9078 if ( reg_skipcomment( pRExC_state ) )
9079 continue;
748a9306 9080 }
748a9306 9081 }
4633a7c4 9082 return retval;
a0d0e21e 9083 }
a687059c
LW
9084}
9085
9086/*
c277df42 9087- reg_node - emit a node
a0d0e21e 9088*/
76e3520e 9089STATIC regnode * /* Location. */
830247a4 9090S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 9091{
97aff369 9092 dVAR;
c277df42 9093 register regnode *ptr;
504618e9 9094 regnode * const ret = RExC_emit;
07be1b83 9095 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9096
7918f24d
NC
9097 PERL_ARGS_ASSERT_REG_NODE;
9098
c277df42 9099 if (SIZE_ONLY) {
830247a4
IZ
9100 SIZE_ALIGN(RExC_size);
9101 RExC_size += 1;
a0d0e21e
LW
9102 return(ret);
9103 }
3b57cd43
YO
9104 if (RExC_emit >= RExC_emit_bound)
9105 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9106
c277df42 9107 NODE_ALIGN_FILL(ret);
a0d0e21e 9108 ptr = ret;
c277df42 9109 FILL_ADVANCE_NODE(ptr, op);
7122b237 9110#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9111 if (RExC_offsets) { /* MJD */
07be1b83 9112 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 9113 "reg_node", __LINE__,
13d6edb4 9114 PL_reg_name[op],
07be1b83
YO
9115 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9116 ? "Overwriting end of array!\n" : "OK",
9117 (UV)(RExC_emit - RExC_emit_start),
9118 (UV)(RExC_parse - RExC_start),
9119 (UV)RExC_offsets[0]));
ccb2c380 9120 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 9121 }
7122b237 9122#endif
830247a4 9123 RExC_emit = ptr;
a0d0e21e 9124 return(ret);
a687059c
LW
9125}
9126
9127/*
a0d0e21e
LW
9128- reganode - emit a node with an argument
9129*/
76e3520e 9130STATIC regnode * /* Location. */
830247a4 9131S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 9132{
97aff369 9133 dVAR;
c277df42 9134 register regnode *ptr;
504618e9 9135 regnode * const ret = RExC_emit;
07be1b83 9136 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 9137
7918f24d
NC
9138 PERL_ARGS_ASSERT_REGANODE;
9139
c277df42 9140 if (SIZE_ONLY) {
830247a4
IZ
9141 SIZE_ALIGN(RExC_size);
9142 RExC_size += 2;
6bda09f9
YO
9143 /*
9144 We can't do this:
9145
9146 assert(2==regarglen[op]+1);
9147
9148 Anything larger than this has to allocate the extra amount.
9149 If we changed this to be:
9150
9151 RExC_size += (1 + regarglen[op]);
9152
9153 then it wouldn't matter. Its not clear what side effect
9154 might come from that so its not done so far.
9155 -- dmq
9156 */
a0d0e21e
LW
9157 return(ret);
9158 }
3b57cd43
YO
9159 if (RExC_emit >= RExC_emit_bound)
9160 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9161
c277df42 9162 NODE_ALIGN_FILL(ret);
a0d0e21e 9163 ptr = ret;
c277df42 9164 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 9165#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9166 if (RExC_offsets) { /* MJD */
07be1b83 9167 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9168 "reganode",
ccb2c380 9169 __LINE__,
13d6edb4 9170 PL_reg_name[op],
07be1b83 9171 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 9172 "Overwriting end of array!\n" : "OK",
07be1b83
YO
9173 (UV)(RExC_emit - RExC_emit_start),
9174 (UV)(RExC_parse - RExC_start),
9175 (UV)RExC_offsets[0]));
ccb2c380 9176 Set_Cur_Node_Offset;
fac92740 9177 }
7122b237 9178#endif
830247a4 9179 RExC_emit = ptr;
a0d0e21e 9180 return(ret);
fe14fcc3
LW
9181}
9182
9183/*
cd439c50 9184- reguni - emit (if appropriate) a Unicode character
a0ed51b3 9185*/
71207a34
AL
9186STATIC STRLEN
9187S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 9188{
97aff369 9189 dVAR;
7918f24d
NC
9190
9191 PERL_ARGS_ASSERT_REGUNI;
9192
71207a34 9193 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
9194}
9195
9196/*
a0d0e21e
LW
9197- reginsert - insert an operator in front of already-emitted operand
9198*
9199* Means relocating the operand.
9200*/
76e3520e 9201STATIC void
6bda09f9 9202S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 9203{
97aff369 9204 dVAR;
c277df42
IZ
9205 register regnode *src;
9206 register regnode *dst;
9207 register regnode *place;
504618e9 9208 const int offset = regarglen[(U8)op];
6bda09f9 9209 const int size = NODE_STEP_REGNODE + offset;
07be1b83 9210 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9211
9212 PERL_ARGS_ASSERT_REGINSERT;
def51078 9213 PERL_UNUSED_ARG(depth);
22c35a8c 9214/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 9215 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9216 if (SIZE_ONLY) {
6bda09f9 9217 RExC_size += size;
a0d0e21e
LW
9218 return;
9219 }
a687059c 9220
830247a4 9221 src = RExC_emit;
6bda09f9 9222 RExC_emit += size;
830247a4 9223 dst = RExC_emit;
40d049e4 9224 if (RExC_open_parens) {
6bda09f9 9225 int paren;
3b57cd43 9226 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9227 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9228 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9229 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9230 RExC_open_parens[paren] += size;
9231 } else {
3b57cd43 9232 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9233 }
9234 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9235 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9236 RExC_close_parens[paren] += size;
9237 } else {
3b57cd43 9238 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9239 }
9240 }
6bda09f9 9241 }
40d049e4 9242
fac92740 9243 while (src > opnd) {
c277df42 9244 StructCopy(--src, --dst, regnode);
7122b237 9245#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9246 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9247 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9248 "reg_insert",
ccb2c380 9249 __LINE__,
13d6edb4 9250 PL_reg_name[op],
07be1b83
YO
9251 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9252 ? "Overwriting end of array!\n" : "OK",
9253 (UV)(src - RExC_emit_start),
9254 (UV)(dst - RExC_emit_start),
9255 (UV)RExC_offsets[0]));
ccb2c380
MP
9256 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9257 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9258 }
7122b237 9259#endif
fac92740
MJD
9260 }
9261
a0d0e21e
LW
9262
9263 place = opnd; /* Op node, where operand used to be. */
7122b237 9264#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9265 if (RExC_offsets) { /* MJD */
07be1b83 9266 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9267 "reginsert",
ccb2c380 9268 __LINE__,
13d6edb4 9269 PL_reg_name[op],
07be1b83 9270 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9271 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9272 (UV)(place - RExC_emit_start),
9273 (UV)(RExC_parse - RExC_start),
786e8c11 9274 (UV)RExC_offsets[0]));
ccb2c380 9275 Set_Node_Offset(place, RExC_parse);
45948336 9276 Set_Node_Length(place, 1);
fac92740 9277 }
7122b237 9278#endif
c277df42
IZ
9279 src = NEXTOPER(place);
9280 FILL_ADVANCE_NODE(place, op);
9281 Zero(src, offset, regnode);
a687059c
LW
9282}
9283
9284/*
c277df42 9285- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9286- SEE ALSO: regtail_study
a0d0e21e 9287*/
097eb12c 9288/* TODO: All three parms should be const */
76e3520e 9289STATIC void
3dab1dad 9290S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9291{
97aff369 9292 dVAR;
c277df42 9293 register regnode *scan;
72f13be8 9294 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9295
9296 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9297#ifndef DEBUGGING
9298 PERL_UNUSED_ARG(depth);
9299#endif
a0d0e21e 9300
c277df42 9301 if (SIZE_ONLY)
a0d0e21e
LW
9302 return;
9303
9304 /* Find last node. */
9305 scan = p;
9306 for (;;) {
504618e9 9307 regnode * const temp = regnext(scan);
3dab1dad
YO
9308 DEBUG_PARSE_r({
9309 SV * const mysv=sv_newmortal();
9310 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9311 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9312 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9313 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9314 (temp == NULL ? "->" : ""),
13d6edb4 9315 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9316 );
3dab1dad
YO
9317 });
9318 if (temp == NULL)
9319 break;
9320 scan = temp;
9321 }
9322
9323 if (reg_off_by_arg[OP(scan)]) {
9324 ARG_SET(scan, val - scan);
9325 }
9326 else {
9327 NEXT_OFF(scan) = val - scan;
9328 }
9329}
9330
07be1b83 9331#ifdef DEBUGGING
3dab1dad
YO
9332/*
9333- regtail_study - set the next-pointer at the end of a node chain of p to val.
9334- Look for optimizable sequences at the same time.
9335- currently only looks for EXACT chains.
07be1b83
YO
9336
9337This is expermental code. The idea is to use this routine to perform
9338in place optimizations on branches and groups as they are constructed,
9339with the long term intention of removing optimization from study_chunk so
9340that it is purely analytical.
9341
9342Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9343to control which is which.
9344
3dab1dad
YO
9345*/
9346/* TODO: All four parms should be const */
07be1b83 9347
3dab1dad
YO
9348STATIC U8
9349S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9350{
9351 dVAR;
9352 register regnode *scan;
07be1b83
YO
9353 U8 exact = PSEUDO;
9354#ifdef EXPERIMENTAL_INPLACESCAN
9355 I32 min = 0;
9356#endif
3dab1dad
YO
9357 GET_RE_DEBUG_FLAGS_DECL;
9358
7918f24d
NC
9359 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9360
07be1b83 9361
3dab1dad
YO
9362 if (SIZE_ONLY)
9363 return exact;
9364
9365 /* Find last node. */
9366
9367 scan = p;
9368 for (;;) {
9369 regnode * const temp = regnext(scan);
07be1b83
YO
9370#ifdef EXPERIMENTAL_INPLACESCAN
9371 if (PL_regkind[OP(scan)] == EXACT)
9372 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9373 return EXACT;
9374#endif
3dab1dad
YO
9375 if ( exact ) {
9376 switch (OP(scan)) {
9377 case EXACT:
9378 case EXACTF:
2c2b7f86 9379 case EXACTFU:
3dab1dad
YO
9380 case EXACTFL:
9381 if( exact == PSEUDO )
9382 exact= OP(scan);
07be1b83
YO
9383 else if ( exact != OP(scan) )
9384 exact= 0;
3dab1dad
YO
9385 case NOTHING:
9386 break;
9387 default:
9388 exact= 0;
9389 }
9390 }
9391 DEBUG_PARSE_r({
9392 SV * const mysv=sv_newmortal();
9393 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9394 regprop(RExC_rx, mysv, scan);
eaf3ca90 9395 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9396 SvPV_nolen_const(mysv),
eaf3ca90 9397 REG_NODE_NUM(scan),
13d6edb4 9398 PL_reg_name[exact]);
3dab1dad 9399 });
a0d0e21e
LW
9400 if (temp == NULL)
9401 break;
9402 scan = temp;
9403 }
07be1b83
YO
9404 DEBUG_PARSE_r({
9405 SV * const mysv_val=sv_newmortal();
9406 DEBUG_PARSE_MSG("");
9407 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9408 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9409 SvPV_nolen_const(mysv_val),
9410 (IV)REG_NODE_NUM(val),
9411 (IV)(val - scan)
07be1b83
YO
9412 );
9413 });
c277df42
IZ
9414 if (reg_off_by_arg[OP(scan)]) {
9415 ARG_SET(scan, val - scan);
a0ed51b3
LW
9416 }
9417 else {
c277df42
IZ
9418 NEXT_OFF(scan) = val - scan;
9419 }
3dab1dad
YO
9420
9421 return exact;
a687059c 9422}
07be1b83 9423#endif
a687059c
LW
9424
9425/*
fd181c75 9426 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9427 */
f7819f85 9428#ifdef DEBUGGING
c33269f7 9429static void
7918f24d
NC
9430S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9431{
f7819f85
A
9432 int bit;
9433 int set=0;
7918f24d 9434
f7819f85
A
9435 for (bit=0; bit<32; bit++) {
9436 if (flags & (1<<bit)) {
9437 if (!set++ && lead)
9438 PerlIO_printf(Perl_debug_log, "%s",lead);
9439 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9440 }
9441 }
9442 if (lead) {
9443 if (set)
9444 PerlIO_printf(Perl_debug_log, "\n");
9445 else
9446 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9447 }
9448}
9449#endif
9450
a687059c 9451void
097eb12c 9452Perl_regdump(pTHX_ const regexp *r)
a687059c 9453{
35ff7856 9454#ifdef DEBUGGING
97aff369 9455 dVAR;
c445ea15 9456 SV * const sv = sv_newmortal();
ab3bbdeb 9457 SV *dsv= sv_newmortal();
f8fc2ecf 9458 RXi_GET_DECL(r,ri);
f7819f85 9459 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9460
7918f24d
NC
9461 PERL_ARGS_ASSERT_REGDUMP;
9462
f8fc2ecf 9463 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9464
9465 /* Header fields of interest. */
ab3bbdeb
YO
9466 if (r->anchored_substr) {
9467 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9468 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9469 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9470 "anchored %s%s at %"IVdf" ",
9471 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9472 (IV)r->anchored_offset);
ab3bbdeb
YO
9473 } else if (r->anchored_utf8) {
9474 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9475 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9476 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9477 "anchored utf8 %s%s at %"IVdf" ",
9478 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9479 (IV)r->anchored_offset);
ab3bbdeb
YO
9480 }
9481 if (r->float_substr) {
9482 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9483 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9484 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9485 "floating %s%s at %"IVdf"..%"UVuf" ",
9486 s, RE_SV_TAIL(r->float_substr),
7b0972df 9487 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9488 } else if (r->float_utf8) {
9489 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9490 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9491 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9492 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9493 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9494 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9495 }
33b8afdf 9496 if (r->check_substr || r->check_utf8)
b81d288d 9497 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9498 (const char *)
9499 (r->check_substr == r->float_substr
9500 && r->check_utf8 == r->float_utf8
9501 ? "(checking floating" : "(checking anchored"));
bbe252da 9502 if (r->extflags & RXf_NOSCAN)
c277df42 9503 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9504 if (r->extflags & RXf_CHECK_ALL)
c277df42 9505 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9506 if (r->check_substr || r->check_utf8)
c277df42
IZ
9507 PerlIO_printf(Perl_debug_log, ") ");
9508
f8fc2ecf
YO
9509 if (ri->regstclass) {
9510 regprop(r, sv, ri->regstclass);
1de06328 9511 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9512 }
bbe252da 9513 if (r->extflags & RXf_ANCH) {
774d564b 9514 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9515 if (r->extflags & RXf_ANCH_BOL)
774d564b 9516 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9517 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9518 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9519 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9520 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9521 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9522 PerlIO_printf(Perl_debug_log, "(GPOS)");
9523 PerlIO_putc(Perl_debug_log, ' ');
9524 }
bbe252da 9525 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9526 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9527 if (r->intflags & PREGf_SKIP)
760ac839 9528 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9529 if (r->intflags & PREGf_IMPLICIT)
760ac839 9530 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9531 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9532 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9533 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9534 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9535 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9536#else
7918f24d 9537 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9538 PERL_UNUSED_CONTEXT;
65e66c80 9539 PERL_UNUSED_ARG(r);
17c3b450 9540#endif /* DEBUGGING */
a687059c
LW
9541}
9542
9543/*
a0d0e21e
LW
9544- regprop - printable representation of opcode
9545*/
3339dfd8
YO
9546#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9547STMT_START { \
9548 if (do_sep) { \
9549 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9550 if (flags & ANYOF_INVERT) \
9551 /*make sure the invert info is in each */ \
9552 sv_catpvs(sv, "^"); \
9553 do_sep = 0; \
9554 } \
9555} STMT_END
9556
46fc3d4c 9557void
32fc9b6a 9558Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9559{
35ff7856 9560#ifdef DEBUGGING
97aff369 9561 dVAR;
9b155405 9562 register int k;
f8fc2ecf 9563 RXi_GET_DECL(prog,progi);
1de06328 9564 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9565
7918f24d 9566 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9567
76f68e9b 9568 sv_setpvs(sv, "");
8aa23a47 9569
03363afd 9570 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9571 /* It would be nice to FAIL() here, but this may be called from
9572 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9573 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9574 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9575
3dab1dad 9576 k = PL_regkind[OP(o)];
9b155405 9577
2a782b5b 9578 if (k == EXACT) {
f92a2122 9579 sv_catpvs(sv, " ");
ab3bbdeb
YO
9580 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9581 * is a crude hack but it may be the best for now since
9582 * we have no flag "this EXACTish node was UTF-8"
9583 * --jhi */
f92a2122
NC
9584 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9585 PERL_PV_ESCAPE_UNI_DETECT |
9586 PERL_PV_PRETTY_ELLIPSES |
9587 PERL_PV_PRETTY_LTGT |
9588 PERL_PV_PRETTY_NOCLEAR
9589 );
bb263b4e 9590 } else if (k == TRIE) {
3dab1dad 9591 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9592 * progi->data isn't available here */
1de06328 9593 const char op = OP(o);
647f639f 9594 const U32 n = ARG(o);
1de06328 9595 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9596 (reg_ac_data *)progi->data->data[n] :
1de06328 9597 NULL;
3251b653
NC
9598 const reg_trie_data * const trie
9599 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9600
13d6edb4 9601 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9602 DEBUG_TRIE_COMPILE_r(
9603 Perl_sv_catpvf(aTHX_ sv,
9604 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9605 (UV)trie->startstate,
1e2e3d02 9606 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9607 (UV)trie->wordcount,
9608 (UV)trie->minlen,
9609 (UV)trie->maxlen,
9610 (UV)TRIE_CHARCOUNT(trie),
9611 (UV)trie->uniquecharcount
9612 )
9613 );
9614 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9615 int i;
9616 int rangestart = -1;
f46cb337 9617 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9618 sv_catpvs(sv, "[");
1de06328
YO
9619 for (i = 0; i <= 256; i++) {
9620 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9621 if (rangestart == -1)
9622 rangestart = i;
9623 } else if (rangestart != -1) {
9624 if (i <= rangestart + 3)
9625 for (; rangestart < i; rangestart++)
9626 put_byte(sv, rangestart);
9627 else {
9628 put_byte(sv, rangestart);
9629 sv_catpvs(sv, "-");
9630 put_byte(sv, i - 1);
9631 }
9632 rangestart = -1;
9633 }
9634 }
f3a2811a 9635 sv_catpvs(sv, "]");
1de06328
YO
9636 }
9637
a3621e74 9638 } else if (k == CURLY) {
cb434fcc 9639 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9640 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9641 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9642 }
2c2d71f5
JH
9643 else if (k == WHILEM && o->flags) /* Ordinal/of */
9644 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9645 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9646 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9647 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 9648 if ( k != REF || (OP(o) < NREF)) {
502c6561 9649 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9650 SV **name= av_fetch(list, ARG(o), 0 );
9651 if (name)
9652 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9653 }
9654 else {
502c6561 9655 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9656 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9657 I32 *nums=(I32*)SvPVX(sv_dat);
9658 SV **name= av_fetch(list, nums[0], 0 );
9659 I32 n;
9660 if (name) {
9661 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9662 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9663 (n ? "," : ""), (IV)nums[n]);
9664 }
9665 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9666 }
1f1031fe 9667 }
ee9b8eae 9668 }
1f1031fe 9669 } else if (k == GOSUB)
6bda09f9 9670 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9671 else if (k == VERB) {
9672 if (!o->flags)
9673 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9674 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9675 } else if (k == LOGICAL)
04ebc1ab 9676 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9677 else if (k == FOLDCHAR)
df44d732 9678 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9679 else if (k == ANYOF) {
9680 int i, rangestart = -1;
2d03de9c 9681 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9682 int do_sep = 0;
0bd48802
AL
9683
9684 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9685 static const char * const anyofs[] = {
653099ff
GS
9686 "\\w",
9687 "\\W",
9688 "\\s",
9689 "\\S",
9690 "\\d",
9691 "\\D",
9692 "[:alnum:]",
9693 "[:^alnum:]",
9694 "[:alpha:]",
9695 "[:^alpha:]",
9696 "[:ascii:]",
9697 "[:^ascii:]",
24d786f4
YO
9698 "[:cntrl:]",
9699 "[:^cntrl:]",
653099ff
GS
9700 "[:graph:]",
9701 "[:^graph:]",
9702 "[:lower:]",
9703 "[:^lower:]",
9704 "[:print:]",
9705 "[:^print:]",
9706 "[:punct:]",
9707 "[:^punct:]",
9708 "[:upper:]",
aaa51d5e 9709 "[:^upper:]",
653099ff 9710 "[:xdigit:]",
aaa51d5e
JF
9711 "[:^xdigit:]",
9712 "[:space:]",
9713 "[:^space:]",
9714 "[:blank:]",
9715 "[:^blank:]"
653099ff
GS
9716 };
9717
19860706 9718 if (flags & ANYOF_LOCALE)
396482e1 9719 sv_catpvs(sv, "{loc}");
19860706 9720 if (flags & ANYOF_FOLD)
396482e1 9721 sv_catpvs(sv, "{i}");
653099ff 9722 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9723 if (flags & ANYOF_INVERT)
396482e1 9724 sv_catpvs(sv, "^");
3339dfd8
YO
9725
9726 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9727 for (i = 0; i <= 256; i++) {
9728 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9729 if (rangestart == -1)
9730 rangestart = i;
9731 } else if (rangestart != -1) {
9732 if (i <= rangestart + 3)
9733 for (; rangestart < i; rangestart++)
653099ff 9734 put_byte(sv, rangestart);
ffc61ed2
JH
9735 else {
9736 put_byte(sv, rangestart);
396482e1 9737 sv_catpvs(sv, "-");
ffc61ed2 9738 put_byte(sv, i - 1);
653099ff 9739 }
24d786f4 9740 do_sep = 1;
ffc61ed2 9741 rangestart = -1;
653099ff 9742 }
847a199f 9743 }
3339dfd8
YO
9744
9745 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
9746 /* output any special charclass tests (used entirely under use locale) */
9747 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 9748 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9749 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9750 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9751 do_sep = 1;
9752 }
9753
3339dfd8
YO
9754 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9755
9756 /* output information about the unicode matching */
ef87b810 9757 if (flags & ANYOF_UNICODE_ALL)
396482e1 9758 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
9759 else if (flags & ANYOF_UTF8)
9760 sv_catpvs(sv, "{unicode}");
f5ecd18d 9761 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 9762 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
9763
9764 {
9765 SV *lv;
32fc9b6a 9766 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9767
ffc61ed2
JH
9768 if (lv) {
9769 if (sw) {
89ebb4a3 9770 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9771
ffc61ed2 9772 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9773 uvchr_to_utf8(s, i);
ffc61ed2 9774
3568d838 9775 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9776 if (rangestart == -1)
9777 rangestart = i;
9778 } else if (rangestart != -1) {
ffc61ed2
JH
9779 if (i <= rangestart + 3)
9780 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9781 const U8 * const e = uvchr_to_utf8(s,rangestart);
9782 U8 *p;
9783 for(p = s; p < e; p++)
ffc61ed2
JH
9784 put_byte(sv, *p);
9785 }
9786 else {
2d03de9c
AL
9787 const U8 *e = uvchr_to_utf8(s,rangestart);
9788 U8 *p;
9789 for (p = s; p < e; p++)
ffc61ed2 9790 put_byte(sv, *p);
396482e1 9791 sv_catpvs(sv, "-");
2d03de9c
AL
9792 e = uvchr_to_utf8(s, i-1);
9793 for (p = s; p < e; p++)
1df70142 9794 put_byte(sv, *p);
ffc61ed2
JH
9795 }
9796 rangestart = -1;
9797 }
19860706 9798 }
ffc61ed2 9799
396482e1 9800 sv_catpvs(sv, "..."); /* et cetera */
19860706 9801 }
fde631ed 9802
ffc61ed2 9803 {
2e0de35c 9804 char *s = savesvpv(lv);
c445ea15 9805 char * const origs = s;
b81d288d 9806
3dab1dad
YO
9807 while (*s && *s != '\n')
9808 s++;
b81d288d 9809
ffc61ed2 9810 if (*s == '\n') {
2d03de9c 9811 const char * const t = ++s;
ffc61ed2
JH
9812
9813 while (*s) {
9814 if (*s == '\n')
9815 *s = ' ';
9816 s++;
9817 }
9818 if (s[-1] == ' ')
9819 s[-1] = 0;
9820
9821 sv_catpv(sv, t);
fde631ed 9822 }
b81d288d 9823
ffc61ed2 9824 Safefree(origs);
fde631ed
JH
9825 }
9826 }
653099ff 9827 }
ffc61ed2 9828
653099ff
GS
9829 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9830 }
9b155405 9831 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9832 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9833#else
96a5add6 9834 PERL_UNUSED_CONTEXT;
65e66c80
SP
9835 PERL_UNUSED_ARG(sv);
9836 PERL_UNUSED_ARG(o);
f9049ba1 9837 PERL_UNUSED_ARG(prog);
17c3b450 9838#endif /* DEBUGGING */
35ff7856 9839}
a687059c 9840
cad2e5aa 9841SV *
288b8c02 9842Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9843{ /* Assume that RE_INTUIT is set */
97aff369 9844 dVAR;
288b8c02 9845 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9846 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9847
9848 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9849 PERL_UNUSED_CONTEXT;
9850
a3621e74 9851 DEBUG_COMPILE_r(
cfd0369c 9852 {
2d03de9c 9853 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9854 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9855
9856 if (!PL_colorset) reginitcolors();
9857 PerlIO_printf(Perl_debug_log,
a0288114 9858 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9859 PL_colors[4],
9860 prog->check_substr ? "" : "utf8 ",
9861 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9862 s,
9863 PL_colors[1],
9864 (strlen(s) > 60 ? "..." : ""));
9865 } );
9866
33b8afdf 9867 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9868}
9869
84da74a7 9870/*
f8149455 9871 pregfree()
84da74a7 9872
f8149455
YO
9873 handles refcounting and freeing the perl core regexp structure. When
9874 it is necessary to actually free the structure the first thing it
3b753521 9875 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
9876 the regexp, allowing the handling of the void *pprivate; member
9877 first. (This routine is not overridable by extensions, which is why
9878 the extensions free is called first.)
9879
9880 See regdupe and regdupe_internal if you change anything here.
84da74a7 9881*/
f8149455 9882#ifndef PERL_IN_XSUB_RE
2b69d0c2 9883void
84679df5 9884Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9885{
288b8c02
NC
9886 SvREFCNT_dec(r);
9887}
9888
9889void
9890Perl_pregfree2(pTHX_ REGEXP *rx)
9891{
27da23d5 9892 dVAR;
288b8c02 9893 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9894 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9895
7918f24d
NC
9896 PERL_ARGS_ASSERT_PREGFREE2;
9897
28d8d7f4
YO
9898 if (r->mother_re) {
9899 ReREFCNT_dec(r->mother_re);
9900 } else {
288b8c02 9901 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9902 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9903 }
9904 if (r->substrs) {
ef8d46e8
VP
9905 SvREFCNT_dec(r->anchored_substr);
9906 SvREFCNT_dec(r->anchored_utf8);
9907 SvREFCNT_dec(r->float_substr);
9908 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9909 Safefree(r->substrs);
9910 }
288b8c02 9911 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9912#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9913 SvREFCNT_dec(r->saved_copy);
ed252734 9914#endif
f0ab9afb 9915 Safefree(r->offs);
f8149455 9916}
28d8d7f4
YO
9917
9918/* reg_temp_copy()
9919
9920 This is a hacky workaround to the structural issue of match results
9921 being stored in the regexp structure which is in turn stored in
9922 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9923 could be PL_curpm in multiple contexts, and could require multiple
9924 result sets being associated with the pattern simultaneously, such
9925 as when doing a recursive match with (??{$qr})
9926
9927 The solution is to make a lightweight copy of the regexp structure
9928 when a qr// is returned from the code executed by (??{$qr}) this
9929 lightweight copy doesnt actually own any of its data except for
9930 the starp/end and the actual regexp structure itself.
9931
9932*/
9933
9934
84679df5 9935REGEXP *
f0826785 9936Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9937{
f0826785 9938 struct regexp *ret;
288b8c02 9939 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9940 register const I32 npar = r->nparens+1;
7918f24d
NC
9941
9942 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9943
f0826785
BM
9944 if (!ret_x)
9945 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9946 ret = (struct regexp *)SvANY(ret_x);
9947
288b8c02 9948 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9949 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9950 by pointing directly at the buffer, but flagging that the allocated
9951 space in the copy is zero. As we've just done a struct copy, it's now
9952 a case of zero-ing that, rather than copying the current length. */
9953 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9954 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9955 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9956 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9957 SvLEN_set(ret_x, 0);
b9ad13ac 9958 SvSTASH_set(ret_x, NULL);
703c388d 9959 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9960 Newx(ret->offs, npar, regexp_paren_pair);
9961 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9962 if (r->substrs) {
28d8d7f4 9963 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9964 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9965
9966 SvREFCNT_inc_void(ret->anchored_substr);
9967 SvREFCNT_inc_void(ret->anchored_utf8);
9968 SvREFCNT_inc_void(ret->float_substr);
9969 SvREFCNT_inc_void(ret->float_utf8);
9970
9971 /* check_substr and check_utf8, if non-NULL, point to either their
9972 anchored or float namesakes, and don't hold a second reference. */
486913e4 9973 }
288b8c02 9974 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9975#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9976 ret->saved_copy = NULL;
28d8d7f4 9977#endif
288b8c02 9978 ret->mother_re = rx;
28d8d7f4 9979
288b8c02 9980 return ret_x;
28d8d7f4 9981}
f8149455
YO
9982#endif
9983
9984/* regfree_internal()
9985
9986 Free the private data in a regexp. This is overloadable by
9987 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 9988 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
9989 know about, however of course we have to handle the
9990 regexp_internal structure when no extension is in use.
9991
9992 Note this is called before freeing anything in the regexp
9993 structure.
9994 */
9995
9996void
288b8c02 9997Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9998{
9999 dVAR;
288b8c02 10000 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10001 RXi_GET_DECL(r,ri);
10002 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10003
10004 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10005
f8149455
YO
10006 DEBUG_COMPILE_r({
10007 if (!PL_colorset)
10008 reginitcolors();
10009 {
10010 SV *dsv= sv_newmortal();
3c8556c3 10011 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 10012 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
10013 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10014 PL_colors[4],PL_colors[5],s);
10015 }
10016 });
7122b237
YO
10017#ifdef RE_TRACK_PATTERN_OFFSETS
10018 if (ri->u.offsets)
10019 Safefree(ri->u.offsets); /* 20010421 MJD */
10020#endif
f8fc2ecf
YO
10021 if (ri->data) {
10022 int n = ri->data->count;
f3548bdc
DM
10023 PAD* new_comppad = NULL;
10024 PAD* old_comppad;
4026c95a 10025 PADOFFSET refcnt;
dfad63ad 10026
c277df42 10027 while (--n >= 0) {
261faec3 10028 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 10029 switch (ri->data->what[n]) {
af534a04 10030 case 'a':
c277df42 10031 case 's':
81714fb9 10032 case 'S':
55eed653 10033 case 'u':
ad64d0ec 10034 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 10035 break;
653099ff 10036 case 'f':
f8fc2ecf 10037 Safefree(ri->data->data[n]);
653099ff 10038 break;
dfad63ad 10039 case 'p':
502c6561 10040 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 10041 break;
c277df42 10042 case 'o':
dfad63ad 10043 if (new_comppad == NULL)
cea2e8a9 10044 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
10045 PAD_SAVE_LOCAL(old_comppad,
10046 /* Watch out for global destruction's random ordering. */
c445ea15 10047 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 10048 );
b34c0dd4 10049 OP_REFCNT_LOCK;
f8fc2ecf 10050 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
10051 OP_REFCNT_UNLOCK;
10052 if (!refcnt)
f8fc2ecf 10053 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 10054
f3548bdc 10055 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 10056 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 10057 new_comppad = NULL;
c277df42
IZ
10058 break;
10059 case 'n':
9e55ce06 10060 break;
07be1b83 10061 case 'T':
be8e71aa
YO
10062 { /* Aho Corasick add-on structure for a trie node.
10063 Used in stclass optimization only */
07be1b83 10064 U32 refcount;
f8fc2ecf 10065 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
10066 OP_REFCNT_LOCK;
10067 refcount = --aho->refcount;
10068 OP_REFCNT_UNLOCK;
10069 if ( !refcount ) {
446bd890
NC
10070 PerlMemShared_free(aho->states);
10071 PerlMemShared_free(aho->fail);
446bd890
NC
10072 /* do this last!!!! */
10073 PerlMemShared_free(ri->data->data[n]);
10074 PerlMemShared_free(ri->regstclass);
07be1b83
YO
10075 }
10076 }
10077 break;
a3621e74 10078 case 't':
07be1b83 10079 {
be8e71aa 10080 /* trie structure. */
07be1b83 10081 U32 refcount;
f8fc2ecf 10082 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
10083 OP_REFCNT_LOCK;
10084 refcount = --trie->refcount;
10085 OP_REFCNT_UNLOCK;
10086 if ( !refcount ) {
446bd890 10087 PerlMemShared_free(trie->charmap);
446bd890
NC
10088 PerlMemShared_free(trie->states);
10089 PerlMemShared_free(trie->trans);
07be1b83 10090 if (trie->bitmap)
446bd890 10091 PerlMemShared_free(trie->bitmap);
786e8c11 10092 if (trie->jump)
446bd890 10093 PerlMemShared_free(trie->jump);
2e64971a 10094 PerlMemShared_free(trie->wordinfo);
446bd890
NC
10095 /* do this last!!!! */
10096 PerlMemShared_free(ri->data->data[n]);
a3621e74 10097 }
07be1b83
YO
10098 }
10099 break;
c277df42 10100 default:
f8fc2ecf 10101 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
10102 }
10103 }
f8fc2ecf
YO
10104 Safefree(ri->data->what);
10105 Safefree(ri->data);
a0d0e21e 10106 }
28d8d7f4 10107
f8fc2ecf 10108 Safefree(ri);
a687059c 10109}
c277df42 10110
a09252eb
NC
10111#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10112#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
10113#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10114
10115/*
32cd70f6 10116 re_dup - duplicate a regexp.
84da74a7 10117
8233f606
DM
10118 This routine is expected to clone a given regexp structure. It is only
10119 compiled under USE_ITHREADS.
32cd70f6 10120
f8149455
YO
10121 After all of the core data stored in struct regexp is duplicated
10122 the regexp_engine.dupe method is used to copy any private data
10123 stored in the *pprivate pointer. This allows extensions to handle
10124 any duplication it needs to do.
10125
10126 See pregfree() and regfree_internal() if you change anything here.
84da74a7 10127*/
a3c0e9ca 10128#if defined(USE_ITHREADS)
f8149455 10129#ifndef PERL_IN_XSUB_RE
288b8c02
NC
10130void
10131Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 10132{
84da74a7 10133 dVAR;
a86a1ca7 10134 I32 npar;
288b8c02
NC
10135 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10136 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 10137
7918f24d
NC
10138 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10139
84da74a7 10140 npar = r->nparens+1;
f0ab9afb
NC
10141 Newx(ret->offs, npar, regexp_paren_pair);
10142 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 10143 if(ret->swap) {
28d8d7f4 10144 /* no need to copy these */
f0ab9afb 10145 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 10146 }
84da74a7 10147
6057429f 10148 if (ret->substrs) {
32cd70f6
NC
10149 /* Do it this way to avoid reading from *r after the StructCopy().
10150 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10151 cache, it doesn't matter. */
66b1de87
NC
10152 const bool anchored = r->check_substr
10153 ? r->check_substr == r->anchored_substr
10154 : r->check_utf8 == r->anchored_utf8;
785a26d5 10155 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
10156 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10157
32cd70f6
NC
10158 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10159 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10160 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10161 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 10162
32cd70f6
NC
10163 /* check_substr and check_utf8, if non-NULL, point to either their
10164 anchored or float namesakes, and don't hold a second reference. */
10165
10166 if (ret->check_substr) {
10167 if (anchored) {
10168 assert(r->check_utf8 == r->anchored_utf8);
10169 ret->check_substr = ret->anchored_substr;
10170 ret->check_utf8 = ret->anchored_utf8;
10171 } else {
10172 assert(r->check_substr == r->float_substr);
10173 assert(r->check_utf8 == r->float_utf8);
10174 ret->check_substr = ret->float_substr;
10175 ret->check_utf8 = ret->float_utf8;
10176 }
66b1de87
NC
10177 } else if (ret->check_utf8) {
10178 if (anchored) {
10179 ret->check_utf8 = ret->anchored_utf8;
10180 } else {
10181 ret->check_utf8 = ret->float_utf8;
10182 }
32cd70f6 10183 }
6057429f 10184 }
f8149455 10185
5daac39c 10186 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 10187
6057429f 10188 if (ret->pprivate)
288b8c02 10189 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 10190
288b8c02 10191 if (RX_MATCH_COPIED(dstr))
6057429f 10192 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
10193 else
10194 ret->subbeg = NULL;
10195#ifdef PERL_OLD_COPY_ON_WRITE
10196 ret->saved_copy = NULL;
10197#endif
6057429f 10198
c2123ae3
NC
10199 if (ret->mother_re) {
10200 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10201 /* Our storage points directly to our mother regexp, but that's
10202 1: a buffer in a different thread
10203 2: something we no longer hold a reference on
10204 so we need to copy it locally. */
10205 /* Note we need to sue SvCUR() on our mother_re, because it, in
10206 turn, may well be pointing to its own mother_re. */
10207 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10208 SvCUR(ret->mother_re)+1));
10209 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10210 }
10211 ret->mother_re = NULL;
10212 }
6057429f 10213 ret->gofs = 0;
f8149455
YO
10214}
10215#endif /* PERL_IN_XSUB_RE */
10216
10217/*
10218 regdupe_internal()
10219
10220 This is the internal complement to regdupe() which is used to copy
10221 the structure pointed to by the *pprivate pointer in the regexp.
10222 This is the core version of the extension overridable cloning hook.
10223 The regexp structure being duplicated will be copied by perl prior
10224 to this and will be provided as the regexp *r argument, however
10225 with the /old/ structures pprivate pointer value. Thus this routine
10226 may override any copying normally done by perl.
10227
10228 It returns a pointer to the new regexp_internal structure.
10229*/
10230
10231void *
288b8c02 10232Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10233{
10234 dVAR;
288b8c02 10235 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10236 regexp_internal *reti;
10237 int len, npar;
10238 RXi_GET_DECL(r,ri);
7918f24d
NC
10239
10240 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10241
10242 npar = r->nparens+1;
7122b237 10243 len = ProgLen(ri);
f8149455 10244
45cf4570 10245 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10246 Copy(ri->program, reti->program, len+1, regnode);
10247
f8149455 10248
f8fc2ecf 10249 reti->regstclass = NULL;
bcdf7404 10250
f8fc2ecf 10251 if (ri->data) {
84da74a7 10252 struct reg_data *d;
f8fc2ecf 10253 const int count = ri->data->count;
84da74a7
YO
10254 int i;
10255
10256 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10257 char, struct reg_data);
10258 Newx(d->what, count, U8);
10259
10260 d->count = count;
10261 for (i = 0; i < count; i++) {
f8fc2ecf 10262 d->what[i] = ri->data->what[i];
84da74a7 10263 switch (d->what[i]) {
af534a04 10264 /* legal options are one of: sSfpontTua
84da74a7 10265 see also regcomp.h and pregfree() */
af534a04 10266 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10267 case 's':
81714fb9 10268 case 'S':
0536c0a7 10269 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10270 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10271 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10272 break;
84da74a7
YO
10273 case 'f':
10274 /* This is cheating. */
10275 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10276 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10277 struct regnode_charclass_class);
f8fc2ecf 10278 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10279 break;
10280 case 'o':
bbe252da
YO
10281 /* Compiled op trees are readonly and in shared memory,
10282 and can thus be shared without duplication. */
84da74a7 10283 OP_REFCNT_LOCK;
f8fc2ecf 10284 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10285 OP_REFCNT_UNLOCK;
10286 break;
23eab42c
NC
10287 case 'T':
10288 /* Trie stclasses are readonly and can thus be shared
10289 * without duplication. We free the stclass in pregfree
10290 * when the corresponding reg_ac_data struct is freed.
10291 */
10292 reti->regstclass= ri->regstclass;
10293 /* Fall through */
84da74a7 10294 case 't':
84da74a7 10295 OP_REFCNT_LOCK;
0536c0a7 10296 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10297 OP_REFCNT_UNLOCK;
0536c0a7
NC
10298 /* Fall through */
10299 case 'n':
10300 d->data[i] = ri->data->data[i];
84da74a7 10301 break;
84da74a7 10302 default:
f8fc2ecf 10303 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10304 }
10305 }
10306
f8fc2ecf 10307 reti->data = d;
84da74a7
YO
10308 }
10309 else
f8fc2ecf 10310 reti->data = NULL;
84da74a7 10311
cde0cee5
YO
10312 reti->name_list_idx = ri->name_list_idx;
10313
7122b237
YO
10314#ifdef RE_TRACK_PATTERN_OFFSETS
10315 if (ri->u.offsets) {
10316 Newx(reti->u.offsets, 2*len+1, U32);
10317 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10318 }
10319#else
10320 SetProgLen(reti,len);
10321#endif
10322
f8149455 10323 return (void*)reti;
84da74a7 10324}
f8149455
YO
10325
10326#endif /* USE_ITHREADS */
84da74a7 10327
f8149455 10328#ifndef PERL_IN_XSUB_RE
bcdf7404 10329
c277df42
IZ
10330/*
10331 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10332 */
10333regnode *
864dbfa3 10334Perl_regnext(pTHX_ register regnode *p)
c277df42 10335{
97aff369 10336 dVAR;
c277df42
IZ
10337 register I32 offset;
10338
f8fc2ecf 10339 if (!p)
c277df42
IZ
10340 return(NULL);
10341
35db910f
KW
10342 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10343 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10344 }
10345
c277df42
IZ
10346 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10347 if (offset == 0)
10348 return(NULL);
10349
c277df42 10350 return(p+offset);
c277df42 10351}
76234dfb 10352#endif
c277df42 10353
01f988be 10354STATIC void
cea2e8a9 10355S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10356{
10357 va_list args;
10358 STRLEN l1 = strlen(pat1);
10359 STRLEN l2 = strlen(pat2);
10360 char buf[512];
06bf62c7 10361 SV *msv;
73d840c0 10362 const char *message;
c277df42 10363
7918f24d
NC
10364 PERL_ARGS_ASSERT_RE_CROAK2;
10365
c277df42
IZ
10366 if (l1 > 510)
10367 l1 = 510;
10368 if (l1 + l2 > 510)
10369 l2 = 510 - l1;
10370 Copy(pat1, buf, l1 , char);
10371 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10372 buf[l1 + l2] = '\n';
10373 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10374#ifdef I_STDARG
10375 /* ANSI variant takes additional second argument */
c277df42 10376 va_start(args, pat2);
8736538c
AS
10377#else
10378 va_start(args);
10379#endif
5a844595 10380 msv = vmess(buf, &args);
c277df42 10381 va_end(args);
cfd0369c 10382 message = SvPV_const(msv,l1);
c277df42
IZ
10383 if (l1 > 512)
10384 l1 = 512;
10385 Copy(message, buf, l1 , char);
197cf9b9 10386 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10387 Perl_croak(aTHX_ "%s", buf);
c277df42 10388}
a0ed51b3
LW
10389
10390/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10391
76234dfb 10392#ifndef PERL_IN_XSUB_RE
a0ed51b3 10393void
864dbfa3 10394Perl_save_re_context(pTHX)
b81d288d 10395{
97aff369 10396 dVAR;
1ade1aa1
NC
10397
10398 struct re_save_state *state;
10399
10400 SAVEVPTR(PL_curcop);
10401 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10402
10403 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10404 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10405 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10406
46ab3289 10407 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10408
a0ed51b3 10409 PL_reg_start_tmp = 0;
a0ed51b3 10410 PL_reg_start_tmpl = 0;
c445ea15 10411 PL_reg_oldsaved = NULL;
a5db57d6 10412 PL_reg_oldsavedlen = 0;
a5db57d6 10413 PL_reg_maxiter = 0;
a5db57d6 10414 PL_reg_leftiter = 0;
c445ea15 10415 PL_reg_poscache = NULL;
a5db57d6 10416 PL_reg_poscache_size = 0;
1ade1aa1
NC
10417#ifdef PERL_OLD_COPY_ON_WRITE
10418 PL_nrs = NULL;
10419#endif
ada6e8a9 10420
c445ea15
AL
10421 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10422 if (PL_curpm) {
10423 const REGEXP * const rx = PM_GETRE(PL_curpm);
10424 if (rx) {
1df70142 10425 U32 i;
07bc277f 10426 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10427 char digits[TYPE_CHARS(long)];
d9fad198 10428 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10429 GV *const *const gvp
10430 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10431
b37c2d43
AL
10432 if (gvp) {
10433 GV * const gv = *gvp;
10434 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10435 save_scalar(gv);
49f27e4b 10436 }
ada6e8a9
AMS
10437 }
10438 }
10439 }
a0ed51b3 10440}
76234dfb 10441#endif
51371543 10442
51371543 10443static void
acfe0abc 10444clear_re(pTHX_ void *r)
51371543 10445{
97aff369 10446 dVAR;
84679df5 10447 ReREFCNT_dec((REGEXP *)r);
51371543 10448}
ffbc6a93 10449
a28509cc
AL
10450#ifdef DEBUGGING
10451
10452STATIC void
10453S_put_byte(pTHX_ SV *sv, int c)
10454{
7918f24d
NC
10455 PERL_ARGS_ASSERT_PUT_BYTE;
10456
7fddd944
NC
10457 /* Our definition of isPRINT() ignores locales, so only bytes that are
10458 not part of UTF-8 are considered printable. I assume that the same
10459 holds for UTF-EBCDIC.
10460 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10461 which Wikipedia says:
10462
10463 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10464 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10465 identical, to the ASCII delete (DEL) or rubout control character.
10466 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
10467 if (!isPRINT(c)) {
10468 if (c < 256) {
10469 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10470 }
10471 else {
10472 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10473 }
10474 }
5e7aa789 10475 else {
88c9ea1e 10476 const char string = c;
5e7aa789
NC
10477 if (c == '-' || c == ']' || c == '\\' || c == '^')
10478 sv_catpvs(sv, "\\");
10479 sv_catpvn(sv, &string, 1);
10480 }
a28509cc
AL
10481}
10482
786e8c11 10483
3dab1dad
YO
10484#define CLEAR_OPTSTART \
10485 if (optstart) STMT_START { \
70685ca0 10486 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10487 optstart=NULL; \
10488 } STMT_END
10489
786e8c11 10490#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10491
b5a2f8d8
NC
10492STATIC const regnode *
10493S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10494 const regnode *last, const regnode *plast,
10495 SV* sv, I32 indent, U32 depth)
a28509cc 10496{
97aff369 10497 dVAR;
786e8c11 10498 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10499 register const regnode *next;
3dab1dad 10500 const regnode *optstart= NULL;
1f1031fe 10501
f8fc2ecf 10502 RXi_GET_DECL(r,ri);
3dab1dad 10503 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10504
10505 PERL_ARGS_ASSERT_DUMPUNTIL;
10506
786e8c11
YO
10507#ifdef DEBUG_DUMPUNTIL
10508 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10509 last ? last-start : 0,plast ? plast-start : 0);
10510#endif
10511
10512 if (plast && plast < last)
10513 last= plast;
10514
10515 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10516 /* While that wasn't END last time... */
a28509cc
AL
10517 NODE_ALIGN(node);
10518 op = OP(node);
de734bd5 10519 if (op == CLOSE || op == WHILEM)
786e8c11 10520 indent--;
b5a2f8d8 10521 next = regnext((regnode *)node);
1f1031fe 10522
a28509cc 10523 /* Where, what. */
8e11feef 10524 if (OP(node) == OPTIMIZED) {
e68ec53f 10525 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10526 optstart = node;
3dab1dad 10527 else
8e11feef 10528 goto after_print;
3dab1dad
YO
10529 } else
10530 CLEAR_OPTSTART;
1f1031fe 10531
32fc9b6a 10532 regprop(r, sv, node);
a28509cc 10533 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10534 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10535
10536 if (OP(node) != OPTIMIZED) {
10537 if (next == NULL) /* Next ptr. */
10538 PerlIO_printf(Perl_debug_log, " (0)");
10539 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10540 PerlIO_printf(Perl_debug_log, " (FAIL)");
10541 else
10542 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10543 (void)PerlIO_putc(Perl_debug_log, '\n');
10544 }
10545
a28509cc
AL
10546 after_print:
10547 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10548 assert(next);
10549 {
10550 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10551 ? regnext((regnode *)next)
10552 : next);
be8e71aa
YO
10553 if (last && nnode > last)
10554 nnode = last;
786e8c11 10555 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10556 }
a28509cc
AL
10557 }
10558 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10559 assert(next);
786e8c11 10560 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10561 }
10562 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10563 const regnode *this_trie = node;
1de06328 10564 const char op = OP(node);
647f639f 10565 const U32 n = ARG(node);
1de06328 10566 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10567 (reg_ac_data *)ri->data->data[n] :
1de06328 10568 NULL;
3251b653
NC
10569 const reg_trie_data * const trie =
10570 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10571#ifdef DEBUGGING
502c6561 10572 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10573#endif
786e8c11 10574 const regnode *nextbranch= NULL;
a28509cc 10575 I32 word_idx;
76f68e9b 10576 sv_setpvs(sv, "");
786e8c11 10577 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10578 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10579
10580 PerlIO_printf(Perl_debug_log, "%*s%s ",
10581 (int)(2*(indent+3)), "",
10582 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10583 PL_colors[0], PL_colors[1],
10584 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10585 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10586 PERL_PV_PRETTY_LTGT
786e8c11
YO
10587 )
10588 : "???"
10589 );
10590 if (trie->jump) {
40d049e4 10591 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10592 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10593 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10594 if (dist) {
10595 if (!nextbranch)
24b23f37 10596 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10597 DUMPUNTIL(this_trie + dist, nextbranch);
10598 }
786e8c11
YO
10599 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10600 nextbranch= regnext((regnode *)nextbranch);
10601 } else {
10602 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10603 }
786e8c11
YO
10604 }
10605 if (last && next > last)
10606 node= last;
10607 else
10608 node= next;
a28509cc 10609 }
786e8c11
YO
10610 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10611 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10612 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10613 }
10614 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10615 assert(next);
786e8c11 10616 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10617 }
10618 else if ( op == PLUS || op == STAR) {
786e8c11 10619 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10620 }
10621 else if (op == ANYOF) {
10622 /* arglen 1 + class block */
4a3ee7a8 10623 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
10624 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10625 node = NEXTOPER(node);
10626 }
10627 else if (PL_regkind[(U8)op] == EXACT) {
10628 /* Literal string, where present. */
10629 node += NODE_SZ_STR(node) - 1;
10630 node = NEXTOPER(node);
10631 }
10632 else {
10633 node = NEXTOPER(node);
10634 node += regarglen[(U8)op];
10635 }
10636 if (op == CURLYX || op == OPEN)
786e8c11 10637 indent++;
a28509cc 10638 }
3dab1dad 10639 CLEAR_OPTSTART;
786e8c11 10640#ifdef DEBUG_DUMPUNTIL
70685ca0 10641 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10642#endif
1de06328 10643 return node;
a28509cc
AL
10644}
10645
10646#endif /* DEBUGGING */
10647
241d1a3b
NC
10648/*
10649 * Local variables:
10650 * c-indentation-style: bsd
10651 * c-basic-offset: 4
10652 * indent-tabs-mode: t
10653 * End:
10654 *
37442d52
RGS
10655 * ex: set ts=8 sts=4 sw=4 noet:
10656 */