This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make 0 not a special value for COP_SEQ_RANGE_HIGH
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
830247a4
IZ
145#if ADD_TO_REGEXEC
146 char *starttry; /* -Dr: where regtry was called. */
147#define RExC_starttry (pRExC_state->starttry)
148#endif
3dab1dad 149#ifdef DEBUGGING
be8e71aa 150 const char *lastparse;
3dab1dad 151 I32 lastnum;
1f1031fe 152 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
153#define RExC_lastparse (pRExC_state->lastparse)
154#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 155#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 156#endif
830247a4
IZ
157} RExC_state_t;
158
e2509266 159#define RExC_flags (pRExC_state->flags)
830247a4 160#define RExC_precomp (pRExC_state->precomp)
288b8c02 161#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 162#define RExC_rx (pRExC_state->rx)
f8fc2ecf 163#define RExC_rxi (pRExC_state->rxi)
fac92740 164#define RExC_start (pRExC_state->start)
830247a4
IZ
165#define RExC_end (pRExC_state->end)
166#define RExC_parse (pRExC_state->parse)
167#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
168#ifdef RE_TRACK_PATTERN_OFFSETS
169#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
170#endif
830247a4 171#define RExC_emit (pRExC_state->emit)
fac92740 172#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 173#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
174#define RExC_naughty (pRExC_state->naughty)
175#define RExC_sawback (pRExC_state->sawback)
176#define RExC_seen (pRExC_state->seen)
177#define RExC_size (pRExC_state->size)
178#define RExC_npar (pRExC_state->npar)
e2e6a0f1 179#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
180#define RExC_extralen (pRExC_state->extralen)
181#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 183#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 184#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 185#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
186#define RExC_open_parens (pRExC_state->open_parens)
187#define RExC_close_parens (pRExC_state->close_parens)
188#define RExC_opend (pRExC_state->opend)
81714fb9 189#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
190#define RExC_recurse (pRExC_state->recurse)
191#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 192#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
830247a4 193
cde0cee5 194
a687059c
LW
195#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
a687059c 198
35c8bce7
LW
199#ifdef SPSTART
200#undef SPSTART /* dratted cpp namespace... */
201#endif
a687059c
LW
202/*
203 * Flags to be passed up and down.
204 */
a687059c 205#define WORST 0 /* Worst case. */
a3b492c3 206#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
207
208/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 210#define SIMPLE 0x02
a3b492c3
YO
211#define SPSTART 0x04 /* Starts with * or +. */
212#define TRYAGAIN 0x08 /* Weeded out a declaration. */
213#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 214
3dab1dad
YO
215#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
216
07be1b83
YO
217/* whether trie related optimizations are enabled */
218#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219#define TRIE_STUDY_OPT
786e8c11 220#define FULL_TRIE_STUDY
07be1b83
YO
221#define TRIE_STCLASS
222#endif
1de06328
YO
223
224
40d049e4
YO
225
226#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227#define PBITVAL(paren) (1 << ((paren) & 7))
228#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
231
bbd61b5f
KW
232/* If not already in utf8, do a longjmp back to the beginning */
233#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234#define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
236 } STMT_END
40d049e4 237
1de06328
YO
238/* About scan_data_t.
239
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
3b753521 244 string that must appear at a fixed location, and we look for the
1de06328
YO
245 longest string that may appear at a floating location. So for instance
246 in the pattern:
247
248 /FOO[xX]A.*B[xX]BAR/
249
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
253
254 The strings can be composites, for instance
255
256 /(f)(o)(o)/
257
258 will result in a composite fixed substring 'foo'.
259
260 For each string some basic information is maintained:
261
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
1de06328
YO
267 tells us how many characters must appear after the string we have
268 found.
269
270 - max_offset
271 Only used for floating strings. This is the rightmost point that
3b753521 272 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
273 string can occur infinitely far to the right.
274
275 - minlenp
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
279 involved. Consider
280
281 /(?=FOO).*F/
282
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
486ec47a 289 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
293
294 - lookbehind
295
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
306 associated string.
307
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
310
311*/
2c2d71f5
JH
312
313typedef struct scan_data_t {
1de06328
YO
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
2c2d71f5
JH
316 I32 pos_min;
317 I32 pos_delta;
318 SV *last_found;
1de06328 319 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
320 I32 last_start_min;
321 I32 last_start_max;
1de06328
YO
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
486ec47a 325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 331 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
332 I32 flags;
333 I32 whilem_c;
cb434fcc 334 I32 *last_closep;
653099ff 335 struct regnode_charclass_class *start_class;
2c2d71f5
JH
336} scan_data_t;
337
a687059c 338/*
e50aee73 339 * Forward declarations for pregcomp()'s friends.
a687059c 340 */
a0d0e21e 341
27da23d5 342static const scan_data_t zero_scan_data =
1de06328 343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
344
345#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
346#define SF_BEFORE_SEOL 0x0001
347#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
348#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
350
09b7f37c
CB
351#ifdef NO_UNARY_PLUS
352# define SF_FIX_SHIFT_EOL (0+2)
353# define SF_FL_SHIFT_EOL (0+4)
354#else
355# define SF_FIX_SHIFT_EOL (+2)
356# define SF_FL_SHIFT_EOL (+4)
357#endif
c277df42
IZ
358
359#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
361
362#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
364#define SF_IS_INF 0x0040
365#define SF_HAS_PAR 0x0080
366#define SF_IN_PAR 0x0100
367#define SF_HAS_EVAL 0x0200
368#define SCF_DO_SUBSTR 0x0400
653099ff
GS
369#define SCF_DO_STCLASS_AND 0x0800
370#define SCF_DO_STCLASS_OR 0x1000
371#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 372#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 373
786e8c11 374#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 375#define SCF_SEEN_ACCEPT 0x8000
07be1b83 376
43fead97 377#define UTF cBOOL(RExC_utf8)
a62b1201
KW
378#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
380#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 383
43fead97 384#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 385
ffc61ed2 386#define OOB_UNICODE 12345678
93733859 387#define OOB_NAMEDCLASS -1
b8c5462f 388
a0ed51b3
LW
389#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
390#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
391
8615cb43 392
b45f050a
JF
393/* length of regex to show in messages that don't mark a position within */
394#define RegexLengthToShowInErrorMessages 127
395
396/*
397 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
398 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
399 * op/pragma/warn/regcomp.
400 */
7253e4e3
RK
401#define MARKER1 "<-- HERE" /* marker as it appears in the description */
402#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 403
7253e4e3 404#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
408 * arg. Show regex, up to a maximum length. If it's too long, chop and add
409 * "...".
410 */
58e23c8d 411#define _FAIL(code) STMT_START { \
bfed75c6 412 const char *ellipses = ""; \
ccb2c380
MP
413 IV len = RExC_end - RExC_precomp; \
414 \
415 if (!SIZE_ONLY) \
288b8c02 416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
417 if (len > RegexLengthToShowInErrorMessages) { \
418 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
419 len = RegexLengthToShowInErrorMessages - 10; \
420 ellipses = "..."; \
421 } \
58e23c8d 422 code; \
ccb2c380 423} STMT_END
8615cb43 424
58e23c8d
YO
425#define FAIL(msg) _FAIL( \
426 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
427 msg, (int)len, RExC_precomp, ellipses))
428
429#define FAIL2(msg,arg) _FAIL( \
430 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
431 arg, (int)len, RExC_precomp, ellipses))
432
b45f050a 433/*
b45f050a
JF
434 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
435 */
ccb2c380 436#define Simple_vFAIL(m) STMT_START { \
a28509cc 437 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
438 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
439 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
444 */
ccb2c380
MP
445#define vFAIL(m) STMT_START { \
446 if (!SIZE_ONLY) \
288b8c02 447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
448 Simple_vFAIL(m); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Like Simple_vFAIL(), but accepts two arguments.
453 */
ccb2c380 454#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a
JF
459
460/*
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
462 */
ccb2c380
MP
463#define vFAIL2(m,a1) STMT_START { \
464 if (!SIZE_ONLY) \
288b8c02 465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
466 Simple_vFAIL2(m, a1); \
467} STMT_END
b45f050a
JF
468
469
470/*
471 * Like Simple_vFAIL(), but accepts three arguments.
472 */
ccb2c380 473#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 474 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
b45f050a
JF
478
479/*
480 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
481 */
ccb2c380
MP
482#define vFAIL3(m,a1,a2) STMT_START { \
483 if (!SIZE_ONLY) \
288b8c02 484 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
485 Simple_vFAIL3(m, a1, a2); \
486} STMT_END
b45f050a
JF
487
488/*
489 * Like Simple_vFAIL(), but accepts four arguments.
490 */
ccb2c380 491#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 492 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
493 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
b45f050a 496
668c081a 497#define ckWARNreg(loc,m) STMT_START { \
a28509cc 498 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
499 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
501} STMT_END
502
668c081a 503#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 504 const IV offset = loc - RExC_precomp; \
d1d15184 505 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
506 m REPORT_LOCATION, \
507 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
508} STMT_END
509
668c081a 510#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 511 const IV offset = loc - RExC_precomp; \
668c081a 512 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
513 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
514} STMT_END
515
516#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 517 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
518 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
520} STMT_END
521
668c081a
NC
522#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
526} STMT_END
527
ccb2c380 528#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 529 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
530 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
532} STMT_END
533
668c081a
NC
534#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
538} STMT_END
539
ccb2c380 540#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 541 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
542 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
543 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
544} STMT_END
9d1d55b5 545
8615cb43 546
cd439c50 547/* Allow for side effects in s */
ccb2c380
MP
548#define REGC(c,s) STMT_START { \
549 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
550} STMT_END
cd439c50 551
fac92740
MJD
552/* Macros for recording node offsets. 20001227 mjd@plover.com
553 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
554 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
555 * Element 0 holds the number n.
07be1b83 556 * Position is 1 indexed.
fac92740 557 */
7122b237
YO
558#ifndef RE_TRACK_PATTERN_OFFSETS
559#define Set_Node_Offset_To_R(node,byte)
560#define Set_Node_Offset(node,byte)
561#define Set_Cur_Node_Offset
562#define Set_Node_Length_To_R(node,len)
563#define Set_Node_Length(node,len)
564#define Set_Node_Cur_Length(node)
565#define Node_Offset(n)
566#define Node_Length(n)
567#define Set_Node_Offset_Length(node,offset,len)
568#define ProgLen(ri) ri->u.proglen
569#define SetProgLen(ri,x) ri->u.proglen = x
570#else
571#define ProgLen(ri) ri->u.offsets[0]
572#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
573#define Set_Node_Offset_To_R(node,byte) STMT_START { \
574 if (! SIZE_ONLY) { \
575 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 576 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 577 if((node) < 0) { \
551405c4 578 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
579 } else { \
580 RExC_offsets[2*(node)-1] = (byte); \
581 } \
582 } \
583} STMT_END
584
585#define Set_Node_Offset(node,byte) \
586 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
587#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
588
589#define Set_Node_Length_To_R(node,len) STMT_START { \
590 if (! SIZE_ONLY) { \
591 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 592 __LINE__, (int)(node), (int)(len))); \
ccb2c380 593 if((node) < 0) { \
551405c4 594 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
595 } else { \
596 RExC_offsets[2*(node)] = (len); \
597 } \
598 } \
599} STMT_END
600
601#define Set_Node_Length(node,len) \
602 Set_Node_Length_To_R((node)-RExC_emit_start, len)
603#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
604#define Set_Node_Cur_Length(node) \
605 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
606
607/* Get offsets and lengths */
608#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
609#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
610
07be1b83
YO
611#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
612 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
613 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
614} STMT_END
7122b237 615#endif
07be1b83
YO
616
617#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
618#define EXPERIMENTAL_INPLACESCAN
f427392e 619#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 620
304ee84b
YO
621#define DEBUG_STUDYDATA(str,data,depth) \
622DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 623 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
624 "%*s" str "Pos:%"IVdf"/%"IVdf \
625 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
626 (int)(depth)*2, "", \
627 (IV)((data)->pos_min), \
628 (IV)((data)->pos_delta), \
304ee84b 629 (UV)((data)->flags), \
1de06328 630 (IV)((data)->whilem_c), \
304ee84b
YO
631 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
632 is_inf ? "INF " : "" \
1de06328
YO
633 ); \
634 if ((data)->last_found) \
635 PerlIO_printf(Perl_debug_log, \
636 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
637 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
638 SvPVX_const((data)->last_found), \
639 (IV)((data)->last_end), \
640 (IV)((data)->last_start_min), \
641 (IV)((data)->last_start_max), \
642 ((data)->longest && \
643 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
644 SvPVX_const((data)->longest_fixed), \
645 (IV)((data)->offset_fixed), \
646 ((data)->longest && \
647 (data)->longest==&((data)->longest_float)) ? "*" : "", \
648 SvPVX_const((data)->longest_float), \
649 (IV)((data)->offset_float_min), \
650 (IV)((data)->offset_float_max) \
651 ); \
652 PerlIO_printf(Perl_debug_log,"\n"); \
653});
654
acfe0abc 655static void clear_re(pTHX_ void *r);
4327152a 656
653099ff 657/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 658 Update the longest found anchored substring and the longest found
653099ff
GS
659 floating substrings if needed. */
660
4327152a 661STATIC void
304ee84b 662S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 663{
e1ec3a88
AL
664 const STRLEN l = CHR_SVLEN(data->last_found);
665 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 666 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 667
7918f24d
NC
668 PERL_ARGS_ASSERT_SCAN_COMMIT;
669
c277df42 670 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 671 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
672 if (*data->longest == data->longest_fixed) {
673 data->offset_fixed = l ? data->last_start_min : data->pos_min;
674 if (data->flags & SF_BEFORE_EOL)
b81d288d 675 data->flags
c277df42
IZ
676 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
677 else
678 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
679 data->minlen_fixed=minlenp;
680 data->lookbehind_fixed=0;
a0ed51b3 681 }
304ee84b 682 else { /* *data->longest == data->longest_float */
c277df42 683 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
684 data->offset_float_max = (l
685 ? data->last_start_max
c277df42 686 : data->pos_min + data->pos_delta);
304ee84b 687 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 688 data->offset_float_max = I32_MAX;
c277df42 689 if (data->flags & SF_BEFORE_EOL)
b81d288d 690 data->flags
c277df42
IZ
691 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
692 else
693 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
694 data->minlen_float=minlenp;
695 data->lookbehind_float=0;
c277df42
IZ
696 }
697 }
698 SvCUR_set(data->last_found, 0);
0eda9292 699 {
a28509cc 700 SV * const sv = data->last_found;
097eb12c
AL
701 if (SvUTF8(sv) && SvMAGICAL(sv)) {
702 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
703 if (mg)
704 mg->mg_len = 0;
705 }
0eda9292 706 }
c277df42
IZ
707 data->last_end = -1;
708 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 709 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
710}
711
653099ff
GS
712/* Can match anything (initialization) */
713STATIC void
097eb12c 714S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 715{
7918f24d
NC
716 PERL_ARGS_ASSERT_CL_ANYTHING;
717
653099ff 718 ANYOF_CLASS_ZERO(cl);
f8bef550 719 ANYOF_BITMAP_SETALL(cl);
11454c59 720 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
721 if (LOC)
722 cl->flags |= ANYOF_LOCALE;
723}
724
725/* Can match anything (initialization) */
726STATIC int
5f66b61c 727S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
728{
729 int value;
730
7918f24d
NC
731 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
732
aaa51d5e 733 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
734 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
735 return 1;
1aa99e6b
IH
736 if (!(cl->flags & ANYOF_UNICODE_ALL))
737 return 0;
10edeb5d 738 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 739 return 0;
653099ff
GS
740 return 1;
741}
742
743/* Can match anything (initialization) */
744STATIC void
097eb12c 745S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752}
753
754STATIC void
097eb12c 755S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 756{
7918f24d
NC
757 PERL_ARGS_ASSERT_CL_INIT_ZERO;
758
8ecf7187 759 Zero(cl, 1, struct regnode_charclass_class);
653099ff 760 cl->type = ANYOF;
830247a4 761 cl_anything(pRExC_state, cl);
653099ff
GS
762 if (LOC)
763 cl->flags |= ANYOF_LOCALE;
764}
765
766/* 'And' a given class with another one. Can create false positives */
767/* We assume that cl is not inverted */
768STATIC void
5f66b61c 769S_cl_and(struct regnode_charclass_class *cl,
a28509cc 770 const struct regnode_charclass_class *and_with)
653099ff 771{
7918f24d 772 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
773
774 assert(and_with->type == ANYOF);
1e6ade67
KW
775
776 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
777 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 778 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
779 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
780 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
781 int i;
782
783 if (and_with->flags & ANYOF_INVERT)
784 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
785 cl->bitmap[i] &= ~and_with->bitmap[i];
786 else
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] &= and_with->bitmap[i];
789 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
790 if (!(and_with->flags & ANYOF_EOS))
791 cl->flags &= ~ANYOF_EOS;
1aa99e6b 792
39065660
KW
793 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
794 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
795 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
796 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 797
4713bfe1
KW
798 if (cl->flags & ANYOF_UNICODE_ALL
799 && and_with->flags & ANYOF_NONBITMAP
800 && !(and_with->flags & ANYOF_INVERT))
801 {
802 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
803 cl->flags &= ~ANYOF_UNICODE_ALL;
804 }
ef87b810
KW
805 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
806 only the one(s)
807 actually set */
1aa99e6b
IH
808 ARG_SET(cl, ARG(and_with));
809 }
14ebb1a2
JH
810 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
811 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 812 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 813 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 814 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 815 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
816}
817
818/* 'OR' a given class with another one. Can create false positives */
819/* We assume that cl is not inverted */
820STATIC void
097eb12c 821S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 822{
7918f24d
NC
823 PERL_ARGS_ASSERT_CL_OR;
824
653099ff
GS
825 if (or_with->flags & ANYOF_INVERT) {
826 /* We do not use
827 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
828 * <= (B1 | !B2) | (CL1 | !CL2)
829 * which is wasteful if CL2 is small, but we ignore CL2:
830 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
831 * XXXX Can we handle case-fold? Unclear:
832 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
833 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
834 */
835 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
836 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
837 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
838 int i;
839
840 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
841 cl->bitmap[i] |= ~or_with->bitmap[i];
842 } /* XXXX: logic is complicated otherwise */
843 else {
830247a4 844 cl_anything(pRExC_state, cl);
653099ff
GS
845 }
846 } else {
847 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
848 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
849 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
850 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
851 int i;
852
853 /* OR char bitmap and class bitmap separately */
854 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
855 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 856 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
857 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
858 cl->classflags[i] |= or_with->classflags[i];
859 cl->flags |= ANYOF_CLASS;
860 }
861 }
862 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 863 cl_anything(pRExC_state, cl);
653099ff
GS
864 }
865 }
866 if (or_with->flags & ANYOF_EOS)
867 cl->flags |= ANYOF_EOS;
11454c59
KW
868 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
869 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 870
39065660
KW
871 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
872 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 873
9826f543
KW
874 /* If both nodes match something outside the bitmap, but what they match
875 * outside is not the same pointer, and hence not easily compared, give up
876 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 877 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
878 ARG(cl) != ARG(or_with)) {
879 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 880 }
9826f543 881
1aa99e6b
IH
882 if (or_with->flags & ANYOF_UNICODE_ALL) {
883 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 884 }
653099ff
GS
885}
886
a3621e74
YO
887#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
888#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
889#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
890#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
891
3dab1dad
YO
892
893#ifdef DEBUGGING
07be1b83 894/*
2b8b4781
NC
895 dump_trie(trie,widecharmap,revcharmap)
896 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
897 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
898
899 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
900 The _interim_ variants are used for debugging the interim
901 tables that are used to generate the final compressed
902 representation which is what dump_trie expects.
903
486ec47a 904 Part of the reason for their existence is to provide a form
3dab1dad 905 of documentation as to how the different representations function.
07be1b83
YO
906
907*/
3dab1dad
YO
908
909/*
3dab1dad
YO
910 Dumps the final compressed table form of the trie to Perl_debug_log.
911 Used for debugging make_trie().
912*/
b9a59e08 913
3dab1dad 914STATIC void
2b8b4781
NC
915S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
916 AV *revcharmap, U32 depth)
3dab1dad
YO
917{
918 U32 state;
ab3bbdeb 919 SV *sv=sv_newmortal();
55eed653 920 int colwidth= widecharmap ? 6 : 4;
2e64971a 921 U16 word;
3dab1dad
YO
922 GET_RE_DEBUG_FLAGS_DECL;
923
7918f24d 924 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 925
3dab1dad
YO
926 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
927 (int)depth * 2 + 2,"",
928 "Match","Base","Ofs" );
929
930 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 931 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 932 if ( tmp ) {
ab3bbdeb
YO
933 PerlIO_printf( Perl_debug_log, "%*s",
934 colwidth,
ddc5bc0f 935 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
936 PL_colors[0], PL_colors[1],
937 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
938 PERL_PV_ESCAPE_FIRSTCHAR
939 )
940 );
3dab1dad
YO
941 }
942 }
943 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
944 (int)depth * 2 + 2,"");
945
946 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 947 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
948 PerlIO_printf( Perl_debug_log, "\n");
949
1e2e3d02 950 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 951 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
952
953 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
954
955 if ( trie->states[ state ].wordnum ) {
956 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
957 } else {
958 PerlIO_printf( Perl_debug_log, "%6s", "" );
959 }
960
961 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
962
963 if ( base ) {
964 U32 ofs = 0;
965
966 while( ( base + ofs < trie->uniquecharcount ) ||
967 ( base + ofs - trie->uniquecharcount < trie->lasttrans
968 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
969 ofs++;
970
971 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
972
973 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
974 if ( ( base + ofs >= trie->uniquecharcount ) &&
975 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
976 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
977 {
ab3bbdeb
YO
978 PerlIO_printf( Perl_debug_log, "%*"UVXf,
979 colwidth,
3dab1dad
YO
980 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
981 } else {
ab3bbdeb 982 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "]");
987
988 }
989 PerlIO_printf( Perl_debug_log, "\n" );
990 }
2e64971a
DM
991 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
992 for (word=1; word <= trie->wordcount; word++) {
993 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
994 (int)word, (int)(trie->wordinfo[word].prev),
995 (int)(trie->wordinfo[word].len));
996 }
997 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
998}
999/*
3dab1dad
YO
1000 Dumps a fully constructed but uncompressed trie in list form.
1001 List tries normally only are used for construction when the number of
1002 possible chars (trie->uniquecharcount) is very high.
1003 Used for debugging make_trie().
1004*/
1005STATIC void
55eed653 1006S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1007 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1008 U32 depth)
3dab1dad
YO
1009{
1010 U32 state;
ab3bbdeb 1011 SV *sv=sv_newmortal();
55eed653 1012 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1013 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1014
1015 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1016
3dab1dad 1017 /* print out the table precompression. */
ab3bbdeb
YO
1018 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1019 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1020 "------:-----+-----------------\n" );
3dab1dad
YO
1021
1022 for( state=1 ; state < next_alloc ; state ++ ) {
1023 U16 charid;
1024
ab3bbdeb 1025 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1026 (int)depth * 2 + 2,"", (UV)state );
1027 if ( ! trie->states[ state ].wordnum ) {
1028 PerlIO_printf( Perl_debug_log, "%5s| ","");
1029 } else {
1030 PerlIO_printf( Perl_debug_log, "W%4x| ",
1031 trie->states[ state ].wordnum
1032 );
1033 }
1034 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1035 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1036 if ( tmp ) {
1037 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1038 colwidth,
ddc5bc0f 1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1043 ) ,
1e2e3d02
YO
1044 TRIE_LIST_ITEM(state,charid).forid,
1045 (UV)TRIE_LIST_ITEM(state,charid).newstate
1046 );
1047 if (!(charid % 10))
664e119d
RGS
1048 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1049 (int)((depth * 2) + 14), "");
1e2e3d02 1050 }
ab3bbdeb
YO
1051 }
1052 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1053 }
1054}
1055
1056/*
3dab1dad
YO
1057 Dumps a fully constructed but uncompressed trie in table form.
1058 This is the normal DFA style state transition table, with a few
1059 twists to facilitate compression later.
1060 Used for debugging make_trie().
1061*/
1062STATIC void
55eed653 1063S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1064 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1065 U32 depth)
3dab1dad
YO
1066{
1067 U32 state;
1068 U16 charid;
ab3bbdeb 1069 SV *sv=sv_newmortal();
55eed653 1070 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1071 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1072
1073 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1074
1075 /*
1076 print out the table precompression so that we can do a visual check
1077 that they are identical.
1078 */
1079
1080 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1081
1082 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1083 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1084 if ( tmp ) {
ab3bbdeb
YO
1085 PerlIO_printf( Perl_debug_log, "%*s",
1086 colwidth,
ddc5bc0f 1087 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1088 PL_colors[0], PL_colors[1],
1089 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1090 PERL_PV_ESCAPE_FIRSTCHAR
1091 )
1092 );
3dab1dad
YO
1093 }
1094 }
1095
1096 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1097
1098 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1099 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1100 }
1101
1102 PerlIO_printf( Perl_debug_log, "\n" );
1103
1104 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1105
1106 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1107 (int)depth * 2 + 2,"",
1108 (UV)TRIE_NODENUM( state ) );
1109
1110 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1111 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1112 if (v)
1113 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1114 else
1115 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1116 }
1117 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1118 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1119 } else {
1120 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1121 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1122 }
1123 }
07be1b83 1124}
3dab1dad
YO
1125
1126#endif
1127
2e64971a 1128
786e8c11
YO
1129/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1130 startbranch: the first branch in the whole branch sequence
1131 first : start branch of sequence of branch-exact nodes.
1132 May be the same as startbranch
1133 last : Thing following the last branch.
1134 May be the same as tail.
1135 tail : item following the branch sequence
1136 count : words in the sequence
1137 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1138 depth : indent depth
3dab1dad 1139
786e8c11 1140Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1141
786e8c11
YO
1142A trie is an N'ary tree where the branches are determined by digital
1143decomposition of the key. IE, at the root node you look up the 1st character and
1144follow that branch repeat until you find the end of the branches. Nodes can be
1145marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1146
786e8c11 1147 /he|she|his|hers/
72f13be8 1148
786e8c11
YO
1149would convert into the following structure. Numbers represent states, letters
1150following numbers represent valid transitions on the letter from that state, if
1151the number is in square brackets it represents an accepting state, otherwise it
1152will be in parenthesis.
07be1b83 1153
786e8c11
YO
1154 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1155 | |
1156 | (2)
1157 | |
1158 (1) +-i->(6)-+-s->[7]
1159 |
1160 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1161
786e8c11
YO
1162 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1163
1164This shows that when matching against the string 'hers' we will begin at state 1
1165read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1166then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1167is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1168single traverse. We store a mapping from accepting to state to which word was
1169matched, and then when we have multiple possibilities we try to complete the
1170rest of the regex in the order in which they occured in the alternation.
1171
1172The only prior NFA like behaviour that would be changed by the TRIE support is
1173the silent ignoring of duplicate alternations which are of the form:
1174
1175 / (DUPE|DUPE) X? (?{ ... }) Y /x
1176
4b714af6 1177Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1178and without the optimisation. With the optimisations dupes will be silently
486ec47a 1179ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1180the following demonstrates:
1181
1182 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1183
1184which prints out 'word' three times, but
1185
1186 'words'=~/(word|word|word)(?{ print $1 })S/
1187
1188which doesnt print it out at all. This is due to other optimisations kicking in.
1189
1190Example of what happens on a structural level:
1191
486ec47a 1192The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1193
1194 1: CURLYM[1] {1,32767}(18)
1195 5: BRANCH(8)
1196 6: EXACT <ac>(16)
1197 8: BRANCH(11)
1198 9: EXACT <ad>(16)
1199 11: BRANCH(14)
1200 12: EXACT <ab>(16)
1201 16: SUCCEED(0)
1202 17: NOTHING(18)
1203 18: END(0)
1204
1205This would be optimizable with startbranch=5, first=5, last=16, tail=16
1206and should turn into:
1207
1208 1: CURLYM[1] {1,32767}(18)
1209 5: TRIE(16)
1210 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1211 <ac>
1212 <ad>
1213 <ab>
1214 16: SUCCEED(0)
1215 17: NOTHING(18)
1216 18: END(0)
1217
1218Cases where tail != last would be like /(?foo|bar)baz/:
1219
1220 1: BRANCH(4)
1221 2: EXACT <foo>(8)
1222 4: BRANCH(7)
1223 5: EXACT <bar>(8)
1224 7: TAIL(8)
1225 8: EXACT <baz>(10)
1226 10: END(0)
1227
1228which would be optimizable with startbranch=1, first=1, last=7, tail=8
1229and would end up looking like:
1230
1231 1: TRIE(8)
1232 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1233 <foo>
1234 <bar>
1235 7: TAIL(8)
1236 8: EXACT <baz>(10)
1237 10: END(0)
1238
1239 d = uvuni_to_utf8_flags(d, uv, 0);
1240
1241is the recommended Unicode-aware way of saying
1242
1243 *(d++) = uv;
1244*/
1245
1e2e3d02 1246#define TRIE_STORE_REVCHAR \
786e8c11 1247 STMT_START { \
73031816
NC
1248 if (UTF) { \
1249 SV *zlopp = newSV(2); \
88c9ea1e
CB
1250 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1251 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1252 SvCUR_set(zlopp, kapow - flrbbbbb); \
1253 SvPOK_on(zlopp); \
1254 SvUTF8_on(zlopp); \
1255 av_push(revcharmap, zlopp); \
1256 } else { \
6bdeddd2 1257 char ooooff = (char)uvc; \
73031816
NC
1258 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1259 } \
1260 } STMT_END
786e8c11
YO
1261
1262#define TRIE_READ_CHAR STMT_START { \
1263 wordlen++; \
1264 if ( UTF ) { \
1265 if ( folder ) { \
1266 if ( foldlen > 0 ) { \
1267 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1268 foldlen -= len; \
1269 scan += len; \
1270 len = 0; \
1271 } else { \
1272 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1273 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1274 foldlen -= UNISKIP( uvc ); \
1275 scan = foldbuf + UNISKIP( uvc ); \
1276 } \
1277 } else { \
1278 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1279 } \
1280 } else { \
1281 uvc = (U32)*uc; \
1282 len = 1; \
1283 } \
1284} STMT_END
1285
1286
1287
1288#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1289 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1290 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1291 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1292 } \
1293 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1294 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1295 TRIE_LIST_CUR( state )++; \
1296} STMT_END
07be1b83 1297
786e8c11
YO
1298#define TRIE_LIST_NEW(state) STMT_START { \
1299 Newxz( trie->states[ state ].trans.list, \
1300 4, reg_trie_trans_le ); \
1301 TRIE_LIST_CUR( state ) = 1; \
1302 TRIE_LIST_LEN( state ) = 4; \
1303} STMT_END
07be1b83 1304
786e8c11
YO
1305#define TRIE_HANDLE_WORD(state) STMT_START { \
1306 U16 dupe= trie->states[ state ].wordnum; \
1307 regnode * const noper_next = regnext( noper ); \
1308 \
786e8c11
YO
1309 DEBUG_r({ \
1310 /* store the word for dumping */ \
1311 SV* tmp; \
1312 if (OP(noper) != NOTHING) \
740cce10 1313 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1314 else \
740cce10 1315 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1316 av_push( trie_words, tmp ); \
786e8c11
YO
1317 }); \
1318 \
1319 curword++; \
2e64971a
DM
1320 trie->wordinfo[curword].prev = 0; \
1321 trie->wordinfo[curword].len = wordlen; \
1322 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1323 \
1324 if ( noper_next < tail ) { \
1325 if (!trie->jump) \
c944940b 1326 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1327 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1328 if (!jumper) \
1329 jumper = noper_next; \
1330 if (!nextbranch) \
1331 nextbranch= regnext(cur); \
1332 } \
1333 \
1334 if ( dupe ) { \
2e64971a
DM
1335 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1336 /* chain, so that when the bits of chain are later */\
1337 /* linked together, the dups appear in the chain */\
1338 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1339 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1340 } else { \
1341 /* we haven't inserted this word yet. */ \
1342 trie->states[ state ].wordnum = curword; \
1343 } \
1344} STMT_END
07be1b83 1345
3dab1dad 1346
786e8c11
YO
1347#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1348 ( ( base + charid >= ucharcount \
1349 && base + charid < ubound \
1350 && state == trie->trans[ base - ucharcount + charid ].check \
1351 && trie->trans[ base - ucharcount + charid ].next ) \
1352 ? trie->trans[ base - ucharcount + charid ].next \
1353 : ( state==1 ? special : 0 ) \
1354 )
3dab1dad 1355
786e8c11
YO
1356#define MADE_TRIE 1
1357#define MADE_JUMP_TRIE 2
1358#define MADE_EXACT_TRIE 4
3dab1dad 1359
a3621e74 1360STATIC I32
786e8c11 1361S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1362{
27da23d5 1363 dVAR;
a3621e74
YO
1364 /* first pass, loop through and scan words */
1365 reg_trie_data *trie;
55eed653 1366 HV *widecharmap = NULL;
2b8b4781 1367 AV *revcharmap = newAV();
a3621e74 1368 regnode *cur;
9f7f3913 1369 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1370 STRLEN len = 0;
1371 UV uvc = 0;
1372 U16 curword = 0;
1373 U32 next_alloc = 0;
786e8c11
YO
1374 regnode *jumper = NULL;
1375 regnode *nextbranch = NULL;
7f69552c 1376 regnode *convert = NULL;
2e64971a 1377 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1378 /* we just use folder as a flag in utf8 */
1e696034 1379 const U8 * folder = NULL;
a3621e74 1380
2b8b4781
NC
1381#ifdef DEBUGGING
1382 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1383 AV *trie_words = NULL;
1384 /* along with revcharmap, this only used during construction but both are
1385 * useful during debugging so we store them in the struct when debugging.
8e11feef 1386 */
2b8b4781
NC
1387#else
1388 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1389 STRLEN trie_charcount=0;
3dab1dad 1390#endif
2b8b4781 1391 SV *re_trie_maxbuff;
a3621e74 1392 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1393
1394 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1395#ifndef DEBUGGING
1396 PERL_UNUSED_ARG(depth);
1397#endif
a3621e74 1398
1e696034
KW
1399 switch (flags) {
1400 case EXACTFU: folder = PL_fold_latin1; break;
1401 case EXACTF: folder = PL_fold; break;
1402 case EXACTFL: folder = PL_fold_locale; break;
1403 }
1404
c944940b 1405 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1406 trie->refcount = 1;
3dab1dad 1407 trie->startstate = 1;
786e8c11 1408 trie->wordcount = word_count;
f8fc2ecf 1409 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1410 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1411 if (!(UTF && folder))
c944940b 1412 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1413 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1414 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1415
a3621e74 1416 DEBUG_r({
2b8b4781 1417 trie_words = newAV();
a3621e74 1418 });
a3621e74 1419
0111c4fd 1420 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1421 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1422 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1423 }
3dab1dad
YO
1424 DEBUG_OPTIMISE_r({
1425 PerlIO_printf( Perl_debug_log,
786e8c11 1426 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1427 (int)depth * 2 + 2, "",
1428 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1429 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1430 (int)depth);
3dab1dad 1431 });
7f69552c
YO
1432
1433 /* Find the node we are going to overwrite */
1434 if ( first == startbranch && OP( last ) != BRANCH ) {
1435 /* whole branch chain */
1436 convert = first;
1437 } else {
1438 /* branch sub-chain */
1439 convert = NEXTOPER( first );
1440 }
1441
a3621e74
YO
1442 /* -- First loop and Setup --
1443
1444 We first traverse the branches and scan each word to determine if it
1445 contains widechars, and how many unique chars there are, this is
1446 important as we have to build a table with at least as many columns as we
1447 have unique chars.
1448
1449 We use an array of integers to represent the character codes 0..255
38a44b82 1450 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1451 native representation of the character value as the key and IV's for the
1452 coded index.
1453
1454 *TODO* If we keep track of how many times each character is used we can
1455 remap the columns so that the table compression later on is more
3b753521 1456 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1457 middle and the least common are on the outside. IMO this would be better
1458 than a most to least common mapping as theres a decent chance the most
1459 common letter will share a node with the least common, meaning the node
486ec47a 1460 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1461 case is when we have the least common nodes twice.
1462
1463 */
1464
a3621e74 1465 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1466 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1467 const U8 *uc = (U8*)STRING( noper );
a28509cc 1468 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1469 STRLEN foldlen = 0;
1470 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1471 const U8 *scan = (U8*)NULL;
07be1b83 1472 U32 wordlen = 0; /* required init */
02daf0ab
YO
1473 STRLEN chars = 0;
1474 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1475
3dab1dad
YO
1476 if (OP(noper) == NOTHING) {
1477 trie->minlen= 0;
1478 continue;
1479 }
02daf0ab
YO
1480 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1481 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1482 regardless of encoding */
1483
a3621e74 1484 for ( ; uc < e ; uc += len ) {
3dab1dad 1485 TRIE_CHARCOUNT(trie)++;
a3621e74 1486 TRIE_READ_CHAR;
3dab1dad 1487 chars++;
a3621e74
YO
1488 if ( uvc < 256 ) {
1489 if ( !trie->charmap[ uvc ] ) {
1490 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1491 if ( folder )
1492 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1493 TRIE_STORE_REVCHAR;
a3621e74 1494 }
02daf0ab 1495 if ( set_bit ) {
62012aee
KW
1496 /* store the codepoint in the bitmap, and its folded
1497 * equivalent. */
02daf0ab 1498 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1499
1500 /* store the folded codepoint */
1501 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1502
1503 if ( !UTF ) {
1504 /* store first byte of utf8 representation of
acdf4139
KW
1505 variant codepoints */
1506 if (! UNI_IS_INVARIANT(uvc)) {
1507 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1508 }
1509 }
02daf0ab
YO
1510 set_bit = 0; /* We've done our bit :-) */
1511 }
a3621e74
YO
1512 } else {
1513 SV** svpp;
55eed653
NC
1514 if ( !widecharmap )
1515 widecharmap = newHV();
a3621e74 1516
55eed653 1517 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1518
1519 if ( !svpp )
e4584336 1520 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1521
1522 if ( !SvTRUE( *svpp ) ) {
1523 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1524 TRIE_STORE_REVCHAR;
a3621e74
YO
1525 }
1526 }
1527 }
3dab1dad
YO
1528 if( cur == first ) {
1529 trie->minlen=chars;
1530 trie->maxlen=chars;
1531 } else if (chars < trie->minlen) {
1532 trie->minlen=chars;
1533 } else if (chars > trie->maxlen) {
1534 trie->maxlen=chars;
1535 }
1536
a3621e74
YO
1537 } /* end first pass */
1538 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1539 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1540 (int)depth * 2 + 2,"",
55eed653 1541 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1542 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1543 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1544 );
a3621e74
YO
1545
1546 /*
1547 We now know what we are dealing with in terms of unique chars and
1548 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1549 representation using a flat table will take. If it's over a reasonable
1550 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1551 conservative but potentially much slower representation using an array
1552 of lists.
1553
1554 At the end we convert both representations into the same compressed
1555 form that will be used in regexec.c for matching with. The latter
1556 is a form that cannot be used to construct with but has memory
1557 properties similar to the list form and access properties similar
1558 to the table form making it both suitable for fast searches and
1559 small enough that its feasable to store for the duration of a program.
1560
1561 See the comment in the code where the compressed table is produced
1562 inplace from the flat tabe representation for an explanation of how
1563 the compression works.
1564
1565 */
1566
1567
2e64971a
DM
1568 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1569 prev_states[1] = 0;
1570
3dab1dad 1571 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1572 /*
1573 Second Pass -- Array Of Lists Representation
1574
1575 Each state will be represented by a list of charid:state records
1576 (reg_trie_trans_le) the first such element holds the CUR and LEN
1577 points of the allocated array. (See defines above).
1578
1579 We build the initial structure using the lists, and then convert
1580 it into the compressed table form which allows faster lookups
1581 (but cant be modified once converted).
a3621e74
YO
1582 */
1583
a3621e74
YO
1584 STRLEN transcount = 1;
1585
1e2e3d02
YO
1586 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1587 "%*sCompiling trie using list compiler\n",
1588 (int)depth * 2 + 2, ""));
446bd890 1589
c944940b
JH
1590 trie->states = (reg_trie_state *)
1591 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1592 sizeof(reg_trie_state) );
a3621e74
YO
1593 TRIE_LIST_NEW(1);
1594 next_alloc = 2;
1595
1596 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1597
c445ea15
AL
1598 regnode * const noper = NEXTOPER( cur );
1599 U8 *uc = (U8*)STRING( noper );
1600 const U8 * const e = uc + STR_LEN( noper );
1601 U32 state = 1; /* required init */
1602 U16 charid = 0; /* sanity init */
1603 U8 *scan = (U8*)NULL; /* sanity init */
1604 STRLEN foldlen = 0; /* required init */
07be1b83 1605 U32 wordlen = 0; /* required init */
c445ea15
AL
1606 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1607
3dab1dad 1608 if (OP(noper) != NOTHING) {
786e8c11 1609 for ( ; uc < e ; uc += len ) {
c445ea15 1610
786e8c11 1611 TRIE_READ_CHAR;
c445ea15 1612
786e8c11
YO
1613 if ( uvc < 256 ) {
1614 charid = trie->charmap[ uvc ];
c445ea15 1615 } else {
55eed653 1616 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1617 if ( !svpp ) {
1618 charid = 0;
1619 } else {
1620 charid=(U16)SvIV( *svpp );
1621 }
c445ea15 1622 }
786e8c11
YO
1623 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1624 if ( charid ) {
a3621e74 1625
786e8c11
YO
1626 U16 check;
1627 U32 newstate = 0;
a3621e74 1628
786e8c11
YO
1629 charid--;
1630 if ( !trie->states[ state ].trans.list ) {
1631 TRIE_LIST_NEW( state );
c445ea15 1632 }
786e8c11
YO
1633 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1634 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1635 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1636 break;
1637 }
1638 }
1639 if ( ! newstate ) {
1640 newstate = next_alloc++;
2e64971a 1641 prev_states[newstate] = state;
786e8c11
YO
1642 TRIE_LIST_PUSH( state, charid, newstate );
1643 transcount++;
1644 }
1645 state = newstate;
1646 } else {
1647 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1648 }
a28509cc 1649 }
c445ea15 1650 }
3dab1dad 1651 TRIE_HANDLE_WORD(state);
a3621e74
YO
1652
1653 } /* end second pass */
1654
1e2e3d02
YO
1655 /* next alloc is the NEXT state to be allocated */
1656 trie->statecount = next_alloc;
c944940b
JH
1657 trie->states = (reg_trie_state *)
1658 PerlMemShared_realloc( trie->states,
1659 next_alloc
1660 * sizeof(reg_trie_state) );
a3621e74 1661
3dab1dad 1662 /* and now dump it out before we compress it */
2b8b4781
NC
1663 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1664 revcharmap, next_alloc,
1665 depth+1)
1e2e3d02 1666 );
a3621e74 1667
c944940b
JH
1668 trie->trans = (reg_trie_trans *)
1669 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1670 {
1671 U32 state;
a3621e74
YO
1672 U32 tp = 0;
1673 U32 zp = 0;
1674
1675
1676 for( state=1 ; state < next_alloc ; state ++ ) {
1677 U32 base=0;
1678
1679 /*
1680 DEBUG_TRIE_COMPILE_MORE_r(
1681 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1682 );
1683 */
1684
1685 if (trie->states[state].trans.list) {
1686 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1687 U16 maxid=minid;
a28509cc 1688 U16 idx;
a3621e74
YO
1689
1690 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1691 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1692 if ( forid < minid ) {
1693 minid=forid;
1694 } else if ( forid > maxid ) {
1695 maxid=forid;
1696 }
a3621e74
YO
1697 }
1698 if ( transcount < tp + maxid - minid + 1) {
1699 transcount *= 2;
c944940b
JH
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_realloc( trie->trans,
446bd890
NC
1702 transcount
1703 * sizeof(reg_trie_trans) );
a3621e74
YO
1704 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1705 }
1706 base = trie->uniquecharcount + tp - minid;
1707 if ( maxid == minid ) {
1708 U32 set = 0;
1709 for ( ; zp < tp ; zp++ ) {
1710 if ( ! trie->trans[ zp ].next ) {
1711 base = trie->uniquecharcount + zp - minid;
1712 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1713 trie->trans[ zp ].check = state;
1714 set = 1;
1715 break;
1716 }
1717 }
1718 if ( !set ) {
1719 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1720 trie->trans[ tp ].check = state;
1721 tp++;
1722 zp = tp;
1723 }
1724 } else {
1725 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1726 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1727 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1728 trie->trans[ tid ].check = state;
1729 }
1730 tp += ( maxid - minid + 1 );
1731 }
1732 Safefree(trie->states[ state ].trans.list);
1733 }
1734 /*
1735 DEBUG_TRIE_COMPILE_MORE_r(
1736 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1737 );
1738 */
1739 trie->states[ state ].trans.base=base;
1740 }
cc601c31 1741 trie->lasttrans = tp + 1;
a3621e74
YO
1742 }
1743 } else {
1744 /*
1745 Second Pass -- Flat Table Representation.
1746
1747 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1748 We know that we will need Charcount+1 trans at most to store the data
1749 (one row per char at worst case) So we preallocate both structures
1750 assuming worst case.
1751
1752 We then construct the trie using only the .next slots of the entry
1753 structs.
1754
3b753521 1755 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1756 make compression both faster and easier by keeping track of how many non
1757 zero fields are in the node.
1758
1759 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1760 transition.
1761
1762 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1763 number representing the first entry of the node, and state as a
1764 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1765 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1766 are 2 entrys per node. eg:
1767
1768 A B A B
1769 1. 2 4 1. 3 7
1770 2. 0 3 3. 0 5
1771 3. 0 0 5. 0 0
1772 4. 0 0 7. 0 0
1773
1774 The table is internally in the right hand, idx form. However as we also
1775 have to deal with the states array which is indexed by nodenum we have to
1776 use TRIE_NODENUM() to convert.
1777
1778 */
1e2e3d02
YO
1779 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1780 "%*sCompiling trie using table compiler\n",
1781 (int)depth * 2 + 2, ""));
3dab1dad 1782
c944940b
JH
1783 trie->trans = (reg_trie_trans *)
1784 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1785 * trie->uniquecharcount + 1,
1786 sizeof(reg_trie_trans) );
1787 trie->states = (reg_trie_state *)
1788 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1789 sizeof(reg_trie_state) );
a3621e74
YO
1790 next_alloc = trie->uniquecharcount + 1;
1791
3dab1dad 1792
a3621e74
YO
1793 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1794
c445ea15 1795 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1796 const U8 *uc = (U8*)STRING( noper );
1797 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1798
1799 U32 state = 1; /* required init */
1800
1801 U16 charid = 0; /* sanity init */
1802 U32 accept_state = 0; /* sanity init */
1803 U8 *scan = (U8*)NULL; /* sanity init */
1804
1805 STRLEN foldlen = 0; /* required init */
07be1b83 1806 U32 wordlen = 0; /* required init */
a3621e74
YO
1807 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1808
3dab1dad 1809 if ( OP(noper) != NOTHING ) {
786e8c11 1810 for ( ; uc < e ; uc += len ) {
a3621e74 1811
786e8c11 1812 TRIE_READ_CHAR;
a3621e74 1813
786e8c11
YO
1814 if ( uvc < 256 ) {
1815 charid = trie->charmap[ uvc ];
1816 } else {
55eed653 1817 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1818 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1819 }
786e8c11
YO
1820 if ( charid ) {
1821 charid--;
1822 if ( !trie->trans[ state + charid ].next ) {
1823 trie->trans[ state + charid ].next = next_alloc;
1824 trie->trans[ state ].check++;
2e64971a
DM
1825 prev_states[TRIE_NODENUM(next_alloc)]
1826 = TRIE_NODENUM(state);
786e8c11
YO
1827 next_alloc += trie->uniquecharcount;
1828 }
1829 state = trie->trans[ state + charid ].next;
1830 } else {
1831 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1832 }
1833 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1834 }
a3621e74 1835 }
3dab1dad
YO
1836 accept_state = TRIE_NODENUM( state );
1837 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1838
1839 } /* end second pass */
1840
3dab1dad 1841 /* and now dump it out before we compress it */
2b8b4781
NC
1842 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1843 revcharmap,
1844 next_alloc, depth+1));
a3621e74 1845
a3621e74
YO
1846 {
1847 /*
1848 * Inplace compress the table.*
1849
1850 For sparse data sets the table constructed by the trie algorithm will
1851 be mostly 0/FAIL transitions or to put it another way mostly empty.
1852 (Note that leaf nodes will not contain any transitions.)
1853
1854 This algorithm compresses the tables by eliminating most such
1855 transitions, at the cost of a modest bit of extra work during lookup:
1856
1857 - Each states[] entry contains a .base field which indicates the
1858 index in the state[] array wheres its transition data is stored.
1859
3b753521 1860 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1861
1862 - If .base is nonzero then charid is added to it to find an entry in
1863 the trans array.
1864
1865 -If trans[states[state].base+charid].check!=state then the
1866 transition is taken to be a 0/Fail transition. Thus if there are fail
1867 transitions at the front of the node then the .base offset will point
1868 somewhere inside the previous nodes data (or maybe even into a node
1869 even earlier), but the .check field determines if the transition is
1870 valid.
1871
786e8c11 1872 XXX - wrong maybe?
a3621e74 1873 The following process inplace converts the table to the compressed
3b753521 1874 table: We first do not compress the root node 1,and mark all its
a3621e74 1875 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1876 allows us to do a DFA construction from the compressed table later,
1877 and ensures that any .base pointers we calculate later are greater
1878 than 0.
a3621e74
YO
1879
1880 - We set 'pos' to indicate the first entry of the second node.
1881
1882 - We then iterate over the columns of the node, finding the first and
1883 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1884 and set the .check pointers accordingly, and advance pos
1885 appropriately and repreat for the next node. Note that when we copy
1886 the next pointers we have to convert them from the original
1887 NODEIDX form to NODENUM form as the former is not valid post
1888 compression.
1889
1890 - If a node has no transitions used we mark its base as 0 and do not
1891 advance the pos pointer.
1892
1893 - If a node only has one transition we use a second pointer into the
1894 structure to fill in allocated fail transitions from other states.
1895 This pointer is independent of the main pointer and scans forward
1896 looking for null transitions that are allocated to a state. When it
1897 finds one it writes the single transition into the "hole". If the
786e8c11 1898 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1899
1900 - Once compressed we can Renew/realloc the structures to release the
1901 excess space.
1902
1903 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1904 specifically Fig 3.47 and the associated pseudocode.
1905
1906 demq
1907 */
a3b680e6 1908 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1909 U32 state, charid;
a3621e74 1910 U32 pos = 0, zp=0;
1e2e3d02 1911 trie->statecount = laststate;
a3621e74
YO
1912
1913 for ( state = 1 ; state < laststate ; state++ ) {
1914 U8 flag = 0;
a28509cc
AL
1915 const U32 stateidx = TRIE_NODEIDX( state );
1916 const U32 o_used = trie->trans[ stateidx ].check;
1917 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1918 trie->trans[ stateidx ].check = 0;
1919
1920 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1921 if ( flag || trie->trans[ stateidx + charid ].next ) {
1922 if ( trie->trans[ stateidx + charid ].next ) {
1923 if (o_used == 1) {
1924 for ( ; zp < pos ; zp++ ) {
1925 if ( ! trie->trans[ zp ].next ) {
1926 break;
1927 }
1928 }
1929 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1930 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1931 trie->trans[ zp ].check = state;
1932 if ( ++zp > pos ) pos = zp;
1933 break;
1934 }
1935 used--;
1936 }
1937 if ( !flag ) {
1938 flag = 1;
1939 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1940 }
1941 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1942 trie->trans[ pos ].check = state;
1943 pos++;
1944 }
1945 }
1946 }
cc601c31 1947 trie->lasttrans = pos + 1;
c944940b
JH
1948 trie->states = (reg_trie_state *)
1949 PerlMemShared_realloc( trie->states, laststate
1950 * sizeof(reg_trie_state) );
a3621e74 1951 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1952 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1953 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1954 (int)depth * 2 + 2,"",
1955 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1956 (IV)next_alloc,
1957 (IV)pos,
a3621e74
YO
1958 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1959 );
1960
1961 } /* end table compress */
1962 }
1e2e3d02
YO
1963 DEBUG_TRIE_COMPILE_MORE_r(
1964 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1965 (int)depth * 2 + 2, "",
1966 (UV)trie->statecount,
1967 (UV)trie->lasttrans)
1968 );
cc601c31 1969 /* resize the trans array to remove unused space */
c944940b
JH
1970 trie->trans = (reg_trie_trans *)
1971 PerlMemShared_realloc( trie->trans, trie->lasttrans
1972 * sizeof(reg_trie_trans) );
a3621e74 1973
3b753521 1974 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1975 U8 nodetype =(U8)(flags & 0xFF);
1976 char *str=NULL;
786e8c11 1977
07be1b83 1978#ifdef DEBUGGING
e62cc96a 1979 regnode *optimize = NULL;
7122b237
YO
1980#ifdef RE_TRACK_PATTERN_OFFSETS
1981
b57a0404
JH
1982 U32 mjd_offset = 0;
1983 U32 mjd_nodelen = 0;
7122b237
YO
1984#endif /* RE_TRACK_PATTERN_OFFSETS */
1985#endif /* DEBUGGING */
a3621e74 1986 /*
3dab1dad
YO
1987 This means we convert either the first branch or the first Exact,
1988 depending on whether the thing following (in 'last') is a branch
1989 or not and whther first is the startbranch (ie is it a sub part of
1990 the alternation or is it the whole thing.)
3b753521 1991 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1992 the whole branch sequence, including the first.
a3621e74 1993 */
3dab1dad 1994 /* Find the node we are going to overwrite */
7f69552c 1995 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1996 /* branch sub-chain */
3dab1dad 1997 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1998#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1999 DEBUG_r({
2000 mjd_offset= Node_Offset((convert));
2001 mjd_nodelen= Node_Length((convert));
2002 });
7122b237 2003#endif
7f69552c 2004 /* whole branch chain */
7122b237
YO
2005 }
2006#ifdef RE_TRACK_PATTERN_OFFSETS
2007 else {
7f69552c
YO
2008 DEBUG_r({
2009 const regnode *nop = NEXTOPER( convert );
2010 mjd_offset= Node_Offset((nop));
2011 mjd_nodelen= Node_Length((nop));
2012 });
07be1b83
YO
2013 }
2014 DEBUG_OPTIMISE_r(
2015 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2016 (int)depth * 2 + 2, "",
786e8c11 2017 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2018 );
7122b237 2019#endif
3dab1dad
YO
2020 /* But first we check to see if there is a common prefix we can
2021 split out as an EXACT and put in front of the TRIE node. */
2022 trie->startstate= 1;
55eed653 2023 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2024 U32 state;
1e2e3d02 2025 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2026 U32 ofs = 0;
8e11feef
RGS
2027 I32 idx = -1;
2028 U32 count = 0;
2029 const U32 base = trie->states[ state ].trans.base;
a3621e74 2030
3dab1dad 2031 if ( trie->states[state].wordnum )
8e11feef 2032 count = 1;
a3621e74 2033
8e11feef 2034 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2035 if ( ( base + ofs >= trie->uniquecharcount ) &&
2036 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2037 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2038 {
3dab1dad 2039 if ( ++count > 1 ) {
2b8b4781 2040 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2041 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2042 if ( state == 1 ) break;
3dab1dad
YO
2043 if ( count == 2 ) {
2044 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2045 DEBUG_OPTIMISE_r(
8e11feef
RGS
2046 PerlIO_printf(Perl_debug_log,
2047 "%*sNew Start State=%"UVuf" Class: [",
2048 (int)depth * 2 + 2, "",
786e8c11 2049 (UV)state));
be8e71aa 2050 if (idx >= 0) {
2b8b4781 2051 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2052 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2053
3dab1dad 2054 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2055 if ( folder )
2056 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2057 DEBUG_OPTIMISE_r(
f1f66076 2058 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2059 );
8e11feef
RGS
2060 }
2061 }
2062 TRIE_BITMAP_SET(trie,*ch);
2063 if ( folder )
2064 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2065 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2066 }
2067 idx = ofs;
2068 }
3dab1dad
YO
2069 }
2070 if ( count == 1 ) {
2b8b4781 2071 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2072 STRLEN len;
2073 char *ch = SvPV( *tmp, len );
de734bd5
A
2074 DEBUG_OPTIMISE_r({
2075 SV *sv=sv_newmortal();
8e11feef
RGS
2076 PerlIO_printf( Perl_debug_log,
2077 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2078 (int)depth * 2 + 2, "",
de734bd5
A
2079 (UV)state, (UV)idx,
2080 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2081 PL_colors[0], PL_colors[1],
2082 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2083 PERL_PV_ESCAPE_FIRSTCHAR
2084 )
2085 );
2086 });
3dab1dad
YO
2087 if ( state==1 ) {
2088 OP( convert ) = nodetype;
2089 str=STRING(convert);
2090 STR_LEN(convert)=0;
2091 }
c490c714
YO
2092 STR_LEN(convert) += len;
2093 while (len--)
de734bd5 2094 *str++ = *ch++;
8e11feef 2095 } else {
f9049ba1 2096#ifdef DEBUGGING
8e11feef
RGS
2097 if (state>1)
2098 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2099#endif
8e11feef
RGS
2100 break;
2101 }
2102 }
2e64971a 2103 trie->prefixlen = (state-1);
3dab1dad 2104 if (str) {
8e11feef 2105 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2106 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2107 trie->startstate = state;
07be1b83
YO
2108 trie->minlen -= (state - 1);
2109 trie->maxlen -= (state - 1);
33809eae
JH
2110#ifdef DEBUGGING
2111 /* At least the UNICOS C compiler choked on this
2112 * being argument to DEBUG_r(), so let's just have
2113 * it right here. */
2114 if (
2115#ifdef PERL_EXT_RE_BUILD
2116 1
2117#else
2118 DEBUG_r_TEST
2119#endif
2120 ) {
2121 regnode *fix = convert;
2122 U32 word = trie->wordcount;
2123 mjd_nodelen++;
2124 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2125 while( ++fix < n ) {
2126 Set_Node_Offset_Length(fix, 0, 0);
2127 }
2128 while (word--) {
2129 SV ** const tmp = av_fetch( trie_words, word, 0 );
2130 if (tmp) {
2131 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2132 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2133 else
2134 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2135 }
2136 }
2137 }
2138#endif
8e11feef
RGS
2139 if (trie->maxlen) {
2140 convert = n;
2141 } else {
3dab1dad 2142 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2143 DEBUG_r(optimize= n);
3dab1dad
YO
2144 }
2145 }
2146 }
a5ca303d
YO
2147 if (!jumper)
2148 jumper = last;
3dab1dad 2149 if ( trie->maxlen ) {
8e11feef
RGS
2150 NEXT_OFF( convert ) = (U16)(tail - convert);
2151 ARG_SET( convert, data_slot );
786e8c11
YO
2152 /* Store the offset to the first unabsorbed branch in
2153 jump[0], which is otherwise unused by the jump logic.
2154 We use this when dumping a trie and during optimisation. */
2155 if (trie->jump)
7f69552c 2156 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2157
6c48061a
YO
2158 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2159 * and there is a bitmap
2160 * and the first "jump target" node we found leaves enough room
2161 * then convert the TRIE node into a TRIEC node, with the bitmap
2162 * embedded inline in the opcode - this is hypothetically faster.
2163 */
2164 if ( !trie->states[trie->startstate].wordnum
2165 && trie->bitmap
2166 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2167 {
2168 OP( convert ) = TRIEC;
2169 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2170 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2171 trie->bitmap= NULL;
2172 } else
2173 OP( convert ) = TRIE;
a3621e74 2174
3dab1dad
YO
2175 /* store the type in the flags */
2176 convert->flags = nodetype;
a5ca303d
YO
2177 DEBUG_r({
2178 optimize = convert
2179 + NODE_STEP_REGNODE
2180 + regarglen[ OP( convert ) ];
2181 });
2182 /* XXX We really should free up the resource in trie now,
2183 as we won't use them - (which resources?) dmq */
3dab1dad 2184 }
a3621e74 2185 /* needed for dumping*/
e62cc96a 2186 DEBUG_r(if (optimize) {
07be1b83 2187 regnode *opt = convert;
bcdf7404 2188
e62cc96a 2189 while ( ++opt < optimize) {
07be1b83
YO
2190 Set_Node_Offset_Length(opt,0,0);
2191 }
786e8c11
YO
2192 /*
2193 Try to clean up some of the debris left after the
2194 optimisation.
a3621e74 2195 */
786e8c11 2196 while( optimize < jumper ) {
07be1b83 2197 mjd_nodelen += Node_Length((optimize));
a3621e74 2198 OP( optimize ) = OPTIMIZED;
07be1b83 2199 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2200 optimize++;
2201 }
07be1b83 2202 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2203 });
2204 } /* end node insert */
2e64971a
DM
2205
2206 /* Finish populating the prev field of the wordinfo array. Walk back
2207 * from each accept state until we find another accept state, and if
2208 * so, point the first word's .prev field at the second word. If the
2209 * second already has a .prev field set, stop now. This will be the
2210 * case either if we've already processed that word's accept state,
3b753521
FN
2211 * or that state had multiple words, and the overspill words were
2212 * already linked up earlier.
2e64971a
DM
2213 */
2214 {
2215 U16 word;
2216 U32 state;
2217 U16 prev;
2218
2219 for (word=1; word <= trie->wordcount; word++) {
2220 prev = 0;
2221 if (trie->wordinfo[word].prev)
2222 continue;
2223 state = trie->wordinfo[word].accept;
2224 while (state) {
2225 state = prev_states[state];
2226 if (!state)
2227 break;
2228 prev = trie->states[state].wordnum;
2229 if (prev)
2230 break;
2231 }
2232 trie->wordinfo[word].prev = prev;
2233 }
2234 Safefree(prev_states);
2235 }
2236
2237
2238 /* and now dump out the compressed format */
2239 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2240
55eed653 2241 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2242#ifdef DEBUGGING
2243 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2244 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2245#else
2246 SvREFCNT_dec(revcharmap);
07be1b83 2247#endif
786e8c11
YO
2248 return trie->jump
2249 ? MADE_JUMP_TRIE
2250 : trie->startstate>1
2251 ? MADE_EXACT_TRIE
2252 : MADE_TRIE;
2253}
2254
2255STATIC void
2256S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2257{
3b753521 2258/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2259
2260 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2261 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2262 ISBN 0-201-10088-6
2263
2264 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2265 suffix of the current state's 'word' that is also a proper prefix of another word in our
2266 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2267 the DFA not to have to restart after its tried and failed a word at a given point, it
2268 simply continues as though it had been matching the other word in the first place.
2269 Consider
2270 'abcdgu'=~/abcdefg|cdgu/
2271 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2272 fail, which would bring us to the state representing 'd' in the second word where we would
2273 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2274 */
2275 /* add a fail transition */
3251b653
NC
2276 const U32 trie_offset = ARG(source);
2277 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2278 U32 *q;
2279 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2280 const U32 numstates = trie->statecount;
786e8c11
YO
2281 const U32 ubound = trie->lasttrans + ucharcount;
2282 U32 q_read = 0;
2283 U32 q_write = 0;
2284 U32 charid;
2285 U32 base = trie->states[ 1 ].trans.base;
2286 U32 *fail;
2287 reg_ac_data *aho;
2288 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2289 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2290
2291 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2292#ifndef DEBUGGING
2293 PERL_UNUSED_ARG(depth);
2294#endif
2295
2296
2297 ARG_SET( stclass, data_slot );
c944940b 2298 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2299 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2300 aho->trie=trie_offset;
446bd890
NC
2301 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2302 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2303 Newxz( q, numstates, U32);
c944940b 2304 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2305 aho->refcount = 1;
2306 fail = aho->fail;
2307 /* initialize fail[0..1] to be 1 so that we always have
2308 a valid final fail state */
2309 fail[ 0 ] = fail[ 1 ] = 1;
2310
2311 for ( charid = 0; charid < ucharcount ; charid++ ) {
2312 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2313 if ( newstate ) {
2314 q[ q_write ] = newstate;
2315 /* set to point at the root */
2316 fail[ q[ q_write++ ] ]=1;
2317 }
2318 }
2319 while ( q_read < q_write) {
2320 const U32 cur = q[ q_read++ % numstates ];
2321 base = trie->states[ cur ].trans.base;
2322
2323 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2324 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2325 if (ch_state) {
2326 U32 fail_state = cur;
2327 U32 fail_base;
2328 do {
2329 fail_state = fail[ fail_state ];
2330 fail_base = aho->states[ fail_state ].trans.base;
2331 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2332
2333 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2334 fail[ ch_state ] = fail_state;
2335 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2336 {
2337 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2338 }
2339 q[ q_write++ % numstates] = ch_state;
2340 }
2341 }
2342 }
2343 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2344 when we fail in state 1, this allows us to use the
2345 charclass scan to find a valid start char. This is based on the principle
2346 that theres a good chance the string being searched contains lots of stuff
2347 that cant be a start char.
2348 */
2349 fail[ 0 ] = fail[ 1 ] = 0;
2350 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2351 PerlIO_printf(Perl_debug_log,
2352 "%*sStclass Failtable (%"UVuf" states): 0",
2353 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2354 );
786e8c11
YO
2355 for( q_read=1; q_read<numstates; q_read++ ) {
2356 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2357 }
2358 PerlIO_printf(Perl_debug_log, "\n");
2359 });
2360 Safefree(q);
2361 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2362}
2363
786e8c11 2364
a3621e74 2365/*
5d1c421c
JH
2366 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2367 * These need to be revisited when a newer toolchain becomes available.
2368 */
2369#if defined(__sparc64__) && defined(__GNUC__)
2370# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2371# undef SPARC64_GCC_WORKAROUND
2372# define SPARC64_GCC_WORKAROUND 1
2373# endif
2374#endif
2375
07be1b83 2376#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2377 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2378 SV * const mysv=sv_newmortal(); \
2379 regnode *Next = regnext(scan); \
2380 regprop(RExC_rx, mysv, scan); \
7f69552c 2381 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2382 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2383 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2384 }});
07be1b83 2385
1de06328
YO
2386
2387
2388
2389
07be1b83
YO
2390#define JOIN_EXACT(scan,min,flags) \
2391 if (PL_regkind[OP(scan)] == EXACT) \
2392 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2393
be8e71aa 2394STATIC U32
07be1b83
YO
2395S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2396 /* Merge several consecutive EXACTish nodes into one. */
2397 regnode *n = regnext(scan);
2398 U32 stringok = 1;
2399 regnode *next = scan + NODE_SZ_STR(scan);
2400 U32 merged = 0;
2401 U32 stopnow = 0;
2402#ifdef DEBUGGING
2403 regnode *stop = scan;
72f13be8 2404 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2405#else
d47053eb
RGS
2406 PERL_UNUSED_ARG(depth);
2407#endif
7918f24d
NC
2408
2409 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2410#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2411 PERL_UNUSED_ARG(flags);
2412 PERL_UNUSED_ARG(val);
07be1b83 2413#endif
07be1b83
YO
2414 DEBUG_PEEP("join",scan,depth);
2415
2416 /* Skip NOTHING, merge EXACT*. */
2417 while (n &&
2418 ( PL_regkind[OP(n)] == NOTHING ||
2419 (stringok && (OP(n) == OP(scan))))
2420 && NEXT_OFF(n)
2421 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2422
2423 if (OP(n) == TAIL || n > next)
2424 stringok = 0;
2425 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2426 DEBUG_PEEP("skip:",n,depth);
2427 NEXT_OFF(scan) += NEXT_OFF(n);
2428 next = n + NODE_STEP_REGNODE;
2429#ifdef DEBUGGING
2430 if (stringok)
2431 stop = n;
2432#endif
2433 n = regnext(n);
2434 }
2435 else if (stringok) {
786e8c11 2436 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2437 regnode * const nnext = regnext(n);
2438
2439 DEBUG_PEEP("merg",n,depth);
2440
2441 merged++;
2442 if (oldl + STR_LEN(n) > U8_MAX)
2443 break;
2444 NEXT_OFF(scan) += NEXT_OFF(n);
2445 STR_LEN(scan) += STR_LEN(n);
2446 next = n + NODE_SZ_STR(n);
2447 /* Now we can overwrite *n : */
2448 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2449#ifdef DEBUGGING
2450 stop = next - 1;
2451#endif
2452 n = nnext;
2453 if (stopnow) break;
2454 }
2455
d47053eb
RGS
2456#ifdef EXPERIMENTAL_INPLACESCAN
2457 if (flags && !NEXT_OFF(n)) {
2458 DEBUG_PEEP("atch", val, depth);
2459 if (reg_off_by_arg[OP(n)]) {
2460 ARG_SET(n, val - n);
2461 }
2462 else {
2463 NEXT_OFF(n) = val - n;
2464 }
2465 stopnow = 1;
2466 }
07be1b83
YO
2467#endif
2468 }
ced7f090
KW
2469#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2470#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2471#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2472#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2473
2474 if (UTF
2475 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2476 && ( STR_LEN(scan) >= 6 ) )
2477 {
07be1b83
YO
2478 /*
2479 Two problematic code points in Unicode casefolding of EXACT nodes:
2480
2481 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2482 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2483
2484 which casefold to
2485
2486 Unicode UTF-8
2487
2488 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2489 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2490
2491 This means that in case-insensitive matching (or "loose matching",
2492 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2493 length of the above casefolded versions) can match a target string
2494 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2495 This would rather mess up the minimum length computation.
2496
2497 What we'll do is to look for the tail four bytes, and then peek
2498 at the preceding two bytes to see whether we need to decrease
2499 the minimum length by four (six minus two).
2500
2501 Thanks to the design of UTF-8, there cannot be false matches:
2502 A sequence of valid UTF-8 bytes cannot be a subsequence of
2503 another valid sequence of UTF-8 bytes.
2504
2505 */
2506 char * const s0 = STRING(scan), *s, *t;
2507 char * const s1 = s0 + STR_LEN(scan) - 1;
2508 char * const s2 = s1 - 4;
e294cc5d
JH
2509#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2510 const char t0[] = "\xaf\x49\xaf\x42";
2511#else
07be1b83 2512 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2513#endif
07be1b83
YO
2514 const char * const t1 = t0 + 3;
2515
2516 for (s = s0 + 2;
2517 s < s2 && (t = ninstr(s, s1, t0, t1));
2518 s = t + 4) {
e294cc5d
JH
2519#ifdef EBCDIC
2520 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2521 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2522#else
07be1b83
YO
2523 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2524 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2525#endif
07be1b83
YO
2526 *min -= 4;
2527 }
2528 }
2529
2530#ifdef DEBUGGING
2531 /* Allow dumping */
2532 n = scan + NODE_SZ_STR(scan);
2533 while (n <= stop) {
2534 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2535 OP(n) = OPTIMIZED;
2536 NEXT_OFF(n) = 0;
2537 }
2538 n++;
2539 }
2540#endif
2541 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2542 return stopnow;
2543}
2544
486ec47a 2545/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2546 Finds fixed substrings. */
2547
a0288114 2548/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2549 to the position after last scanned or to NULL. */
2550
40d049e4
YO
2551#define INIT_AND_WITHP \
2552 assert(!and_withp); \
2553 Newx(and_withp,1,struct regnode_charclass_class); \
2554 SAVEFREEPV(and_withp)
07be1b83 2555
b515a41d 2556/* this is a chain of data about sub patterns we are processing that
486ec47a 2557 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2558 we can simulate recursion without losing state. */
2559struct scan_frame;
2560typedef struct scan_frame {
2561 regnode *last; /* last node to process in this frame */
2562 regnode *next; /* next node to process when last is reached */
2563 struct scan_frame *prev; /*previous frame*/
2564 I32 stop; /* what stopparen do we use */
2565} scan_frame;
2566
304ee84b
YO
2567
2568#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2569
e1d1eefb
YO
2570#define CASE_SYNST_FNC(nAmE) \
2571case nAmE: \
2572 if (flags & SCF_DO_STCLASS_AND) { \
2573 for (value = 0; value < 256; value++) \
2574 if (!is_ ## nAmE ## _cp(value)) \
2575 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2576 } \
2577 else { \
2578 for (value = 0; value < 256; value++) \
2579 if (is_ ## nAmE ## _cp(value)) \
2580 ANYOF_BITMAP_SET(data->start_class, value); \
2581 } \
2582 break; \
2583case N ## nAmE: \
2584 if (flags & SCF_DO_STCLASS_AND) { \
2585 for (value = 0; value < 256; value++) \
2586 if (is_ ## nAmE ## _cp(value)) \
2587 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2588 } \
2589 else { \
2590 for (value = 0; value < 256; value++) \
2591 if (!is_ ## nAmE ## _cp(value)) \
2592 ANYOF_BITMAP_SET(data->start_class, value); \
2593 } \
2594 break
2595
2596
2597
76e3520e 2598STATIC I32
40d049e4 2599S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2600 I32 *minlenp, I32 *deltap,
40d049e4
YO
2601 regnode *last,
2602 scan_data_t *data,
2603 I32 stopparen,
2604 U8* recursed,
2605 struct regnode_charclass_class *and_withp,
2606 U32 flags, U32 depth)
c277df42
IZ
2607 /* scanp: Start here (read-write). */
2608 /* deltap: Write maxlen-minlen here. */
2609 /* last: Stop before this one. */
40d049e4
YO
2610 /* data: string data about the pattern */
2611 /* stopparen: treat close N as END */
2612 /* recursed: which subroutines have we recursed into */
2613 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2614{
97aff369 2615 dVAR;
c277df42
IZ
2616 I32 min = 0, pars = 0, code;
2617 regnode *scan = *scanp, *next;
2618 I32 delta = 0;
2619 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2620 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2621 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2622 scan_data_t data_fake;
a3621e74 2623 SV *re_trie_maxbuff = NULL;
786e8c11 2624 regnode *first_non_open = scan;
e2e6a0f1 2625 I32 stopmin = I32_MAX;
8aa23a47 2626 scan_frame *frame = NULL;
a3621e74 2627 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2628
7918f24d
NC
2629 PERL_ARGS_ASSERT_STUDY_CHUNK;
2630
13a24bad 2631#ifdef DEBUGGING
40d049e4 2632 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2633#endif
40d049e4 2634
786e8c11 2635 if ( depth == 0 ) {
40d049e4 2636 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2637 first_non_open=regnext(first_non_open);
2638 }
2639
b81d288d 2640
8aa23a47
YO
2641 fake_study_recurse:
2642 while ( scan && OP(scan) != END && scan < last ){
2643 /* Peephole optimizer: */
304ee84b 2644 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2645 DEBUG_PEEP("Peep",scan,depth);
2646 JOIN_EXACT(scan,&min,0);
2647
2648 /* Follow the next-chain of the current node and optimize
2649 away all the NOTHINGs from it. */
2650 if (OP(scan) != CURLYX) {
2651 const int max = (reg_off_by_arg[OP(scan)]
2652 ? I32_MAX
2653 /* I32 may be smaller than U16 on CRAYs! */
2654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2656 int noff;
2657 regnode *n = scan;
2658
2659 /* Skip NOTHING and LONGJMP. */
2660 while ((n = regnext(n))
2661 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2663 && off + noff < max)
2664 off += noff;
2665 if (reg_off_by_arg[OP(scan)])
2666 ARG(scan) = off;
2667 else
2668 NEXT_OFF(scan) = off;
2669 }
a3621e74 2670
c277df42 2671
8aa23a47
YO
2672
2673 /* The principal pseudo-switch. Cannot be a switch, since we
2674 look into several different things. */
2675 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2676 || OP(scan) == IFTHEN) {
2677 next = regnext(scan);
2678 code = OP(scan);
2679 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2680
2681 if (OP(next) == code || code == IFTHEN) {
2682 /* NOTE - There is similar code to this block below for handling
2683 TRIE nodes on a re-study. If you change stuff here check there
2684 too. */
2685 I32 max1 = 0, min1 = I32_MAX, num = 0;
2686 struct regnode_charclass_class accum;
2687 regnode * const startbranch=scan;
2688
2689 if (flags & SCF_DO_SUBSTR)
304ee84b 2690 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2691 if (flags & SCF_DO_STCLASS)
2692 cl_init_zero(pRExC_state, &accum);
2693
2694 while (OP(scan) == code) {
2695 I32 deltanext, minnext, f = 0, fake;
2696 struct regnode_charclass_class this_class;
2697
2698 num++;
2699 data_fake.flags = 0;
2700 if (data) {
2701 data_fake.whilem_c = data->whilem_c;
2702 data_fake.last_closep = data->last_closep;
2703 }
2704 else
2705 data_fake.last_closep = &fake;
58e23c8d
YO
2706
2707 data_fake.pos_delta = delta;
8aa23a47
YO
2708 next = regnext(scan);
2709 scan = NEXTOPER(scan);
2710 if (code != BRANCH)
c277df42 2711 scan = NEXTOPER(scan);
8aa23a47
YO
2712 if (flags & SCF_DO_STCLASS) {
2713 cl_init(pRExC_state, &this_class);
2714 data_fake.start_class = &this_class;
2715 f = SCF_DO_STCLASS_AND;
58e23c8d 2716 }
8aa23a47
YO
2717 if (flags & SCF_WHILEM_VISITED_POS)
2718 f |= SCF_WHILEM_VISITED_POS;
2719
2720 /* we suppose the run is continuous, last=next...*/
2721 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2722 next, &data_fake,
2723 stopparen, recursed, NULL, f,depth+1);
2724 if (min1 > minnext)
2725 min1 = minnext;
2726 if (max1 < minnext + deltanext)
2727 max1 = minnext + deltanext;
2728 if (deltanext == I32_MAX)
2729 is_inf = is_inf_internal = 1;
2730 scan = next;
2731 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2732 pars++;
2733 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2734 if ( stopmin > minnext)
2735 stopmin = min + min1;
2736 flags &= ~SCF_DO_SUBSTR;
2737 if (data)
2738 data->flags |= SCF_SEEN_ACCEPT;
2739 }
2740 if (data) {
2741 if (data_fake.flags & SF_HAS_EVAL)
2742 data->flags |= SF_HAS_EVAL;
2743 data->whilem_c = data_fake.whilem_c;
3dab1dad 2744 }
8aa23a47
YO
2745 if (flags & SCF_DO_STCLASS)
2746 cl_or(pRExC_state, &accum, &this_class);
2747 }
2748 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2749 min1 = 0;
2750 if (flags & SCF_DO_SUBSTR) {
2751 data->pos_min += min1;
2752 data->pos_delta += max1 - min1;
2753 if (max1 != min1 || is_inf)
2754 data->longest = &(data->longest_float);
2755 }
2756 min += min1;
2757 delta += max1 - min1;
2758 if (flags & SCF_DO_STCLASS_OR) {
2759 cl_or(pRExC_state, data->start_class, &accum);
2760 if (min1) {
2761 cl_and(data->start_class, and_withp);
2762 flags &= ~SCF_DO_STCLASS;
653099ff 2763 }
8aa23a47
YO
2764 }
2765 else if (flags & SCF_DO_STCLASS_AND) {
2766 if (min1) {
2767 cl_and(data->start_class, &accum);
2768 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2769 }
8aa23a47
YO
2770 else {
2771 /* Switch to OR mode: cache the old value of
2772 * data->start_class */
2773 INIT_AND_WITHP;
2774 StructCopy(data->start_class, and_withp,
2775 struct regnode_charclass_class);
2776 flags &= ~SCF_DO_STCLASS_AND;
2777 StructCopy(&accum, data->start_class,
2778 struct regnode_charclass_class);
2779 flags |= SCF_DO_STCLASS_OR;
2780 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2781 }
8aa23a47 2782 }
a3621e74 2783
8aa23a47
YO
2784 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2785 /* demq.
a3621e74 2786
8aa23a47
YO
2787 Assuming this was/is a branch we are dealing with: 'scan' now
2788 points at the item that follows the branch sequence, whatever
2789 it is. We now start at the beginning of the sequence and look
2790 for subsequences of
a3621e74 2791
8aa23a47
YO
2792 BRANCH->EXACT=>x1
2793 BRANCH->EXACT=>x2
2794 tail
a3621e74 2795
8aa23a47 2796 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2797
486ec47a 2798 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2799 element into a trie and then add the subsequent branch exact
2800 strings to the trie.
a3621e74 2801
8aa23a47 2802 We have two cases
a3621e74 2803
3b753521 2804 1. patterns where the whole set of branches can be converted.
a3621e74 2805
8aa23a47 2806 2. patterns where only a subset can be converted.
a3621e74 2807
8aa23a47
YO
2808 In case 1 we can replace the whole set with a single regop
2809 for the trie. In case 2 we need to keep the start and end
3b753521 2810 branches so
a3621e74 2811
8aa23a47
YO
2812 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2813 becomes BRANCH TRIE; BRANCH X;
786e8c11 2814
8aa23a47
YO
2815 There is an additional case, that being where there is a
2816 common prefix, which gets split out into an EXACT like node
2817 preceding the TRIE node.
a3621e74 2818
8aa23a47
YO
2819 If x(1..n)==tail then we can do a simple trie, if not we make
2820 a "jump" trie, such that when we match the appropriate word
486ec47a 2821 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2822 a nested if into a case structure of sorts.
b515a41d 2823
8aa23a47
YO
2824 */
2825
2826 int made=0;
2827 if (!re_trie_maxbuff) {
2828 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2829 if (!SvIOK(re_trie_maxbuff))
2830 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2831 }
2832 if ( SvIV(re_trie_maxbuff)>=0 ) {
2833 regnode *cur;
2834 regnode *first = (regnode *)NULL;
2835 regnode *last = (regnode *)NULL;
2836 regnode *tail = scan;
2837 U8 optype = 0;
2838 U32 count=0;
a3621e74
YO
2839
2840#ifdef DEBUGGING
8aa23a47 2841 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2842#endif
8aa23a47
YO
2843 /* var tail is used because there may be a TAIL
2844 regop in the way. Ie, the exacts will point to the
2845 thing following the TAIL, but the last branch will
2846 point at the TAIL. So we advance tail. If we
2847 have nested (?:) we may have to move through several
2848 tails.
2849 */
2850
2851 while ( OP( tail ) == TAIL ) {
2852 /* this is the TAIL generated by (?:) */
2853 tail = regnext( tail );
2854 }
a3621e74 2855
8aa23a47
YO
2856
2857 DEBUG_OPTIMISE_r({
2858 regprop(RExC_rx, mysv, tail );
2859 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2860 (int)depth * 2 + 2, "",
2861 "Looking for TRIE'able sequences. Tail node is: ",
2862 SvPV_nolen_const( mysv )
2863 );
2864 });
2865
2866 /*
2867
2868 step through the branches, cur represents each
2869 branch, noper is the first thing to be matched
2870 as part of that branch and noper_next is the
2871 regnext() of that node. if noper is an EXACT
2872 and noper_next is the same as scan (our current
2873 position in the regex) then the EXACT branch is
2874 a possible optimization target. Once we have
486ec47a 2875 two or more consecutive such branches we can
8aa23a47
YO
2876 create a trie of the EXACT's contents and stich
2877 it in place. If the sequence represents all of
2878 the branches we eliminate the whole thing and
2879 replace it with a single TRIE. If it is a
2880 subsequence then we need to stitch it in. This
2881 means the first branch has to remain, and needs
2882 to be repointed at the item on the branch chain
2883 following the last branch optimized. This could
2884 be either a BRANCH, in which case the
2885 subsequence is internal, or it could be the
2886 item following the branch sequence in which
2887 case the subsequence is at the end.
2888
2889 */
2890
2891 /* dont use tail as the end marker for this traverse */
2892 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2893 regnode * const noper = NEXTOPER( cur );
b515a41d 2894#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2895 regnode * const noper_next = regnext( noper );
b515a41d
YO
2896#endif
2897
8aa23a47
YO
2898 DEBUG_OPTIMISE_r({
2899 regprop(RExC_rx, mysv, cur);
2900 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2901 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2902
2903 regprop(RExC_rx, mysv, noper);
2904 PerlIO_printf( Perl_debug_log, " -> %s",
2905 SvPV_nolen_const(mysv));
2906
2907 if ( noper_next ) {
2908 regprop(RExC_rx, mysv, noper_next );
2909 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2910 SvPV_nolen_const(mysv));
2911 }
2912 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2913 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2914 });
2915 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2916 : PL_regkind[ OP( noper ) ] == EXACT )
2917 || OP(noper) == NOTHING )
786e8c11 2918#ifdef NOJUMPTRIE
8aa23a47 2919 && noper_next == tail
786e8c11 2920#endif
8aa23a47
YO
2921 && count < U16_MAX)
2922 {
2923 count++;
2924 if ( !first || optype == NOTHING ) {
2925 if (!first) first = cur;
2926 optype = OP( noper );
2927 } else {
2928 last = cur;
2929 }
2930 } else {
a0a388a1 2931/*
0abd0d78
YO
2932 Currently we do not believe that the trie logic can
2933 handle case insensitive matching properly when the
2934 pattern is not unicode (thus forcing unicode semantics).
2935
2936 If/when this is fixed the following define can be swapped
2937 in below to fully enable trie logic.
2938
a0a388a1 2939#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2940
2941*/
2942#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2943
a0a388a1 2944 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2945 make_trie( pRExC_state,
2946 startbranch, first, cur, tail, count,
2947 optype, depth+1 );
2948 }
2949 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2950#ifdef NOJUMPTRIE
8aa23a47 2951 && noper_next == tail
786e8c11 2952#endif
8aa23a47
YO
2953 ){
2954 count = 1;
2955 first = cur;
2956 optype = OP( noper );
2957 } else {
2958 count = 0;
2959 first = NULL;
2960 optype = 0;
2961 }
2962 last = NULL;
2963 }
2964 }
2965 DEBUG_OPTIMISE_r({
2966 regprop(RExC_rx, mysv, cur);
2967 PerlIO_printf( Perl_debug_log,
2968 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2969 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2970
2971 });
a0a388a1
YO
2972
2973 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2974 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2975#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2976 if ( ((made == MADE_EXACT_TRIE &&
2977 startbranch == first)
2978 || ( first_non_open == first )) &&
2979 depth==0 ) {
2980 flags |= SCF_TRIE_RESTUDY;
2981 if ( startbranch == first
2982 && scan == tail )
2983 {
2984 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2985 }
2986 }
3dab1dad 2987#endif
8aa23a47
YO
2988 }
2989 }
2990
2991 } /* do trie */
2992
653099ff 2993 }
8aa23a47
YO
2994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2995 scan = NEXTOPER(NEXTOPER(scan));
2996 } else /* single branch is optimized. */
2997 scan = NEXTOPER(scan);
2998 continue;
2999 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3000 scan_frame *newframe = NULL;
3001 I32 paren;
3002 regnode *start;
3003 regnode *end;
3004
3005 if (OP(scan) != SUSPEND) {
3006 /* set the pointer */
3007 if (OP(scan) == GOSUB) {
3008 paren = ARG(scan);
3009 RExC_recurse[ARG2L(scan)] = scan;
3010 start = RExC_open_parens[paren-1];
3011 end = RExC_close_parens[paren-1];
3012 } else {
3013 paren = 0;
f8fc2ecf 3014 start = RExC_rxi->program + 1;
8aa23a47
YO
3015 end = RExC_opend;
3016 }
3017 if (!recursed) {
3018 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3019 SAVEFREEPV(recursed);
3020 }
3021 if (!PAREN_TEST(recursed,paren+1)) {
3022 PAREN_SET(recursed,paren+1);
3023 Newx(newframe,1,scan_frame);
3024 } else {
3025 if (flags & SCF_DO_SUBSTR) {
304ee84b 3026 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3027 data->longest = &(data->longest_float);
3028 }
3029 is_inf = is_inf_internal = 1;
3030 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3031 cl_anything(pRExC_state, data->start_class);
3032 flags &= ~SCF_DO_STCLASS;
3033 }
3034 } else {
3035 Newx(newframe,1,scan_frame);
3036 paren = stopparen;
3037 start = scan+2;
3038 end = regnext(scan);
3039 }
3040 if (newframe) {
3041 assert(start);
3042 assert(end);
3043 SAVEFREEPV(newframe);
3044 newframe->next = regnext(scan);
3045 newframe->last = last;
3046 newframe->stop = stopparen;
3047 newframe->prev = frame;
3048
3049 frame = newframe;
3050 scan = start;
3051 stopparen = paren;
3052 last = end;
3053
3054 continue;
3055 }
3056 }
3057 else if (OP(scan) == EXACT) {
3058 I32 l = STR_LEN(scan);
3059 UV uc;
3060 if (UTF) {
3061 const U8 * const s = (U8*)STRING(scan);
3062 l = utf8_length(s, s + l);
3063 uc = utf8_to_uvchr(s, NULL);
3064 } else {
3065 uc = *((U8*)STRING(scan));
3066 }
3067 min += l;
3068 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3069 /* The code below prefers earlier match for fixed
3070 offset, later match for variable offset. */
3071 if (data->last_end == -1) { /* Update the start info. */
3072 data->last_start_min = data->pos_min;
3073 data->last_start_max = is_inf
3074 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3075 }
8aa23a47
YO
3076 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3077 if (UTF)
3078 SvUTF8_on(data->last_found);
3079 {
3080 SV * const sv = data->last_found;
3081 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3082 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3083 if (mg && mg->mg_len >= 0)
3084 mg->mg_len += utf8_length((U8*)STRING(scan),
3085 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3086 }
8aa23a47
YO
3087 data->last_end = data->pos_min + l;
3088 data->pos_min += l; /* As in the first entry. */
3089 data->flags &= ~SF_BEFORE_EOL;
3090 }
3091 if (flags & SCF_DO_STCLASS_AND) {
3092 /* Check whether it is compatible with what we know already! */
3093 int compat = 1;
3094
54251c2e 3095
486ec47a 3096 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3097 * in the bitmp and either 1) its bit or its fold is set, or 2)
3098 * it's for a locale. Even if there isn't unicode semantics
3099 * here, at runtime there may be because of matching against a
3100 * utf8 string, so accept a possible false positive for
3101 * latin1-range folds */
8aa23a47
YO
3102 if (uc >= 0x100 ||
3103 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3104 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3105 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3106 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3107 )
3108 compat = 0;
3109 ANYOF_CLASS_ZERO(data->start_class);
3110 ANYOF_BITMAP_ZERO(data->start_class);
3111 if (compat)
3112 ANYOF_BITMAP_SET(data->start_class, uc);
3113 data->start_class->flags &= ~ANYOF_EOS;
3114 if (uc < 0x100)
3115 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3116 }
3117 else if (flags & SCF_DO_STCLASS_OR) {
3118 /* false positive possible if the class is case-folded */
3119 if (uc < 0x100)
3120 ANYOF_BITMAP_SET(data->start_class, uc);
3121 else
3122 data->start_class->flags |= ANYOF_UNICODE_ALL;
3123 data->start_class->flags &= ~ANYOF_EOS;
3124 cl_and(data->start_class, and_withp);
3125 }
3126 flags &= ~SCF_DO_STCLASS;
3127 }
3128 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3129 I32 l = STR_LEN(scan);
3130 UV uc = *((U8*)STRING(scan));
3131
3132 /* Search for fixed substrings supports EXACT only. */
3133 if (flags & SCF_DO_SUBSTR) {
3134 assert(data);
304ee84b 3135 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3136 }
3137 if (UTF) {
3138 const U8 * const s = (U8 *)STRING(scan);
3139 l = utf8_length(s, s + l);
3140 uc = utf8_to_uvchr(s, NULL);
3141 }
3142 min += l;
3143 if (flags & SCF_DO_SUBSTR)
3144 data->pos_min += l;
3145 if (flags & SCF_DO_STCLASS_AND) {
3146 /* Check whether it is compatible with what we know already! */
3147 int compat = 1;
8aa23a47 3148 if (uc >= 0x100 ||
54251c2e
KW
3149 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3150 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3151 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3152 {
8aa23a47 3153 compat = 0;
54251c2e 3154 }
8aa23a47
YO
3155 ANYOF_CLASS_ZERO(data->start_class);
3156 ANYOF_BITMAP_ZERO(data->start_class);
3157 if (compat) {
3158 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3159 data->start_class->flags &= ~ANYOF_EOS;
39065660 3160 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3161 if (OP(scan) == EXACTFL) {
8aa23a47 3162 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3163 }
3164 else {
3165
54251c2e
KW
3166 /* Also set the other member of the fold pair. In case
3167 * that unicode semantics is called for at runtime, use
3168 * the full latin1 fold. (Can't do this for locale,
3169 * because not known until runtime */
3170 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3171 }
653099ff 3172 }
8aa23a47
YO
3173 }
3174 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3175 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3176 /* false positive possible if the class is case-folded.
3177 Assume that the locale settings are the same... */
970c8436 3178 if (uc < 0x100) {
1aa99e6b 3179 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3180 if (OP(scan) != EXACTFL) {
3181
3182 /* And set the other member of the fold pair, but
3183 * can't do that in locale because not known until
3184 * run-time */
3185 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3186 PL_fold_latin1[uc]);
970c8436
KW
3187 }
3188 }
653099ff
GS
3189 data->start_class->flags &= ~ANYOF_EOS;
3190 }
8aa23a47 3191 cl_and(data->start_class, and_withp);
653099ff 3192 }
8aa23a47
YO
3193 flags &= ~SCF_DO_STCLASS;
3194 }
e52fc539 3195 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3196 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3197 I32 f = flags, pos_before = 0;
3198 regnode * const oscan = scan;
3199 struct regnode_charclass_class this_class;
3200 struct regnode_charclass_class *oclass = NULL;
3201 I32 next_is_eval = 0;
3202
3203 switch (PL_regkind[OP(scan)]) {
3204 case WHILEM: /* End of (?:...)* . */
3205 scan = NEXTOPER(scan);
3206 goto finish;
3207 case PLUS:
3208 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3209 next = NEXTOPER(scan);
3210 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3211 mincount = 1;
3212 maxcount = REG_INFTY;
3213 next = regnext(scan);
3214 scan = NEXTOPER(scan);
3215 goto do_curly;
3216 }
3217 }
3218 if (flags & SCF_DO_SUBSTR)
3219 data->pos_min++;
3220 min++;
3221 /* Fall through. */
3222 case STAR:
3223 if (flags & SCF_DO_STCLASS) {
3224 mincount = 0;
3225 maxcount = REG_INFTY;
3226 next = regnext(scan);
3227 scan = NEXTOPER(scan);
3228 goto do_curly;
3229 }
3230 is_inf = is_inf_internal = 1;
3231 scan = regnext(scan);
c277df42 3232 if (flags & SCF_DO_SUBSTR) {
304ee84b 3233 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3234 data->longest = &(data->longest_float);
c277df42 3235 }
8aa23a47
YO
3236 goto optimize_curly_tail;
3237 case CURLY:
3238 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3239 && (scan->flags == stopparen))
3240 {
3241 mincount = 1;
3242 maxcount = 1;
3243 } else {
3244 mincount = ARG1(scan);
3245 maxcount = ARG2(scan);
653099ff 3246 }
8aa23a47
YO
3247 next = regnext(scan);
3248 if (OP(scan) == CURLYX) {
3249 I32 lp = (data ? *(data->last_closep) : 0);
3250 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3251 }
8aa23a47
YO
3252 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3253 next_is_eval = (OP(scan) == EVAL);
3254 do_curly:
3255 if (flags & SCF_DO_SUBSTR) {
304ee84b 3256 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3257 pos_before = data->pos_min;
b45f050a 3258 }
8aa23a47
YO
3259 if (data) {
3260 fl = data->flags;
3261 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3262 if (is_inf)
3263 data->flags |= SF_IS_INF;
3264 }
3265 if (flags & SCF_DO_STCLASS) {
3266 cl_init(pRExC_state, &this_class);
3267 oclass = data->start_class;
3268 data->start_class = &this_class;
3269 f |= SCF_DO_STCLASS_AND;
3270 f &= ~SCF_DO_STCLASS_OR;
3271 }
779bcb7d
NC
3272 /* Exclude from super-linear cache processing any {n,m}
3273 regops for which the combination of input pos and regex
3274 pos is not enough information to determine if a match
3275 will be possible.
3276
3277 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3278 regex pos at the \s*, the prospects for a match depend not
3279 only on the input position but also on how many (bar\s*)
3280 repeats into the {4,8} we are. */
3281 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3282 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3283
8aa23a47
YO
3284 /* This will finish on WHILEM, setting scan, or on NULL: */
3285 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3286 last, data, stopparen, recursed, NULL,
3287 (mincount == 0
3288 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3289
8aa23a47
YO
3290 if (flags & SCF_DO_STCLASS)
3291 data->start_class = oclass;
3292 if (mincount == 0 || minnext == 0) {
3293 if (flags & SCF_DO_STCLASS_OR) {
3294 cl_or(pRExC_state, data->start_class, &this_class);
3295 }
3296 else if (flags & SCF_DO_STCLASS_AND) {
3297 /* Switch to OR mode: cache the old value of
3298 * data->start_class */
3299 INIT_AND_WITHP;
3300 StructCopy(data->start_class, and_withp,
3301 struct regnode_charclass_class);
3302 flags &= ~SCF_DO_STCLASS_AND;
3303 StructCopy(&this_class, data->start_class,
3304 struct regnode_charclass_class);
3305 flags |= SCF_DO_STCLASS_OR;
3306 data->start_class->flags |= ANYOF_EOS;
3307 }
3308 } else { /* Non-zero len */
3309 if (flags & SCF_DO_STCLASS_OR) {
3310 cl_or(pRExC_state, data->start_class, &this_class);
3311 cl_and(data->start_class, and_withp);
3312 }
3313 else if (flags & SCF_DO_STCLASS_AND)
3314 cl_and(data->start_class, &this_class);
3315 flags &= ~SCF_DO_STCLASS;
3316 }
3317 if (!scan) /* It was not CURLYX, but CURLY. */
3318 scan = next;
3319 if ( /* ? quantifier ok, except for (?{ ... }) */
3320 (next_is_eval || !(mincount == 0 && maxcount == 1))
3321 && (minnext == 0) && (deltanext == 0)
3322 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3323 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3324 {
668c081a
NC
3325 ckWARNreg(RExC_parse,
3326 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3327 }
3328
3329 min += minnext * mincount;
3330 is_inf_internal |= ((maxcount == REG_INFTY
3331 && (minnext + deltanext) > 0)
3332 || deltanext == I32_MAX);
3333 is_inf |= is_inf_internal;
3334 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3335
3336 /* Try powerful optimization CURLYX => CURLYN. */
3337 if ( OP(oscan) == CURLYX && data
3338 && data->flags & SF_IN_PAR
3339 && !(data->flags & SF_HAS_EVAL)
3340 && !deltanext && minnext == 1 ) {
3341 /* Try to optimize to CURLYN. */
3342 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3343 regnode * const nxt1 = nxt;
497b47a8 3344#ifdef DEBUGGING
8aa23a47 3345 regnode *nxt2;
497b47a8 3346#endif
c277df42 3347
8aa23a47
YO
3348 /* Skip open. */
3349 nxt = regnext(nxt);
e52fc539 3350 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3351 && !(PL_regkind[OP(nxt)] == EXACT
3352 && STR_LEN(nxt) == 1))
3353 goto nogo;
497b47a8 3354#ifdef DEBUGGING
8aa23a47 3355 nxt2 = nxt;
497b47a8 3356#endif
8aa23a47
YO
3357 nxt = regnext(nxt);
3358 if (OP(nxt) != CLOSE)
3359 goto nogo;
3360 if (RExC_open_parens) {
3361 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3362 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3363 }
3364 /* Now we know that nxt2 is the only contents: */
3365 oscan->flags = (U8)ARG(nxt);
3366 OP(oscan) = CURLYN;
3367 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3368
c277df42 3369#ifdef DEBUGGING
8aa23a47 3370 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3371 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3372 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3373 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3374 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3375 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3376#endif
8aa23a47
YO
3377 }
3378 nogo:
3379
3380 /* Try optimization CURLYX => CURLYM. */
3381 if ( OP(oscan) == CURLYX && data
3382 && !(data->flags & SF_HAS_PAR)
3383 && !(data->flags & SF_HAS_EVAL)
3384 && !deltanext /* atom is fixed width */
3385 && minnext != 0 /* CURLYM can't handle zero width */
3386 ) {
3387 /* XXXX How to optimize if data == 0? */
3388 /* Optimize to a simpler form. */
3389 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3390 regnode *nxt2;
3391
3392 OP(oscan) = CURLYM;
3393 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3394 && (OP(nxt2) != WHILEM))
3395 nxt = nxt2;
3396 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3397 /* Need to optimize away parenths. */
b3c0965f 3398 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3399 /* Set the parenth number. */
3400 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3401
8aa23a47
YO
3402 oscan->flags = (U8)ARG(nxt);
3403 if (RExC_open_parens) {
3404 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3405 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3406 }
8aa23a47
YO
3407 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3408 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3409
c277df42 3410#ifdef DEBUGGING
8aa23a47
YO
3411 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3412 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3413 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3414 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3415#endif
c277df42 3416#if 0
8aa23a47
YO
3417 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3418 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3419 if (nnxt == nxt) {
3420 if (reg_off_by_arg[OP(nxt1)])
3421 ARG_SET(nxt1, nxt2 - nxt1);
3422 else if (nxt2 - nxt1 < U16_MAX)
3423 NEXT_OFF(nxt1) = nxt2 - nxt1;
3424 else
3425 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3426 }
8aa23a47 3427 nxt1 = nnxt;
c277df42 3428 }
5d1c421c 3429#endif
8aa23a47
YO
3430 /* Optimize again: */
3431 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3432 NULL, stopparen, recursed, NULL, 0,depth+1);
3433 }
3434 else
3435 oscan->flags = 0;
3436 }
3437 else if ((OP(oscan) == CURLYX)
3438 && (flags & SCF_WHILEM_VISITED_POS)
3439 /* See the comment on a similar expression above.
3b753521 3440 However, this time it's not a subexpression
8aa23a47
YO
3441 we care about, but the expression itself. */
3442 && (maxcount == REG_INFTY)
3443 && data && ++data->whilem_c < 16) {
3444 /* This stays as CURLYX, we can put the count/of pair. */
3445 /* Find WHILEM (as in regexec.c) */
3446 regnode *nxt = oscan + NEXT_OFF(oscan);
3447
3448 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3449 nxt += ARG(nxt);
3450 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3451 | (RExC_whilem_seen << 4)); /* On WHILEM */
3452 }
3453 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3454 pars++;
3455 if (flags & SCF_DO_SUBSTR) {
3456 SV *last_str = NULL;
3457 int counted = mincount != 0;
a0ed51b3 3458
8aa23a47
YO
3459 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3460#if defined(SPARC64_GCC_WORKAROUND)
3461 I32 b = 0;
3462 STRLEN l = 0;
3463 const char *s = NULL;
3464 I32 old = 0;
b515a41d 3465
8aa23a47
YO
3466 if (pos_before >= data->last_start_min)
3467 b = pos_before;
3468 else
3469 b = data->last_start_min;
b515a41d 3470
8aa23a47
YO
3471 l = 0;
3472 s = SvPV_const(data->last_found, l);
3473 old = b - data->last_start_min;
3474
3475#else
3476 I32 b = pos_before >= data->last_start_min
3477 ? pos_before : data->last_start_min;
3478 STRLEN l;
3479 const char * const s = SvPV_const(data->last_found, l);
3480 I32 old = b - data->last_start_min;
3481#endif
3482
3483 if (UTF)
3484 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3485 l -= old;
3486 /* Get the added string: */
740cce10 3487 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3488 if (deltanext == 0 && pos_before == b) {
3489 /* What was added is a constant string */
3490 if (mincount > 1) {
3491 SvGROW(last_str, (mincount * l) + 1);
3492 repeatcpy(SvPVX(last_str) + l,
3493 SvPVX_const(last_str), l, mincount - 1);
3494 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3495 /* Add additional parts. */
3496 SvCUR_set(data->last_found,
3497 SvCUR(data->last_found) - l);
3498 sv_catsv(data->last_found, last_str);
3499 {
3500 SV * sv = data->last_found;
3501 MAGIC *mg =
3502 SvUTF8(sv) && SvMAGICAL(sv) ?
3503 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3504 if (mg && mg->mg_len >= 0)
bd94e887 3505 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3506 }
8aa23a47 3507 data->last_end += l * (mincount - 1);
b515a41d 3508 }
8aa23a47
YO
3509 } else {
3510 /* start offset must point into the last copy */
3511 data->last_start_min += minnext * (mincount - 1);
3512 data->last_start_max += is_inf ? I32_MAX
3513 : (maxcount - 1) * (minnext + data->pos_delta);
3514 }
c277df42 3515 }
8aa23a47
YO
3516 /* It is counted once already... */
3517 data->pos_min += minnext * (mincount - counted);
3518 data->pos_delta += - counted * deltanext +
3519 (minnext + deltanext) * maxcount - minnext * mincount;
3520 if (mincount != maxcount) {
3521 /* Cannot extend fixed substrings found inside
3522 the group. */
304ee84b 3523 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3524 if (mincount && last_str) {
3525 SV * const sv = data->last_found;
3526 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3527 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3528
3529 if (mg)
3530 mg->mg_len = -1;
3531 sv_setsv(sv, last_str);
3532 data->last_end = data->pos_min;
3533 data->last_start_min =
3534 data->pos_min - CHR_SVLEN(last_str);
3535 data->last_start_max = is_inf
3536 ? I32_MAX
3537 : data->pos_min + data->pos_delta
3538 - CHR_SVLEN(last_str);
3539 }
3540 data->longest = &(data->longest_float);
3541 }
3542 SvREFCNT_dec(last_str);
c277df42 3543 }
8aa23a47
YO
3544 if (data && (fl & SF_HAS_EVAL))
3545 data->flags |= SF_HAS_EVAL;
3546 optimize_curly_tail:
3547 if (OP(oscan) != CURLYX) {
3548 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3549 && NEXT_OFF(next))
3550 NEXT_OFF(oscan) += NEXT_OFF(next);
3551 }
3552 continue;
f56b6394 3553 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3554 if (flags & SCF_DO_SUBSTR) {
304ee84b 3555 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3556 data->longest = &(data->longest_float);
3557 }
3558 is_inf = is_inf_internal = 1;
3559 if (flags & SCF_DO_STCLASS_OR)
3560 cl_anything(pRExC_state, data->start_class);
3561 flags &= ~SCF_DO_STCLASS;
3562 break;
c277df42 3563 }
8aa23a47 3564 }
e1d1eefb
YO
3565 else if (OP(scan) == LNBREAK) {
3566 if (flags & SCF_DO_STCLASS) {
3567 int value = 0;
3568 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 for (value = 0; value < 256; value++)
e64b1bd1 3571 if (!is_VERTWS_cp(value))
b9a59e08
KW
3572 ANYOF_BITMAP_CLEAR(data->start_class, value);
3573 }
3574 else {
e1d1eefb 3575 for (value = 0; value < 256; value++)
e64b1bd1 3576 if (is_VERTWS_cp(value))
b9a59e08
KW
3577 ANYOF_BITMAP_SET(data->start_class, value);
3578 }
e1d1eefb
YO
3579 if (flags & SCF_DO_STCLASS_OR)
3580 cl_and(data->start_class, and_withp);
3581 flags &= ~SCF_DO_STCLASS;
3582 }
3583 min += 1;
f9a79580 3584 delta += 1;
e1d1eefb
YO
3585 if (flags & SCF_DO_SUBSTR) {
3586 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3587 data->pos_min += 1;
f9a79580 3588 data->pos_delta += 1;
e1d1eefb
YO
3589 data->longest = &(data->longest_float);
3590 }
e1d1eefb 3591 }
f9a79580 3592 else if (OP(scan) == FOLDCHAR) {
ced7f090 3593 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3594 flags &= ~SCF_DO_STCLASS;
3595 min += 1;
3596 delta += d;
3597 if (flags & SCF_DO_SUBSTR) {
3598 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3599 data->pos_min += 1;
3600 data->pos_delta += d;
3601 data->longest = &(data->longest_float);
3602 }
3603 }
e52fc539 3604 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3605 int value = 0;
653099ff 3606
8aa23a47 3607 if (flags & SCF_DO_SUBSTR) {
304ee84b 3608 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3609 data->pos_min++;
3610 }
3611 min++;
3612 if (flags & SCF_DO_STCLASS) {
3613 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3614
8aa23a47
YO
3615 /* Some of the logic below assumes that switching
3616 locale on will only add false positives. */
3617 switch (PL_regkind[OP(scan)]) {
3618 case SANY:
3619 default:
3620 do_default:
3621 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3622 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3623 cl_anything(pRExC_state, data->start_class);
3624 break;
3625 case REG_ANY:
3626 if (OP(scan) == SANY)
3627 goto do_default;
3628 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3629 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3630 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3631 cl_anything(pRExC_state, data->start_class);
653099ff 3632 }
8aa23a47
YO
3633 if (flags & SCF_DO_STCLASS_AND || !value)
3634 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3635 break;
3636 case ANYOF:
3637 if (flags & SCF_DO_STCLASS_AND)
3638 cl_and(data->start_class,
3639 (struct regnode_charclass_class*)scan);
653099ff 3640 else
8aa23a47
YO
3641 cl_or(pRExC_state, data->start_class,
3642 (struct regnode_charclass_class*)scan);
3643 break;
3644 case ALNUM:
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3647 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3648 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3649 for (value = 0; value < 256; value++) {
3650 if (!isWORDCHAR_L1(value)) {
3651 ANYOF_BITMAP_CLEAR(data->start_class, value);
3652 }
3653 }
3654 } else {
3655 for (value = 0; value < 256; value++) {
3656 if (!isALNUM(value)) {
3657 ANYOF_BITMAP_CLEAR(data->start_class, value);
3658 }
3659 }
3660 }
8aa23a47 3661 }
653099ff 3662 }
8aa23a47
YO
3663 else {
3664 if (data->start_class->flags & ANYOF_LOCALE)
3665 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
980866de 3666 else if (OP(scan) == ALNUMU) {
a12cf05f
KW
3667 for (value = 0; value < 256; value++) {
3668 if (isWORDCHAR_L1(value)) {
3669 ANYOF_BITMAP_SET(data->start_class, value);
3670 }
3671 }
3672 } else {
3673 for (value = 0; value < 256; value++) {
3674 if (isALNUM(value)) {
3675 ANYOF_BITMAP_SET(data->start_class, value);
3676 }
3677 }
3678 }
8aa23a47
YO
3679 }
3680 break;
8aa23a47
YO
3681 case NALNUM:
3682 if (flags & SCF_DO_STCLASS_AND) {
3683 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3684 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 3685 if (OP(scan) == NALNUMU) {
a12cf05f
KW
3686 for (value = 0; value < 256; value++) {
3687 if (isWORDCHAR_L1(value)) {
3688 ANYOF_BITMAP_CLEAR(data->start_class, value);
3689 }
3690 }
3691 } else {
3692 for (value = 0; value < 256; value++) {
3693 if (isALNUM(value)) {
3694 ANYOF_BITMAP_CLEAR(data->start_class, value);
3695 }
3696 }
3697 }
653099ff
GS
3698 }
3699 }
8aa23a47
YO
3700 else {
3701 if (data->start_class->flags & ANYOF_LOCALE)
3702 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3703 else {
980866de 3704 if (OP(scan) == NALNUMU) {
e9a9c1bc
KW
3705 for (value = 0; value < 256; value++) {
3706 if (! isWORDCHAR_L1(value)) {
3707 ANYOF_BITMAP_SET(data->start_class, value);
3708 }
3709 }
3710 } else {
3711 for (value = 0; value < 256; value++) {
3712 if (! isALNUM(value)) {
3713 ANYOF_BITMAP_SET(data->start_class, value);
3714 }
3715 }
3716 }
8aa23a47 3717 }
653099ff 3718 }
8aa23a47 3719 break;
8aa23a47
YO
3720 case SPACE:
3721 if (flags & SCF_DO_STCLASS_AND) {
3722 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3723 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 3724 if (OP(scan) == SPACEU) {
a12cf05f
KW
3725 for (value = 0; value < 256; value++) {
3726 if (!isSPACE_L1(value)) {
3727 ANYOF_BITMAP_CLEAR(data->start_class, value);
3728 }
3729 }
3730 } else {
3731 for (value = 0; value < 256; value++) {
3732 if (!isSPACE(value)) {
3733 ANYOF_BITMAP_CLEAR(data->start_class, value);
3734 }
3735 }
3736 }
653099ff
GS
3737 }
3738 }
8aa23a47 3739 else {
a12cf05f 3740 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3741 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 3742 }
980866de 3743 else if (OP(scan) == SPACEU) {
a12cf05f
KW
3744 for (value = 0; value < 256; value++) {
3745 if (isSPACE_L1(value)) {
3746 ANYOF_BITMAP_SET(data->start_class, value);
3747 }
3748 }
3749 } else {
3750 for (value = 0; value < 256; value++) {
3751 if (isSPACE(value)) {
3752 ANYOF_BITMAP_SET(data->start_class, value);
3753 }
3754 }
8aa23a47 3755 }
653099ff 3756 }
8aa23a47 3757 break;
8aa23a47
YO
3758 case NSPACE:
3759 if (flags & SCF_DO_STCLASS_AND) {
3760 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3761 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 3762 if (OP(scan) == NSPACEU) {
a12cf05f
KW
3763 for (value = 0; value < 256; value++) {
3764 if (isSPACE_L1(value)) {
3765 ANYOF_BITMAP_CLEAR(data->start_class, value);
3766 }
3767 }
3768 } else {
3769 for (value = 0; value < 256; value++) {
3770 if (isSPACE(value)) {
3771 ANYOF_BITMAP_CLEAR(data->start_class, value);
3772 }
3773 }
3774 }
653099ff 3775 }
8aa23a47
YO
3776 }
3777 else {
3778 if (data->start_class->flags & ANYOF_LOCALE)
3779 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
980866de 3780 else if (OP(scan) == NSPACEU) {
a12cf05f
KW
3781 for (value = 0; value < 256; value++) {
3782 if (!isSPACE_L1(value)) {
3783 ANYOF_BITMAP_SET(data->start_class, value);
3784 }
3785 }
3786 }
3787 else {
3788 for (value = 0; value < 256; value++) {
3789 if (!isSPACE(value)) {
3790 ANYOF_BITMAP_SET(data->start_class, value);
3791 }
3792 }
3793 }
653099ff 3794 }
8aa23a47 3795 break;
8aa23a47
YO
3796 case DIGIT:
3797 if (flags & SCF_DO_STCLASS_AND) {
3798 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3799 for (value = 0; value < 256; value++)
3800 if (!isDIGIT(value))
3801 ANYOF_BITMAP_CLEAR(data->start_class, value);
3802 }
3803 else {
3804 if (data->start_class->flags & ANYOF_LOCALE)
3805 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3806 else {
3807 for (value = 0; value < 256; value++)
3808 if (isDIGIT(value))
b9a59e08 3809 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3810 }
3811 }
3812 break;
3813 case NDIGIT:
3814 if (flags & SCF_DO_STCLASS_AND) {
3815 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3816 for (value = 0; value < 256; value++)
3817 if (isDIGIT(value))
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819 }
3820 else {
3821 if (data->start_class->flags & ANYOF_LOCALE)
3822 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3823 else {
3824 for (value = 0; value < 256; value++)
3825 if (!isDIGIT(value))
b9a59e08 3826 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3827 }
3828 }
8aa23a47 3829 break;
e1d1eefb
YO
3830 CASE_SYNST_FNC(VERTWS);
3831 CASE_SYNST_FNC(HORIZWS);
3832
8aa23a47
YO
3833 }
3834 if (flags & SCF_DO_STCLASS_OR)
3835 cl_and(data->start_class, and_withp);
3836 flags &= ~SCF_DO_STCLASS;
3837 }
3838 }
3839 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3840 data->flags |= (OP(scan) == MEOL
3841 ? SF_BEFORE_MEOL
3842 : SF_BEFORE_SEOL);
3843 }
3844 else if ( PL_regkind[OP(scan)] == BRANCHJ
3845 /* Lookbehind, or need to calculate parens/evals/stclass: */
3846 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3847 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3848 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3849 || OP(scan) == UNLESSM )
3850 {
3851 /* Negative Lookahead/lookbehind
3852 In this case we can't do fixed string optimisation.
3853 */
1de06328 3854
8aa23a47
YO
3855 I32 deltanext, minnext, fake = 0;
3856 regnode *nscan;
3857 struct regnode_charclass_class intrnl;
3858 int f = 0;
1de06328 3859
8aa23a47
YO
3860 data_fake.flags = 0;
3861 if (data) {
3862 data_fake.whilem_c = data->whilem_c;
3863 data_fake.last_closep = data->last_closep;
c277df42 3864 }
8aa23a47
YO
3865 else
3866 data_fake.last_closep = &fake;
58e23c8d 3867 data_fake.pos_delta = delta;
8aa23a47
YO
3868 if ( flags & SCF_DO_STCLASS && !scan->flags
3869 && OP(scan) == IFMATCH ) { /* Lookahead */
3870 cl_init(pRExC_state, &intrnl);
3871 data_fake.start_class = &intrnl;
3872 f |= SCF_DO_STCLASS_AND;
3873 }
3874 if (flags & SCF_WHILEM_VISITED_POS)
3875 f |= SCF_WHILEM_VISITED_POS;
3876 next = regnext(scan);
3877 nscan = NEXTOPER(NEXTOPER(scan));
3878 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3879 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3880 if (scan->flags) {
3881 if (deltanext) {
58e23c8d 3882 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3883 }
3884 else if (minnext > (I32)U8_MAX) {
58e23c8d 3885 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3886 }
3887 scan->flags = (U8)minnext;
3888 }
3889 if (data) {
3890 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3891 pars++;
3892 if (data_fake.flags & SF_HAS_EVAL)
3893 data->flags |= SF_HAS_EVAL;
3894 data->whilem_c = data_fake.whilem_c;
3895 }
3896 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3897 if (flags & SCF_DO_STCLASS_OR) {
3898 /* OR before, AND after: ideally we would recurse with
3899 * data_fake to get the AND applied by study of the
3900 * remainder of the pattern, and then derecurse;
3901 * *** HACK *** for now just treat as "no information".
3902 * See [perl #56690].
3903 */
3904 cl_init(pRExC_state, data->start_class);
3905 } else {
3906 /* AND before and after: combine and continue */
3907 const int was = (data->start_class->flags & ANYOF_EOS);
3908
3909 cl_and(data->start_class, &intrnl);
3910 if (was)
3911 data->start_class->flags |= ANYOF_EOS;
3912 }
8aa23a47 3913 }
cb434fcc 3914 }
8aa23a47
YO
3915#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3916 else {
3917 /* Positive Lookahead/lookbehind
3918 In this case we can do fixed string optimisation,
3919 but we must be careful about it. Note in the case of
3920 lookbehind the positions will be offset by the minimum
3921 length of the pattern, something we won't know about
3922 until after the recurse.
3923 */
3924 I32 deltanext, fake = 0;
3925 regnode *nscan;
3926 struct regnode_charclass_class intrnl;
3927 int f = 0;
3928 /* We use SAVEFREEPV so that when the full compile
3929 is finished perl will clean up the allocated
3b753521 3930 minlens when it's all done. This way we don't
8aa23a47
YO
3931 have to worry about freeing them when we know
3932 they wont be used, which would be a pain.
3933 */
3934 I32 *minnextp;
3935 Newx( minnextp, 1, I32 );
3936 SAVEFREEPV(minnextp);
3937
3938 if (data) {
3939 StructCopy(data, &data_fake, scan_data_t);
3940 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3941 f |= SCF_DO_SUBSTR;
3942 if (scan->flags)
304ee84b 3943 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3944 data_fake.last_found=newSVsv(data->last_found);
3945 }
3946 }
3947 else
3948 data_fake.last_closep = &fake;
3949 data_fake.flags = 0;
58e23c8d 3950 data_fake.pos_delta = delta;
8aa23a47
YO
3951 if (is_inf)
3952 data_fake.flags |= SF_IS_INF;
3953 if ( flags & SCF_DO_STCLASS && !scan->flags
3954 && OP(scan) == IFMATCH ) { /* Lookahead */
3955 cl_init(pRExC_state, &intrnl);
3956 data_fake.start_class = &intrnl;
3957 f |= SCF_DO_STCLASS_AND;
3958 }
3959 if (flags & SCF_WHILEM_VISITED_POS)
3960 f |= SCF_WHILEM_VISITED_POS;
3961 next = regnext(scan);
3962 nscan = NEXTOPER(NEXTOPER(scan));
3963
3964 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3965 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3966 if (scan->flags) {
3967 if (deltanext) {
58e23c8d 3968 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3969 }
3970 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3971 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3972 }
3973 scan->flags = (U8)*minnextp;
3974 }
3975
3976 *minnextp += min;
3977
3978 if (f & SCF_DO_STCLASS_AND) {
3979 const int was = (data->start_class->flags & ANYOF_EOS);
3980
3981 cl_and(data->start_class, &intrnl);
3982 if (was)
3983 data->start_class->flags |= ANYOF_EOS;
3984 }
3985 if (data) {
3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3987 pars++;
3988 if (data_fake.flags & SF_HAS_EVAL)
3989 data->flags |= SF_HAS_EVAL;
3990 data->whilem_c = data_fake.whilem_c;
3991 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3992 if (RExC_rx->minlen<*minnextp)
3993 RExC_rx->minlen=*minnextp;
304ee84b 3994 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3995 SvREFCNT_dec(data_fake.last_found);
3996
3997 if ( data_fake.minlen_fixed != minlenp )
3998 {
3999 data->offset_fixed= data_fake.offset_fixed;
4000 data->minlen_fixed= data_fake.minlen_fixed;
4001 data->lookbehind_fixed+= scan->flags;
4002 }
4003 if ( data_fake.minlen_float != minlenp )
4004 {
4005 data->minlen_float= data_fake.minlen_float;
4006 data->offset_float_min=data_fake.offset_float_min;
4007 data->offset_float_max=data_fake.offset_float_max;
4008 data->lookbehind_float+= scan->flags;
4009 }
4010 }
4011 }
4012
4013
40d049e4 4014 }
8aa23a47
YO
4015#endif
4016 }
4017 else if (OP(scan) == OPEN) {
4018 if (stopparen != (I32)ARG(scan))
4019 pars++;
4020 }
4021 else if (OP(scan) == CLOSE) {
4022 if (stopparen == (I32)ARG(scan)) {
4023 break;
4024 }
4025 if ((I32)ARG(scan) == is_par) {
4026 next = regnext(scan);
b515a41d 4027
8aa23a47
YO
4028 if ( next && (OP(next) != WHILEM) && next < last)
4029 is_par = 0; /* Disable optimization */
40d049e4 4030 }
8aa23a47
YO
4031 if (data)
4032 *(data->last_closep) = ARG(scan);
4033 }
4034 else if (OP(scan) == EVAL) {
c277df42
IZ
4035 if (data)
4036 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4037 }
4038 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4039 if (flags & SCF_DO_SUBSTR) {
304ee84b 4040 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4041 flags &= ~SCF_DO_SUBSTR;
40d049e4 4042 }
8aa23a47
YO
4043 if (data && OP(scan)==ACCEPT) {
4044 data->flags |= SCF_SEEN_ACCEPT;
4045 if (stopmin > min)
4046 stopmin = min;
e2e6a0f1 4047 }
8aa23a47
YO
4048 }
4049 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4050 {
0f5d15d6 4051 if (flags & SCF_DO_SUBSTR) {
304ee84b 4052 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4053 data->longest = &(data->longest_float);
4054 }
4055 is_inf = is_inf_internal = 1;
653099ff 4056 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4057 cl_anything(pRExC_state, data->start_class);
96776eda 4058 flags &= ~SCF_DO_STCLASS;
8aa23a47 4059 }
58e23c8d 4060 else if (OP(scan) == GPOS) {
bbe252da 4061 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4062 !(delta || is_inf || (data && data->pos_delta)))
4063 {
bbe252da
YO
4064 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4065 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4066 if (RExC_rx->gofs < (U32)min)
4067 RExC_rx->gofs = min;
4068 } else {
bbe252da 4069 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4070 RExC_rx->gofs = 0;
4071 }
4072 }
786e8c11 4073#ifdef TRIE_STUDY_OPT
40d049e4 4074#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4075 else if (PL_regkind[OP(scan)] == TRIE) {
4076 /* NOTE - There is similar code to this block above for handling
4077 BRANCH nodes on the initial study. If you change stuff here
4078 check there too. */
4079 regnode *trie_node= scan;
4080 regnode *tail= regnext(scan);
f8fc2ecf 4081 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4082 I32 max1 = 0, min1 = I32_MAX;
4083 struct regnode_charclass_class accum;
4084
4085 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4086 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4087 if (flags & SCF_DO_STCLASS)
4088 cl_init_zero(pRExC_state, &accum);
4089
4090 if (!trie->jump) {
4091 min1= trie->minlen;
4092 max1= trie->maxlen;
4093 } else {
4094 const regnode *nextbranch= NULL;
4095 U32 word;
4096
4097 for ( word=1 ; word <= trie->wordcount ; word++)
4098 {
4099 I32 deltanext=0, minnext=0, f = 0, fake;
4100 struct regnode_charclass_class this_class;
4101
4102 data_fake.flags = 0;
4103 if (data) {
4104 data_fake.whilem_c = data->whilem_c;
4105 data_fake.last_closep = data->last_closep;
4106 }
4107 else
4108 data_fake.last_closep = &fake;
58e23c8d 4109 data_fake.pos_delta = delta;
8aa23a47
YO
4110 if (flags & SCF_DO_STCLASS) {
4111 cl_init(pRExC_state, &this_class);
4112 data_fake.start_class = &this_class;
4113 f = SCF_DO_STCLASS_AND;
4114 }
4115 if (flags & SCF_WHILEM_VISITED_POS)
4116 f |= SCF_WHILEM_VISITED_POS;
4117
4118 if (trie->jump[word]) {
4119 if (!nextbranch)
4120 nextbranch = trie_node + trie->jump[0];
4121 scan= trie_node + trie->jump[word];
4122 /* We go from the jump point to the branch that follows
4123 it. Note this means we need the vestigal unused branches
4124 even though they arent otherwise used.
4125 */
4126 minnext = study_chunk(pRExC_state, &scan, minlenp,
4127 &deltanext, (regnode *)nextbranch, &data_fake,
4128 stopparen, recursed, NULL, f,depth+1);
4129 }
4130 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4131 nextbranch= regnext((regnode*)nextbranch);
4132
4133 if (min1 > (I32)(minnext + trie->minlen))
4134 min1 = minnext + trie->minlen;
4135 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4136 max1 = minnext + deltanext + trie->maxlen;
4137 if (deltanext == I32_MAX)
4138 is_inf = is_inf_internal = 1;
4139
4140 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4141 pars++;
4142 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4143 if ( stopmin > min + min1)
4144 stopmin = min + min1;
4145 flags &= ~SCF_DO_SUBSTR;
4146 if (data)
4147 data->flags |= SCF_SEEN_ACCEPT;
4148 }
4149 if (data) {
4150 if (data_fake.flags & SF_HAS_EVAL)
4151 data->flags |= SF_HAS_EVAL;
4152 data->whilem_c = data_fake.whilem_c;
4153 }
4154 if (flags & SCF_DO_STCLASS)
4155 cl_or(pRExC_state, &accum, &this_class);
4156 }
4157 }
4158 if (flags & SCF_DO_SUBSTR) {
4159 data->pos_min += min1;
4160 data->pos_delta += max1 - min1;
4161 if (max1 != min1 || is_inf)
4162 data->longest = &(data->longest_float);
4163 }
4164 min += min1;
4165 delta += max1 - min1;
4166 if (flags & SCF_DO_STCLASS_OR) {
4167 cl_or(pRExC_state, data->start_class, &accum);
4168 if (min1) {
4169 cl_and(data->start_class, and_withp);
4170 flags &= ~SCF_DO_STCLASS;
4171 }
4172 }
4173 else if (flags & SCF_DO_STCLASS_AND) {
4174 if (min1) {
4175 cl_and(data->start_class, &accum);
4176 flags &= ~SCF_DO_STCLASS;
4177 }
4178 else {
4179 /* Switch to OR mode: cache the old value of
4180 * data->start_class */
4181 INIT_AND_WITHP;
4182 StructCopy(data->start_class, and_withp,
4183 struct regnode_charclass_class);
4184 flags &= ~SCF_DO_STCLASS_AND;
4185 StructCopy(&accum, data->start_class,
4186 struct regnode_charclass_class);
4187 flags |= SCF_DO_STCLASS_OR;
4188 data->start_class->flags |= ANYOF_EOS;
4189 }
4190 }
4191 scan= tail;
4192 continue;
4193 }
786e8c11 4194#else
8aa23a47 4195 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4196 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4197 U8*bang=NULL;
4198
4199 min += trie->minlen;
4200 delta += (trie->maxlen - trie->minlen);
4201 flags &= ~SCF_DO_STCLASS; /* xxx */
4202 if (flags & SCF_DO_SUBSTR) {
304ee84b 4203 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4204 data->pos_min += trie->minlen;
4205 data->pos_delta += (trie->maxlen - trie->minlen);
4206 if (trie->maxlen != trie->minlen)
4207 data->longest = &(data->longest_float);
4208 }
4209 if (trie->jump) /* no more substrings -- for now /grr*/
4210 flags &= ~SCF_DO_SUBSTR;
b515a41d 4211 }
8aa23a47
YO
4212#endif /* old or new */
4213#endif /* TRIE_STUDY_OPT */
e1d1eefb 4214
8aa23a47
YO
4215 /* Else: zero-length, ignore. */
4216 scan = regnext(scan);
4217 }
4218 if (frame) {
4219 last = frame->last;
4220 scan = frame->next;
4221 stopparen = frame->stop;
4222 frame = frame->prev;
4223 goto fake_study_recurse;
c277df42
IZ
4224 }
4225
4226 finish:
8aa23a47 4227 assert(!frame);
304ee84b 4228 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4229
c277df42 4230 *scanp = scan;
aca2d497 4231 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4232 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4233 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4234 if (is_par > (I32)U8_MAX)
c277df42
IZ
4235 is_par = 0;
4236 if (is_par && pars==1 && data) {
4237 data->flags |= SF_IN_PAR;
4238 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4239 }
4240 else if (pars && data) {
c277df42
IZ
4241 data->flags |= SF_HAS_PAR;
4242 data->flags &= ~SF_IN_PAR;
4243 }
653099ff 4244 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4245 cl_and(data->start_class, and_withp);
786e8c11
YO
4246 if (flags & SCF_TRIE_RESTUDY)
4247 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4248
304ee84b 4249 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4250
e2e6a0f1 4251 return min < stopmin ? min : stopmin;
c277df42
IZ
4252}
4253
2eccd3b2
NC
4254STATIC U32
4255S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4256{
4a4e7719
NC
4257 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4258
7918f24d
NC
4259 PERL_ARGS_ASSERT_ADD_DATA;
4260
4a4e7719
NC
4261 Renewc(RExC_rxi->data,
4262 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4263 char, struct reg_data);
4264 if(count)
f8fc2ecf 4265 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4266 else
f8fc2ecf 4267 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4268 RExC_rxi->data->count = count + n;
4269 Copy(s, RExC_rxi->data->what + count, n, U8);
4270 return count;
c277df42
IZ
4271}
4272
f8149455 4273/*XXX: todo make this not included in a non debugging perl */
76234dfb 4274#ifndef PERL_IN_XSUB_RE
d88dccdf 4275void
864dbfa3 4276Perl_reginitcolors(pTHX)
d88dccdf 4277{
97aff369 4278 dVAR;
1df70142 4279 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4280 if (s) {
1df70142
AL
4281 char *t = savepv(s);
4282 int i = 0;
4283 PL_colors[0] = t;
d88dccdf 4284 while (++i < 6) {
1df70142
AL
4285 t = strchr(t, '\t');
4286 if (t) {
4287 *t = '\0';
4288 PL_colors[i] = ++t;
d88dccdf
IZ
4289 }
4290 else
1df70142 4291 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4292 }
4293 } else {
1df70142 4294 int i = 0;
b81d288d 4295 while (i < 6)
06b5626a 4296 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4297 }
4298 PL_colorset = 1;
4299}
76234dfb 4300#endif
8615cb43 4301
07be1b83 4302
786e8c11
YO
4303#ifdef TRIE_STUDY_OPT
4304#define CHECK_RESTUDY_GOTO \
4305 if ( \
4306 (data.flags & SCF_TRIE_RESTUDY) \
4307 && ! restudied++ \
4308 ) goto reStudy
4309#else
4310#define CHECK_RESTUDY_GOTO
4311#endif
f9f4320a 4312
a687059c 4313/*
e50aee73 4314 - pregcomp - compile a regular expression into internal code
a687059c
LW
4315 *
4316 * We can't allocate space until we know how big the compiled form will be,
4317 * but we can't compile it (and thus know how big it is) until we've got a
4318 * place to put the code. So we cheat: we compile it twice, once with code
4319 * generation turned off and size counting turned on, and once "for real".
4320 * This also means that we don't allocate space until we are sure that the
4321 * thing really will compile successfully, and we never have to move the
4322 * code and thus invalidate pointers into it. (Note that it has to be in
4323 * one piece because free() must be able to free it all.) [NB: not true in perl]
4324 *
4325 * Beware that the optimization-preparation code in here knows about some
4326 * of the structure of the compiled regexp. [I'll say.]
4327 */
b9b4dddf
YO
4328
4329
4330
f9f4320a 4331#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4332#define RE_ENGINE_PTR &PL_core_reg_engine
4333#else
f9f4320a
YO
4334extern const struct regexp_engine my_reg_engine;
4335#define RE_ENGINE_PTR &my_reg_engine
4336#endif
6d5c990f
RGS
4337
4338#ifndef PERL_IN_XSUB_RE
3ab4a224 4339REGEXP *
1593ad57 4340Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4341{
97aff369 4342 dVAR;
6d5c990f 4343 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4344
4345 PERL_ARGS_ASSERT_PREGCOMP;
4346
f9f4320a
YO
4347 /* Dispatch a request to compile a regexp to correct
4348 regexp engine. */
f9f4320a
YO
4349 if (table) {
4350 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4351 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4352 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4353 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4354 DEBUG_COMPILE_r({
8d8756e7 4355 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4356 SvIV(*ptr));
4357 });
3ab4a224 4358 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4359 }
b9b4dddf 4360 }
3ab4a224 4361 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4362}
6d5c990f 4363#endif
2a5d9b1d 4364
3ab4a224 4365REGEXP *
29b09c41 4366Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4367{
4368 dVAR;
288b8c02
NC
4369 REGEXP *rx;
4370 struct regexp *r;
f8fc2ecf 4371 register regexp_internal *ri;
3ab4a224 4372 STRLEN plen;
5d51ce98
KW
4373 char *exp;
4374 char* xend;
c277df42 4375 regnode *scan;
a0d0e21e 4376 I32 flags;
a0d0e21e 4377 I32 minlen = 0;
29b09c41 4378 U32 pm_flags;
e7f38d0f
YO
4379
4380 /* these are all flags - maybe they should be turned
4381 * into a single int with different bit masks */
4382 I32 sawlookahead = 0;
a0d0e21e
LW
4383 I32 sawplus = 0;
4384 I32 sawopen = 0;
29b09c41 4385 bool used_setjump = FALSE;
e7f38d0f 4386
bbd61b5f
KW
4387 U8 jump_ret = 0;
4388 dJMPENV;
2c2d71f5 4389 scan_data_t data;
830247a4 4390 RExC_state_t RExC_state;
be8e71aa 4391 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4392#ifdef TRIE_STUDY_OPT
5d51ce98 4393 int restudied;
07be1b83
YO
4394 RExC_state_t copyRExC_state;
4395#endif
2a5d9b1d 4396 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4397
4398 PERL_ARGS_ASSERT_RE_COMPILE;
4399
6d5c990f 4400 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4401
29b09c41 4402 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
e40e74fe 4403 RExC_uni_semantics = 0;
7b597bb8 4404
d6bd454d 4405 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4406 /* Longjmp back to here if have to switch in midstream to utf8 */
4407 if (! RExC_orig_utf8) {
4408 JMPENV_PUSH(jump_ret);
29b09c41 4409 used_setjump = TRUE;
bbd61b5f
KW
4410 }
4411
5d51ce98 4412 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4413 exp = SvPV(pattern, plen);
4414 xend = exp + plen;
4415 /* ignore the utf8ness if the pattern is 0 length */
4416 if (plen == 0) {
4417 RExC_utf8 = RExC_orig_utf8 = 0;
4418 }
4419
5d51ce98
KW
4420 DEBUG_COMPILE_r({
4421 SV *dsv= sv_newmortal();
4422 RE_PV_QUOTED_DECL(s, RExC_utf8,
4423 dsv, exp, plen, 60);
4424 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4425 PL_colors[4],PL_colors[5],s);
4426 });
4427 }
4428 else { /* longjumped back */
bbd61b5f
KW
4429 STRLEN len = plen;
4430
5d51ce98
KW
4431 /* If the cause for the longjmp was other than changing to utf8, pop
4432 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4433 if (jump_ret != UTF8_LONGJMP) {
4434 JMPENV_POP;
4435 JMPENV_JUMP(jump_ret);
4436 }
4437
595598ee
KW
4438 GET_RE_DEBUG_FLAGS;
4439
bbd61b5f
KW
4440 /* It's possible to write a regexp in ascii that represents Unicode
4441 codepoints outside of the byte range, such as via \x{100}. If we
4442 detect such a sequence we have to convert the entire pattern to utf8
4443 and then recompile, as our sizing calculation will have been based
4444 on 1 byte == 1 character, but we will need to use utf8 to encode
4445 at least some part of the pattern, and therefore must convert the whole
4446 thing.
4447 -- dmq */
4448 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4449 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4450 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4451 xend = exp + len;
4452 RExC_orig_utf8 = RExC_utf8 = 1;
4453 SAVEFREEPV(exp);
4454 }
4455
5d51ce98
KW
4456#ifdef TRIE_STUDY_OPT
4457 restudied = 0;
4458#endif
4459
29b09c41 4460 /* Set to use unicode semantics if the pattern is in utf8 and has the
50e91148 4461 * 'depends' charset specified, as it means unicode when utf8 */
29b09c41 4462 pm_flags = orig_pm_flags;
a62b1201
KW
4463
4464 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4465 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
4466 }
4467
02daf0ab 4468 RExC_precomp = exp;
c737faaf 4469 RExC_flags = pm_flags;
830247a4 4470 RExC_sawback = 0;
bbce6d69 4471
830247a4 4472 RExC_seen = 0;
b57e4118 4473 RExC_in_lookbehind = 0;
830247a4
IZ
4474 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4475 RExC_seen_evals = 0;
4476 RExC_extralen = 0;
c277df42 4477
bbce6d69 4478 /* First pass: determine size, legality. */
830247a4 4479 RExC_parse = exp;
fac92740 4480 RExC_start = exp;
830247a4
IZ
4481 RExC_end = xend;
4482 RExC_naughty = 0;
4483 RExC_npar = 1;
e2e6a0f1 4484 RExC_nestroot = 0;
830247a4
IZ
4485 RExC_size = 0L;
4486 RExC_emit = &PL_regdummy;
4487 RExC_whilem_seen = 0;
40d049e4
YO
4488 RExC_open_parens = NULL;
4489 RExC_close_parens = NULL;
4490 RExC_opend = NULL;
81714fb9 4491 RExC_paren_names = NULL;
1f1031fe
YO
4492#ifdef DEBUGGING
4493 RExC_paren_name_list = NULL;
4494#endif
40d049e4
YO
4495 RExC_recurse = NULL;
4496 RExC_recurse_count = 0;
81714fb9 4497
85ddcde9
JH
4498#if 0 /* REGC() is (currently) a NOP at the first pass.
4499 * Clever compilers notice this and complain. --jhi */
830247a4 4500 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4501#endif
3dab1dad
YO
4502 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4503 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4504 RExC_precomp = NULL;
a0d0e21e
LW
4505 return(NULL);
4506 }
bbd61b5f 4507
29b09c41
KW
4508 /* Here, finished first pass. Get rid of any added setjmp */
4509 if (used_setjump) {
bbd61b5f 4510 JMPENV_POP;
02daf0ab 4511 }
e40e74fe 4512
07be1b83 4513 DEBUG_PARSE_r({
81714fb9
YO
4514 PerlIO_printf(Perl_debug_log,
4515 "Required size %"IVdf" nodes\n"
4516 "Starting second pass (creation)\n",
4517 (IV)RExC_size);
07be1b83
YO
4518 RExC_lastnum=0;
4519 RExC_lastparse=NULL;
4520 });
e40e74fe
KW
4521
4522 /* The first pass could have found things that force Unicode semantics */
4523 if ((RExC_utf8 || RExC_uni_semantics)
4524 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4525 {
4526 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4527 }
4528
c277df42
IZ
4529 /* Small enough for pointer-storage convention?
4530 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4531 if (RExC_size >= 0x10000L && RExC_extralen)
4532 RExC_size += RExC_extralen;
c277df42 4533 else
830247a4
IZ
4534 RExC_extralen = 0;
4535 if (RExC_whilem_seen > 15)
4536 RExC_whilem_seen = 15;
a0d0e21e 4537
f9f4320a
YO
4538 /* Allocate space and zero-initialize. Note, the two step process
4539 of zeroing when in debug mode, thus anything assigned has to
4540 happen after that */
d2f13c59 4541 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4542 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4543 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4544 char, regexp_internal);
4545 if ( r == NULL || ri == NULL )
b45f050a 4546 FAIL("Regexp out of space");
0f79a09d
GS
4547#ifdef DEBUGGING
4548 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4549 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4550#else
f8fc2ecf
YO
4551 /* bulk initialize base fields with 0. */
4552 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4553#endif
58e23c8d
YO
4554
4555 /* non-zero initialization begins here */
f8fc2ecf 4556 RXi_SET( r, ri );
f9f4320a 4557 r->engine= RE_ENGINE_PTR;
c737faaf 4558 r->extflags = pm_flags;
bcdf7404 4559 {
f7819f85 4560 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 4561 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
4562
4563 /* The caret is output if there are any defaults: if not all the STD
4564 * flags are set, or if no character set specifier is needed */
4565 bool has_default =
4566 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4567 || ! has_charset);
bcdf7404 4568 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4569 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4570 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4571 const char *fptr = STD_PAT_MODS; /*"msix"*/
4572 char *p;
fb85c044 4573 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4574 * on. If more precision is desired, we could do a population count of
4575 * the flags set. This could be done with a small lookup table, or by
4576 * shifting, masking and adding, or even, when available, assembly
4577 * language for a machine-language population count.
4578 * We never output a minus, as all those are defaults, so are
4579 * covered by the caret */
fb85c044 4580 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 4581 + has_default /* If needs a caret */
a62b1201
KW
4582
4583 /* If needs a character set specifier */
4584 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
4585 + (sizeof(STD_PAT_MODS) - 1)
4586 + (sizeof("(?:)") - 1);
4587
c5ea2ffa 4588 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4589 SvPOK_on(rx);
8f6ae13c 4590 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4591 *p++='('; *p++='?';
9de15fec
KW
4592
4593 /* If a default, cover it using the caret */
c5ea2ffa 4594 if (has_default) {
85508812 4595 *p++= DEFAULT_PAT_MOD;
fb85c044 4596 }
c5ea2ffa 4597 if (has_charset) {
a62b1201
KW
4598 STRLEN len;
4599 const char* const name = get_regex_charset_name(r->extflags, &len);
4600 Copy(name, p, len, char);
4601 p += len;
9de15fec 4602 }
f7819f85
A
4603 if (has_p)
4604 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4605 {
bcdf7404 4606 char ch;
bcdf7404
YO
4607 while((ch = *fptr++)) {
4608 if(reganch & 1)
4609 *p++ = ch;
bcdf7404
YO
4610 reganch >>= 1;
4611 }
bcdf7404
YO
4612 }
4613
28d8d7f4 4614 *p++ = ':';
bb661a58 4615 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4616 assert ((RX_WRAPPED(rx) - p) < 16);
4617 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4618 p += plen;
bcdf7404 4619 if (has_runon)
28d8d7f4
YO
4620 *p++ = '\n';
4621 *p++ = ')';
4622 *p = 0;
fb85c044 4623 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4624 }
4625
bbe252da 4626 r->intflags = 0;
830247a4 4627 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4628
6bda09f9 4629 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4630 Newxz(RExC_open_parens, RExC_npar,regnode *);
4631 SAVEFREEPV(RExC_open_parens);
4632 Newxz(RExC_close_parens,RExC_npar,regnode *);
4633 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4634 }
4635
4636 /* Useful during FAIL. */
7122b237
YO
4637#ifdef RE_TRACK_PATTERN_OFFSETS
4638 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4639 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4640 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4641 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4642 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4643#endif
4644 SetProgLen(ri,RExC_size);
288b8c02 4645 RExC_rx_sv = rx;
830247a4 4646 RExC_rx = r;
f8fc2ecf 4647 RExC_rxi = ri;
bbce6d69 4648
4649 /* Second pass: emit code. */
c737faaf 4650 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4651 RExC_parse = exp;
4652 RExC_end = xend;
4653 RExC_naughty = 0;
4654 RExC_npar = 1;
f8fc2ecf
YO
4655 RExC_emit_start = ri->program;
4656 RExC_emit = ri->program;
3b57cd43
YO
4657 RExC_emit_bound = ri->program + RExC_size + 1;
4658
2cd61cdb 4659 /* Store the count of eval-groups for security checks: */
f8149455 4660 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4661 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4662 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4663 ReREFCNT_dec(rx);
a0d0e21e 4664 return(NULL);
80757612 4665 }
07be1b83
YO
4666 /* XXXX To minimize changes to RE engine we always allocate
4667 3-units-long substrs field. */
4668 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4669 if (RExC_recurse_count) {
4670 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4671 SAVEFREEPV(RExC_recurse);
4672 }
a0d0e21e 4673
07be1b83 4674reStudy:
e7f38d0f 4675 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4676 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4677
07be1b83 4678#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4679 if (!restudied) {
4680 StructCopy(&zero_scan_data, &data, scan_data_t);
4681 copyRExC_state = RExC_state;
4682 } else {
5d458dd8 4683 U32 seen=RExC_seen;
07be1b83 4684 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4685
4686 RExC_state = copyRExC_state;
4687 if (seen & REG_TOP_LEVEL_BRANCHES)
4688 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4689 else
4690 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4691 if (data.last_found) {
07be1b83 4692 SvREFCNT_dec(data.longest_fixed);
07be1b83 4693 SvREFCNT_dec(data.longest_float);
07be1b83 4694 SvREFCNT_dec(data.last_found);
1de06328 4695 }
40d049e4 4696 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4697 }
40d049e4
YO
4698#else
4699 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4700#endif
fc8cd66c 4701
a0d0e21e 4702 /* Dig out information for optimizations. */
f7819f85 4703 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4704 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4705
a0ed51b3 4706 if (UTF)
8f6ae13c 4707 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4708 ri->regstclass = NULL;
830247a4 4709 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4710 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4711 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4712
1de06328
YO
4713 /* testing for BRANCH here tells us whether there is "must appear"
4714 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4715 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4716 I32 fake;
c5254dd6 4717 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4718 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4719 int stclass_flag;
07be1b83 4720 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4721 regnode *first= scan;
4722 regnode *first_next= regnext(first);
639081d6
YO
4723 /*
4724 * Skip introductions and multiplicators >= 1
4725 * so that we can extract the 'meat' of the pattern that must
4726 * match in the large if() sequence following.
4727 * NOTE that EXACT is NOT covered here, as it is normally
4728 * picked up by the optimiser separately.
4729 *
4730 * This is unfortunate as the optimiser isnt handling lookahead
4731 * properly currently.
4732 *
4733 */
a0d0e21e 4734 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4735 /* An OR of *one* alternative - should not happen now. */
5339e136 4736 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4737 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4738 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4739 (OP(first) == PLUS) ||
4740 (OP(first) == MINMOD) ||
653099ff 4741 /* An {n,m} with n>0 */
5339e136
YO
4742 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4743 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4744 {
639081d6
YO
4745 /*
4746 * the only op that could be a regnode is PLUS, all the rest
4747 * will be regnode_1 or regnode_2.
4748 *
4749 */
a0d0e21e
LW
4750 if (OP(first) == PLUS)
4751 sawplus = 1;
4752 else
3dab1dad 4753 first += regarglen[OP(first)];
639081d6
YO
4754
4755 first = NEXTOPER(first);
5339e136 4756 first_next= regnext(first);
a687059c
LW
4757 }
4758
a0d0e21e
LW
4759 /* Starting-point info. */
4760 again:
786e8c11 4761 DEBUG_PEEP("first:",first,0);
07be1b83 4762 /* Ignore EXACT as we deal with it later. */
3dab1dad 4763 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4764 if (OP(first) == EXACT)
6f207bd3 4765 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4766 else
f8fc2ecf 4767 ri->regstclass = first;
b3c9acc1 4768 }
07be1b83 4769#ifdef TRIE_STCLASS
786e8c11 4770 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4771 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4772 {
786e8c11 4773 regnode *trie_op;
07be1b83 4774 /* this can happen only on restudy */
786e8c11 4775 if ( OP(first) == TRIE ) {
c944940b 4776 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4777 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4778 StructCopy(first,trieop,struct regnode_1);
4779 trie_op=(regnode *)trieop;
4780 } else {
c944940b 4781 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4782 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4783 StructCopy(first,trieop,struct regnode_charclass);
4784 trie_op=(regnode *)trieop;
4785 }
1de06328 4786 OP(trie_op)+=2;
786e8c11 4787 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4788 ri->regstclass = trie_op;
07be1b83
YO
4789 }
4790#endif
e52fc539 4791 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4792 ri->regstclass = first;
3dab1dad
YO
4793 else if (PL_regkind[OP(first)] == BOUND ||
4794 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4795 ri->regstclass = first;
3dab1dad 4796 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4797 r->extflags |= (OP(first) == MBOL
4798 ? RXf_ANCH_MBOL
cad2e5aa 4799 : (OP(first) == SBOL
bbe252da
YO
4800 ? RXf_ANCH_SBOL
4801 : RXf_ANCH_BOL));
a0d0e21e 4802 first = NEXTOPER(first);
774d564b 4803 goto again;
4804 }
4805 else if (OP(first) == GPOS) {
bbe252da 4806 r->extflags |= RXf_ANCH_GPOS;
774d564b 4807 first = NEXTOPER(first);
4808 goto again;
a0d0e21e 4809 }
cf2a2b69
YO
4810 else if ((!sawopen || !RExC_sawback) &&
4811 (OP(first) == STAR &&
3dab1dad 4812 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4813 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4814 {
4815 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4816 const int type =
4817 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4818 ? RXf_ANCH_MBOL
4819 : RXf_ANCH_SBOL;
4820 r->extflags |= type;
4821 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4822 first = NEXTOPER(first);
774d564b 4823 goto again;
a0d0e21e 4824 }
e7f38d0f 4825 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4826 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4827 /* x+ must match at the 1st pos of run of x's */
bbe252da 4828 r->intflags |= PREGf_SKIP;
a0d0e21e 4829
c277df42 4830 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4831#ifdef TRIE_STUDY_OPT
81714fb9 4832 DEBUG_PARSE_r(
be8e71aa
YO
4833 if (!restudied)
4834 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4835 (IV)(first - scan + 1))
4836 );
4837#else
81714fb9 4838 DEBUG_PARSE_r(
be8e71aa
YO
4839 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4840 (IV)(first - scan + 1))
4841 );
4842#endif
4843
4844
a0d0e21e
LW
4845 /*
4846 * If there's something expensive in the r.e., find the
4847 * longest literal string that must appear and make it the
4848 * regmust. Resolve ties in favor of later strings, since
4849 * the regstart check works with the beginning of the r.e.
4850 * and avoiding duplication strengthens checking. Not a
4851 * strong reason, but sufficient in the absence of others.
4852 * [Now we resolve ties in favor of the earlier string if
c277df42 4853 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4854 * earlier string may buy us something the later one won't.]
4855 */
de8c5301 4856
396482e1
GA
4857 data.longest_fixed = newSVpvs("");
4858 data.longest_float = newSVpvs("");
4859 data.last_found = newSVpvs("");
c277df42
IZ
4860 data.longest = &(data.longest_fixed);
4861 first = scan;
f8fc2ecf 4862 if (!ri->regstclass) {
830247a4 4863 cl_init(pRExC_state, &ch_class);
653099ff
GS
4864 data.start_class = &ch_class;
4865 stclass_flag = SCF_DO_STCLASS_AND;
4866 } else /* XXXX Check for BOUND? */
4867 stclass_flag = 0;
cb434fcc 4868 data.last_closep = &last_close;
de8c5301 4869
1de06328 4870 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4871 &data, -1, NULL, NULL,
4872 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4873
07be1b83 4874
786e8c11
YO
4875 CHECK_RESTUDY_GOTO;
4876
4877
830247a4 4878 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4879 && data.last_start_min == 0 && data.last_end > 0
830247a4 4880 && !RExC_seen_zerolen
2bf803e2 4881 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4882 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4883 r->extflags |= RXf_CHECK_ALL;
304ee84b 4884 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4885 SvREFCNT_dec(data.last_found);
4886
1de06328
YO
4887 /* Note that code very similar to this but for anchored string
4888 follows immediately below, changes may need to be made to both.
4889 Be careful.
4890 */
a0ed51b3 4891 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4892 if (longest_float_length
c277df42
IZ
4893 || (data.flags & SF_FL_BEFORE_EOL
4894 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4895 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4896 {
1182767e 4897 I32 t,ml;
cf93c79d 4898
1de06328 4899 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4900 && data.offset_fixed == data.offset_float_min
4901 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4902 goto remove_float; /* As in (a)+. */
4903
1de06328
YO
4904 /* copy the information about the longest float from the reg_scan_data
4905 over to the program. */
33b8afdf
JH
4906 if (SvUTF8(data.longest_float)) {
4907 r->float_utf8 = data.longest_float;
c445ea15 4908 r->float_substr = NULL;
33b8afdf
JH
4909 } else {
4910 r->float_substr = data.longest_float;
c445ea15 4911 r->float_utf8 = NULL;
33b8afdf 4912 }
1de06328
YO
4913 /* float_end_shift is how many chars that must be matched that
4914 follow this item. We calculate it ahead of time as once the
4915 lookbehind offset is added in we lose the ability to correctly
4916 calculate it.*/
4917 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4918 : (I32)longest_float_length;
1de06328
YO
4919 r->float_end_shift = ml - data.offset_float_min
4920 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4921 + data.lookbehind_float;
4922 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4923 r->float_max_offset = data.offset_float_max;
1182767e 4924 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4925 r->float_max_offset -= data.lookbehind_float;
4926
cf93c79d
IZ
4927 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4928 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4929 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4930 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4931 }
4932 else {
aca2d497 4933 remove_float:
c445ea15 4934 r->float_substr = r->float_utf8 = NULL;
c277df42 4935 SvREFCNT_dec(data.longest_float);
c5254dd6 4936 longest_float_length = 0;
a0d0e21e 4937 }
c277df42 4938
1de06328
YO
4939 /* Note that code very similar to this but for floating string
4940 is immediately above, changes may need to be made to both.
4941 Be careful.
4942 */
a0ed51b3 4943 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4944 if (longest_fixed_length
c277df42
IZ
4945 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4946 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4947 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4948 {
1182767e 4949 I32 t,ml;
cf93c79d 4950
1de06328
YO
4951 /* copy the information about the longest fixed
4952 from the reg_scan_data over to the program. */
33b8afdf
JH
4953 if (SvUTF8(data.longest_fixed)) {
4954 r->anchored_utf8 = data.longest_fixed;
c445ea15 4955 r->anchored_substr = NULL;
33b8afdf
JH
4956 } else {
4957 r->anchored_substr = data.longest_fixed;
c445ea15 4958 r->anchored_utf8 = NULL;
33b8afdf 4959 }
1de06328
YO
4960 /* fixed_end_shift is how many chars that must be matched that
4961 follow this item. We calculate it ahead of time as once the
4962 lookbehind offset is added in we lose the ability to correctly
4963 calculate it.*/
4964 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4965 : (I32)longest_fixed_length;
1de06328
YO
4966 r->anchored_end_shift = ml - data.offset_fixed
4967 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4968 + data.lookbehind_fixed;
4969 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4970
cf93c79d
IZ
4971 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4972 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4973 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4974 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4975 }
4976 else {
c445ea15 4977 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4978 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4979 longest_fixed_length = 0;
a0d0e21e 4980 }
f8fc2ecf
YO
4981 if (ri->regstclass
4982 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4983 ri->regstclass = NULL;
f4244008
KW
4984
4985 /* If the synthetic start class were to ever be used when EOS is set,
4986 * that bit would have to be cleared, as it is shared with another */
33b8afdf
JH
4987 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4988 && stclass_flag
653099ff 4989 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4990 && !cl_is_anything(data.start_class))
4991 {
2eccd3b2 4992 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4993
f8fc2ecf 4994 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4995 struct regnode_charclass_class);
4996 StructCopy(data.start_class,
f8fc2ecf 4997 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4998 struct regnode_charclass_class);
f8fc2ecf 4999 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5000 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5001 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5002 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5003 PerlIO_printf(Perl_debug_log,
a0288114 5004 "synthetic stclass \"%s\".\n",
3f7c398e 5005 SvPVX_const(sv));});
653099ff 5006 }
c277df42
IZ
5007
5008 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5009 if (longest_fixed_length > longest_float_length) {
1de06328 5010 r->check_end_shift = r->anchored_end_shift;
c277df42 5011 r->check_substr = r->anchored_substr;
33b8afdf 5012 r->check_utf8 = r->anchored_utf8;
c277df42 5013 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5014 if (r->extflags & RXf_ANCH_SINGLE)
5015 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5016 }
5017 else {
1de06328 5018 r->check_end_shift = r->float_end_shift;
c277df42 5019 r->check_substr = r->float_substr;
33b8afdf 5020 r->check_utf8 = r->float_utf8;
1de06328
YO
5021 r->check_offset_min = r->float_min_offset;
5022 r->check_offset_max = r->float_max_offset;
a0d0e21e 5023 }
30382c73
IZ
5024 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5025 This should be changed ASAP! */
bbe252da
YO
5026 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5027 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5028 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5029 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5030 }
1de06328
YO
5031 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5032 if ( (STRLEN)minlen < longest_float_length )
5033 minlen= longest_float_length;
5034 if ( (STRLEN)minlen < longest_fixed_length )
5035 minlen= longest_fixed_length;
5036 */
a0ed51b3
LW
5037 }
5038 else {
c277df42
IZ
5039 /* Several toplevels. Best we can is to set minlen. */
5040 I32 fake;
653099ff 5041 struct regnode_charclass_class ch_class;
cb434fcc 5042 I32 last_close = 0;
c277df42 5043
5d458dd8 5044 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5045
f8fc2ecf 5046 scan = ri->program + 1;
830247a4 5047 cl_init(pRExC_state, &ch_class);
653099ff 5048 data.start_class = &ch_class;
cb434fcc 5049 data.last_closep = &last_close;
07be1b83 5050
de8c5301 5051
1de06328 5052 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5053 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5054
786e8c11 5055 CHECK_RESTUDY_GOTO;
07be1b83 5056
33b8afdf 5057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5058 = r->float_substr = r->float_utf8 = NULL;
f4244008
KW
5059
5060 /* If the synthetic start class were to ever be used when EOS is set,
5061 * that bit would have to be cleared, as it is shared with another */
653099ff 5062 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5063 && !cl_is_anything(data.start_class))
5064 {
2eccd3b2 5065 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5066
f8fc2ecf 5067 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5068 struct regnode_charclass_class);
5069 StructCopy(data.start_class,
f8fc2ecf 5070 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5071 struct regnode_charclass_class);
f8fc2ecf 5072 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5073 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5074 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5075 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5076 PerlIO_printf(Perl_debug_log,
a0288114 5077 "synthetic stclass \"%s\".\n",
3f7c398e 5078 SvPVX_const(sv));});
653099ff 5079 }
a0d0e21e
LW
5080 }
5081
1de06328
YO
5082 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5083 the "real" pattern. */
cf9788e3
RGS
5084 DEBUG_OPTIMISE_r({
5085 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5086 (IV)minlen, (IV)r->minlen);
cf9788e3 5087 });
de8c5301 5088 r->minlenret = minlen;
1de06328
YO
5089 if (r->minlen < minlen)
5090 r->minlen = minlen;
5091
b81d288d 5092 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5093 r->extflags |= RXf_GPOS_SEEN;
830247a4 5094 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5095 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5096 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5097 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5098 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5099 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5100 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5101 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5102 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5103 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5104 if (RExC_paren_names)
85fbaab2 5105 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5106 else
5daac39c 5107 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5108
7bd1e614 5109#ifdef STUPID_PATTERN_CHECKS
5509d87a 5110 if (RX_PRELEN(rx) == 0)
640f820d 5111 r->extflags |= RXf_NULL;
5509d87a 5112 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5113 /* XXX: this should happen BEFORE we compile */
5114 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5115 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5116 r->extflags |= RXf_WHITE;
5509d87a 5117 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5118 r->extflags |= RXf_START_ONLY;
f1b875a0 5119#else
5509d87a 5120 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5121 /* XXX: this should happen BEFORE we compile */
5122 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5123 else {
5124 regnode *first = ri->program + 1;
39aa8307 5125 U8 fop = OP(first);
f6d9469c
DM
5126
5127 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 5128 r->extflags |= RXf_NULL;
f6d9469c 5129 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 5130 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
5131 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5132 && OP(regnext(first)) == END)
7bd1e614
YO
5133 r->extflags |= RXf_WHITE;
5134 }
f1b875a0 5135#endif
1f1031fe
YO
5136#ifdef DEBUGGING
5137 if (RExC_paren_names) {
af534a04 5138 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5139 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5140 } else
1f1031fe 5141#endif
cde0cee5 5142 ri->name_list_idx = 0;
1f1031fe 5143
40d049e4
YO
5144 if (RExC_recurse_count) {
5145 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5146 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5147 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5148 }
5149 }
f0ab9afb 5150 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5151 /* assume we don't need to swap parens around before we match */
5152
be8e71aa
YO
5153 DEBUG_DUMP_r({
5154 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5155 regdump(r);
5156 });
7122b237
YO
5157#ifdef RE_TRACK_PATTERN_OFFSETS
5158 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5159 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5160 U32 i;
5161 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5162 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5163 for (i = 1; i <= len; i++) {
7122b237 5164 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5165 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5166 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5167 }
5168 PerlIO_printf(Perl_debug_log, "\n");
5169 });
7122b237 5170#endif
288b8c02 5171 return rx;
a687059c
LW
5172}
5173
f9f4320a 5174#undef RE_ENGINE_PTR
3dab1dad 5175
93b32b6d 5176
81714fb9 5177SV*
192b9cd1
AB
5178Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5179 const U32 flags)
5180{
7918f24d
NC
5181 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5182
192b9cd1
AB
5183 PERL_UNUSED_ARG(value);
5184
f1b875a0 5185 if (flags & RXapif_FETCH) {
192b9cd1 5186 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5187 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5188 Perl_croak_no_modify(aTHX);
192b9cd1 5189 return NULL;
f1b875a0 5190 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5191 return reg_named_buff_exists(rx, key, flags)
5192 ? &PL_sv_yes
5193 : &PL_sv_no;
f1b875a0 5194 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5195 return reg_named_buff_all(rx, flags);
f1b875a0 5196 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5197 return reg_named_buff_scalar(rx, flags);
5198 } else {
5199 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5200 return NULL;
5201 }
5202}
5203
5204SV*
5205Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5206 const U32 flags)
5207{
7918f24d 5208 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5209 PERL_UNUSED_ARG(lastkey);
5210
f1b875a0 5211 if (flags & RXapif_FIRSTKEY)
192b9cd1 5212 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5213 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5214 return reg_named_buff_nextkey(rx, flags);
5215 else {
5216 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5217 return NULL;
5218 }
5219}
5220
5221SV*
288b8c02
NC
5222Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5223 const U32 flags)
81714fb9 5224{
44a2ac75
YO
5225 AV *retarray = NULL;
5226 SV *ret;
288b8c02 5227 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5228
5229 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5230
f1b875a0 5231 if (flags & RXapif_ALL)
44a2ac75 5232 retarray=newAV();
93b32b6d 5233
5daac39c
NC
5234 if (rx && RXp_PAREN_NAMES(rx)) {
5235 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5236 if (he_str) {
5237 IV i;
5238 SV* sv_dat=HeVAL(he_str);
5239 I32 *nums=(I32*)SvPVX(sv_dat);
5240 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5241 if ((I32)(rx->nparens) >= nums[i]
5242 && rx->offs[nums[i]].start != -1
5243 && rx->offs[nums[i]].end != -1)
93b32b6d 5244 {
49d7dfbc 5245 ret = newSVpvs("");
288b8c02 5246 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5247 if (!retarray)
5248 return ret;
5249 } else {
5250 ret = newSVsv(&PL_sv_undef);
5251 }
ec83ea38 5252 if (retarray)
93b32b6d 5253 av_push(retarray, ret);
81714fb9 5254 }
93b32b6d 5255 if (retarray)
ad64d0ec 5256 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5257 }
5258 }
5259 return NULL;
5260}
5261
5262bool
288b8c02 5263Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5264 const U32 flags)
5265{
288b8c02 5266 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5267
5268 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5269
5daac39c 5270 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5271 if (flags & RXapif_ALL) {
5daac39c 5272 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5273 } else {
288b8c02 5274 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5275 if (sv) {
5276 SvREFCNT_dec(sv);
192b9cd1
AB
5277 return TRUE;
5278 } else {
5279 return FALSE;
5280 }
5281 }
5282 } else {
5283 return FALSE;
5284 }
5285}
5286
5287SV*
288b8c02 5288Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5289{
288b8c02 5290 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5291
5292 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5293
5daac39c
NC
5294 if ( rx && RXp_PAREN_NAMES(rx) ) {
5295 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5296
288b8c02 5297 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5298 } else {
5299 return FALSE;
5300 }
192b9cd1
AB
5301}
5302
5303SV*
288b8c02 5304Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5305{
288b8c02 5306 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5307 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5308
5309 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5310
5daac39c
NC
5311 if (rx && RXp_PAREN_NAMES(rx)) {
5312 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5313 HE *temphe;
5314 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5315 IV i;
5316 IV parno = 0;
5317 SV* sv_dat = HeVAL(temphe);
5318 I32 *nums = (I32*)SvPVX(sv_dat);
5319 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5320 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5321 rx->offs[nums[i]].start != -1 &&
5322 rx->offs[nums[i]].end != -1)
5323 {
5324 parno = nums[i];
5325 break;
5326 }
5327 }
f1b875a0 5328 if (parno || flags & RXapif_ALL) {
a663657d 5329 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5330 }
81714fb9
YO
5331 }
5332 }
44a2ac75
YO
5333 return NULL;
5334}
5335
192b9cd1 5336SV*
288b8c02 5337Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5338{
5339 SV *ret;
5340 AV *av;
5341 I32 length;
288b8c02 5342 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5343
7918f24d
NC
5344 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5345
5daac39c 5346 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5347 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5348 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5349 } else if (flags & RXapif_ONE) {
288b8c02 5350 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5351 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5352 length = av_len(av);
ec83ea38 5353 SvREFCNT_dec(ret);
192b9cd1
AB
5354 return newSViv(length + 1);
5355 } else {
5356 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5357 return NULL;
5358 }
5359 }
5360 return &PL_sv_undef;
5361}
5362
5363SV*
288b8c02 5364Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5365{
288b8c02 5366 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5367 AV *av = newAV();
5368
7918f24d
NC
5369 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5370
5daac39c
NC
5371 if (rx && RXp_PAREN_NAMES(rx)) {
5372 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5373 HE *temphe;
5374 (void)hv_iterinit(hv);
5375 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5376 IV i;
5377 IV parno = 0;
5378 SV* sv_dat = HeVAL(temphe);
5379 I32 *nums = (I32*)SvPVX(sv_dat);
5380 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5381 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5382 rx->offs[nums[i]].start != -1 &&
5383 rx->offs[nums[i]].end != -1)
5384 {
5385 parno = nums[i];
5386 break;
5387 }
5388 }
f1b875a0 5389 if (parno || flags & RXapif_ALL) {
a663657d 5390 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5391 }
5392 }
5393 }
5394
ad64d0ec 5395 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5396}
5397
49d7dfbc 5398void
288b8c02
NC
5399Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5400 SV * const sv)
44a2ac75 5401{
288b8c02 5402 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5403 char *s = NULL;
a9d504c3 5404 I32 i = 0;
44a2ac75 5405 I32 s1, t1;
7918f24d
NC
5406
5407 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5408
cde0cee5
YO
5409 if (!rx->subbeg) {
5410 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5411 return;
cde0cee5
YO
5412 }
5413 else
f1b875a0 5414 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5415 /* $` */
f0ab9afb 5416 i = rx->offs[0].start;
cde0cee5 5417 s = rx->subbeg;
44a2ac75
YO
5418 }
5419 else
f1b875a0 5420 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5421 /* $' */
f0ab9afb
NC
5422 s = rx->subbeg + rx->offs[0].end;
5423 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5424 }
5425 else
5426 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5427 (s1 = rx->offs[paren].start) != -1 &&
5428 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5429 {
5430 /* $& $1 ... */
5431 i = t1 - s1;
5432 s = rx->subbeg + s1;
cde0cee5
YO
5433 } else {
5434 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5435 return;
cde0cee5
YO
5436 }
5437 assert(rx->sublen >= (s - rx->subbeg) + i );
5438 if (i >= 0) {
5439 const int oldtainted = PL_tainted;
5440 TAINT_NOT;
5441 sv_setpvn(sv, s, i);
5442 PL_tainted = oldtainted;
5443 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5444 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5445 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5446 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5447 {
5448 SvUTF8_on(sv);
5449 }
5450 else
5451 SvUTF8_off(sv);
5452 if (PL_tainting) {
07bc277f 5453 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5454 if (SvTYPE(sv) >= SVt_PVMG) {
5455 MAGIC* const mg = SvMAGIC(sv);
5456 MAGIC* mgt;
5457 PL_tainted = 1;
5458 SvMAGIC_set(sv, mg->mg_moremagic);
5459 SvTAINT(sv);
5460 if ((mgt = SvMAGIC(sv))) {
5461 mg->mg_moremagic = mgt;
5462 SvMAGIC_set(sv, mg);
44a2ac75 5463 }
cde0cee5
YO
5464 } else {
5465 PL_tainted = 1;
5466 SvTAINT(sv);
5467 }
5468 } else
5469 SvTAINTED_off(sv);
44a2ac75 5470 }
81714fb9 5471 } else {
44a2ac75 5472 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5473 return;
81714fb9
YO
5474 }
5475}
93b32b6d 5476
2fdbfb4d
AB
5477void
5478Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5479 SV const * const value)
5480{
7918f24d
NC
5481 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5482
2fdbfb4d
AB
5483 PERL_UNUSED_ARG(rx);
5484 PERL_UNUSED_ARG(paren);
5485 PERL_UNUSED_ARG(value);
5486
5487 if (!PL_localizing)
6ad8f254 5488 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5489}
5490
5491I32
288b8c02 5492Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5493 const I32 paren)
5494{
288b8c02 5495 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5496 I32 i;
5497 I32 s1, t1;
5498
7918f24d
NC
5499 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5500
2fdbfb4d
AB
5501 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5502 switch (paren) {
192b9cd1 5503 /* $` / ${^PREMATCH} */
f1b875a0 5504 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5505 if (rx->offs[0].start != -1) {
5506 i = rx->offs[0].start;
5507 if (i > 0) {
5508 s1 = 0;
5509 t1 = i;
5510 goto getlen;
5511 }
5512 }
5513 return 0;
192b9cd1 5514 /* $' / ${^POSTMATCH} */
f1b875a0 5515 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5516 if (rx->offs[0].end != -1) {
5517 i = rx->sublen - rx->offs[0].end;
5518 if (i > 0) {
5519 s1 = rx->offs[0].end;
5520 t1 = rx->sublen;
5521 goto getlen;
5522 }
5523 }
5524 return 0;
192b9cd1
AB
5525 /* $& / ${^MATCH}, $1, $2, ... */
5526 default:
2fdbfb4d
AB
5527 if (paren <= (I32)rx->nparens &&
5528 (s1 = rx->offs[paren].start) != -1 &&
5529 (t1 = rx->offs[paren].end) != -1)
5530 {
5531 i = t1 - s1;
5532 goto getlen;
5533 } else {
5534 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5535 report_uninit((const SV *)sv);
2fdbfb4d
AB
5536 return 0;
5537 }
5538 }
5539 getlen:
07bc277f 5540 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5541 const char * const s = rx->subbeg + s1;
5542 const U8 *ep;
5543 STRLEN el;
5544
5545 i = t1 - s1;
5546 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5547 i = el;
5548 }
5549 return i;
5550}
5551
fe578d7f 5552SV*
49d7dfbc 5553Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5554{
7918f24d 5555 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5556 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5557 if (0)
5558 return NULL;
5559 else
5560 return newSVpvs("Regexp");
fe578d7f 5561}
0a4db386 5562
894be9b7 5563/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5564 * If flags is REG_RSN_RETURN_NULL returns null.
5565 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5566 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5567 * to the parsed name as looked up in the RExC_paren_names hash.
5568 * If there is an error throws a vFAIL().. type exception.
894be9b7 5569 */
0a4db386
YO
5570
5571#define REG_RSN_RETURN_NULL 0
5572#define REG_RSN_RETURN_NAME 1
5573#define REG_RSN_RETURN_DATA 2
5574
894be9b7 5575STATIC SV*
7918f24d
NC
5576S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5577{
894be9b7 5578 char *name_start = RExC_parse;
1f1031fe 5579
7918f24d
NC
5580 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5581
1f1031fe
YO
5582 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5583 /* skip IDFIRST by using do...while */
5584 if (UTF)
5585 do {
5586 RExC_parse += UTF8SKIP(RExC_parse);
5587 } while (isALNUM_utf8((U8*)RExC_parse));
5588 else
5589 do {
5590 RExC_parse++;
5591 } while (isALNUM(*RExC_parse));
894be9b7 5592 }
1f1031fe 5593
0a4db386 5594 if ( flags ) {
59cd0e26
NC
5595 SV* sv_name
5596 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5597 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5598 if ( flags == REG_RSN_RETURN_NAME)
5599 return sv_name;
5600 else if (flags==REG_RSN_RETURN_DATA) {
5601 HE *he_str = NULL;
5602 SV *sv_dat = NULL;
5603 if ( ! sv_name ) /* should not happen*/
5604 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5605 if (RExC_paren_names)
5606 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5607 if ( he_str )
5608 sv_dat = HeVAL(he_str);
5609 if ( ! sv_dat )
5610 vFAIL("Reference to nonexistent named group");
5611 return sv_dat;
5612 }
5613 else {
5614 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5615 }
5616 /* NOT REACHED */
894be9b7 5617 }
0a4db386 5618 return NULL;
894be9b7
YO
5619}
5620
3dab1dad
YO
5621#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5622 int rem=(int)(RExC_end - RExC_parse); \
5623 int cut; \
5624 int num; \
5625 int iscut=0; \
5626 if (rem>10) { \
5627 rem=10; \
5628 iscut=1; \
5629 } \
5630 cut=10-rem; \
5631 if (RExC_lastparse!=RExC_parse) \
5632 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5633 rem, RExC_parse, \
5634 cut + 4, \
5635 iscut ? "..." : "<" \
5636 ); \
5637 else \
5638 PerlIO_printf(Perl_debug_log,"%16s",""); \
5639 \
5640 if (SIZE_ONLY) \
3b57cd43 5641 num = RExC_size + 1; \
3dab1dad
YO
5642 else \
5643 num=REG_NODE_NUM(RExC_emit); \
5644 if (RExC_lastnum!=num) \
0a4db386 5645 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5646 else \
0a4db386 5647 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5648 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5649 (int)((depth*2)), "", \
3dab1dad
YO
5650 (funcname) \
5651 ); \
5652 RExC_lastnum=num; \
5653 RExC_lastparse=RExC_parse; \
5654})
5655
07be1b83
YO
5656
5657
3dab1dad
YO
5658#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5659 DEBUG_PARSE_MSG((funcname)); \
5660 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5661})
6bda09f9
YO
5662#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5663 DEBUG_PARSE_MSG((funcname)); \
5664 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5665})
d764b54e
KW
5666
5667/* This section of code defines the inversion list object and its methods. The
5668 * interfaces are highly subject to change, so as much as possible is static to
5669 * this file. An inversion list is here implemented as a malloc'd C array with
5670 * some added info. More will be coming when functionality is added later.
5671 *
5672 * Some of the methods should always be private to the implementation, and some
5673 * should eventually be made public */
5674
5675#define INVLIST_INITIAL_LEN 10
5676#define INVLIST_ARRAY_KEY "array"
5677#define INVLIST_MAX_KEY "max"
5678#define INVLIST_LEN_KEY "len"
5679
5680PERL_STATIC_INLINE UV*
5681S_invlist_array(pTHX_ HV* const invlist)
5682{
5683 /* Returns the pointer to the inversion list's array. Every time the
5684 * length changes, this needs to be called in case malloc or realloc moved
5685 * it */
5686
5687 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5688
5689 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5690
5691 if (list_ptr == NULL) {
5692 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5693 INVLIST_ARRAY_KEY);
5694 }
5695
5696 return INT2PTR(UV *, SvUV(*list_ptr));
5697}
5698
5699PERL_STATIC_INLINE void
5700S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5701{
5702 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5703
5704 /* Sets the array stored in the inversion list to the memory beginning with
5705 * the parameter */
5706
5707 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5708 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5709 INVLIST_ARRAY_KEY);
5710 }
5711}
5712
5713PERL_STATIC_INLINE UV
5714S_invlist_len(pTHX_ HV* const invlist)
5715{
5716 /* Returns the current number of elements in the inversion list's array */
5717
5718 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5719
5720 PERL_ARGS_ASSERT_INVLIST_LEN;
5721
5722 if (len_ptr == NULL) {
5723 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5724 INVLIST_LEN_KEY);
5725 }
5726
5727 return SvUV(*len_ptr);
5728}
5729
5730PERL_STATIC_INLINE UV
5731S_invlist_max(pTHX_ HV* const invlist)
5732{
5733 /* Returns the maximum number of elements storable in the inversion list's
5734 * array, without having to realloc() */
5735
5736 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5737
5738 PERL_ARGS_ASSERT_INVLIST_MAX;
5739
5740 if (max_ptr == NULL) {
5741 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5742 INVLIST_MAX_KEY);
5743 }
5744
5745 return SvUV(*max_ptr);
5746}
5747
5748PERL_STATIC_INLINE void
5749S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5750{
5751 /* Sets the current number of elements stored in the inversion list */
5752
5753 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5754
5755 if (len != 0 && len > invlist_max(invlist)) {
5756 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5757 }
5758
5759 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5760 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5761 INVLIST_LEN_KEY);
5762 }
5763}
5764
5765PERL_STATIC_INLINE void
5766S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5767{
5768
5769 /* Sets the maximum number of elements storable in the inversion list
5770 * without having to realloc() */
5771
5772 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5773
5774 if (max < invlist_len(invlist)) {
5775 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5776 }
5777
5778 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5779 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5780 INVLIST_LEN_KEY);
5781 }
5782}
5783
8d69a883 5784#ifndef PERL_IN_XSUB_RE
d764b54e
KW
5785HV*
5786Perl__new_invlist(pTHX_ IV initial_size)
5787{
5788
5789 /* Return a pointer to a newly constructed inversion list, with enough
5790 * space to store 'initial_size' elements. If that number is negative, a
5791 * system default is used instead */
5792
5793 HV* invlist = newHV();
5794 UV* list;
5795
5796 if (initial_size < 0) {
5797 initial_size = INVLIST_INITIAL_LEN;
5798 }
5799
5800 /* Allocate the initial space */
5801 Newx(list, initial_size, UV);
5802 invlist_set_array(invlist, list);
5803
5804 /* set_len has to come before set_max, as the latter inspects the len */
5805 invlist_set_len(invlist, 0);
5806 invlist_set_max(invlist, initial_size);
5807
5808 return invlist;
5809}
8d69a883 5810#endif
d764b54e
KW
5811
5812PERL_STATIC_INLINE void
5813S_invlist_destroy(pTHX_ HV* const invlist)
5814{
5815 /* Inversion list destructor */
5816
5817 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5818
5819 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5820
5821 if (list_ptr != NULL) {
b9d2ea5b
GG
5822 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5823 Safefree(list);
d764b54e
KW
5824 }
5825}
5826
5827STATIC void
5828S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5829{
5830 /* Change the maximum size of an inversion list (up or down) */
5831
5832 UV* orig_array;
5833 UV* array;
5834 const UV old_max = invlist_max(invlist);
5835
5836 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5837
5838 if (old_max == new_max) { /* If a no-op */
5839 return;
5840 }
5841
5842 array = orig_array = invlist_array(invlist);
5843 Renew(array, new_max, UV);
5844
5845 /* If the size change moved the list in memory, set the new one */
5846 if (array != orig_array) {
5847 invlist_set_array(invlist, array);
5848 }
5849
5850 invlist_set_max(invlist, new_max);
5851
5852}
5853
5854PERL_STATIC_INLINE void
5855S_invlist_trim(pTHX_ HV* const invlist)
5856{
5857 PERL_ARGS_ASSERT_INVLIST_TRIM;
5858
5859 /* Change the length of the inversion list to how many entries it currently
5860 * has */
5861
5862 invlist_extend(invlist, invlist_len(invlist));
5863}
5864
5865/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5866 * etc */
5867
5868#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5869
8d69a883 5870#ifndef PERL_IN_XSUB_RE
d764b54e
KW
5871void
5872Perl__append_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
5873{
5874 /* Subject to change or removal. Append the range from 'start' to 'end' at
5875 * the end of the inversion list. The range must be above any existing
5876 * ones. */
5877
5878 UV* array = invlist_array(invlist);
5879 UV max = invlist_max(invlist);
5880 UV len = invlist_len(invlist);
5881
5882 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5883
5884 if (len > 0) {
5885
5886 /* Here, the existing list is non-empty. The current max entry in the
5887 * list is generally the first value not in the set, except when the
5888 * set extends to the end of permissible values, in which case it is
5889 * the first entry in that final set, and so this call is an attempt to
5890 * append out-of-order */
5891
5892 UV final_element = len - 1;
5893 if (array[final_element] > start
5894 || ELEMENT_IN_INVLIST_SET(final_element))
5895 {
5896 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5897 }
5898
5899 /* Here, it is a legal append. If the new range begins with the first
5900 * value not in the set, it is extending the set, so the new first
5901 * value not in the set is one greater than the newly extended range.
5902 * */
5903 if (array[final_element] == start) {
5904 if (end != UV_MAX) {
5905 array[final_element] = end + 1;
5906 }
5907 else {
5908 /* But if the end is the maximum representable on the machine,
5909 * just let the range that this would extend have no end */
5910 invlist_set_len(invlist, len - 1);
5911 }
5912 return;
5913 }
5914 }
5915
5916 /* Here the new range doesn't extend any existing set. Add it */
5917
5918 len += 2; /* Includes an element each for the start and end of range */
5919
5920 /* If overflows the existing space, extend, which may cause the array to be
5921 * moved */
5922 if (max < len) {
5923 invlist_extend(invlist, len);
5924 array = invlist_array(invlist);
5925 }
5926
5927 invlist_set_len(invlist, len);
5928
5929 /* The next item on the list starts the range, the one after that is
5930 * one past the new range. */
5931 array[len - 2] = start;
5932 if (end != UV_MAX) {
5933 array[len - 1] = end + 1;
5934 }
5935 else {
5936 /* But if the end is the maximum representable on the machine, just let
5937 * the range have no end */
5938 invlist_set_len(invlist, len - 1);
5939 }
5940}
8d69a883 5941#endif
d764b54e
KW
5942
5943PERL_STATIC_INLINE HV*
5944S_invlist_union(pTHX_ HV* const a, HV* const b)
5945{
5946 /* Return a new inversion list which is the union of two inversion lists.
5947 * The basis for this comes from "Unicode Demystified" Chapter 13 by
5948 * Richard Gillam, published by Addison-Wesley, and explained at some
5949 * length there. The preface says to incorporate its examples into your
5950 * code at your own risk.
5951 *
5952 * The algorithm is like a merge sort.
5953 *
5954 * XXX A potential performance improvement is to keep track as we go along
5955 * if only one of the inputs contributes to the result, meaning the other
5956 * is a subset of that one. In that case, we can skip the final copy and
5957 * return the larger of the input lists */
5958
5959 UV* array_a = invlist_array(a); /* a's array */
5960 UV* array_b = invlist_array(b);
5961 UV len_a = invlist_len(a); /* length of a's array */
5962 UV len_b = invlist_len(b);
5963
5964 HV* u; /* the resulting union */
5965 UV* array_u;
5966 UV len_u;
5967
5968 UV i_a = 0; /* current index into a's array */
5969 UV i_b = 0;
5970 UV i_u = 0;
5971
5972 /* running count, as explained in the algorithm source book; items are
5973 * stopped accumulating and are output when the count changes to/from 0.
5974 * The count is incremented when we start a range that's in the set, and
5975 * decremented when we start a range that's not in the set. So its range
5976 * is 0 to 2. Only when the count is zero is something not in the set.
5977 */
5978 UV count = 0;
5979
5980 PERL_ARGS_ASSERT_INVLIST_UNION;
5981
5982 /* Size the union for the worst case: that the sets are completely
5983 * disjoint */
5984 u = _new_invlist(len_a + len_b);
5985 array_u = invlist_array(u);
5986
5987 /* Go through each list item by item, stopping when exhausted one of
5988 * them */
5989 while (i_a < len_a && i_b < len_b) {
5990 UV cp; /* The element to potentially add to the union's array */
5991 bool cp_in_set; /* is it in the the input list's set or not */
5992
5993 /* We need to take one or the other of the two inputs for the union.
5994 * Since we are merging two sorted lists, we take the smaller of the
5995 * next items. In case of a tie, we take the one that is in its set
5996 * first. If we took one not in the set first, it would decrement the
5997 * count, possibly to 0 which would cause it to be output as ending the
5998 * range, and the next time through we would take the same number, and
5999 * output it again as beginning the next range. By doing it the
6000 * opposite way, there is no possibility that the count will be
6001 * momentarily decremented to 0, and thus the two adjoining ranges will
6002 * be seamlessly merged. (In a tie and both are in the set or both not
6003 * in the set, it doesn't matter which we take first.) */
6004 if (array_a[i_a] < array_b[i_b]
6005 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6006 {
6007 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6008 cp= array_a[i_a++];
6009 }
6010 else {
6011 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6012 cp= array_b[i_b++];
6013 }
6014
6015 /* Here, have chosen which of the two inputs to look at. Only output
6016 * if the running count changes to/from 0, which marks the
6017 * beginning/end of a range in that's in the set */
6018 if (cp_in_set) {
6019 if (count == 0) {
6020 array_u[i_u++] = cp;
6021 }
6022 count++;
6023 }
6024 else {
6025 count--;
6026 if (count == 0) {
6027 array_u[i_u++] = cp;
6028 }
6029 }
6030 }
6031
6032 /* Here, we are finished going through at least one of the lists, which
6033 * means there is something remaining in at most one. We check if the list
6034 * that hasn't been exhausted is positioned such that we are in the middle
6035 * of a range in its set or not. (We are in the set if the next item in
6036 * the array marks the beginning of something not in the set) If in the
6037 * set, we decrement 'count'; if 0, there is potentially more to output.
6038 * There are four cases:
6039 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6040 * in the union is entirely from the non-exhausted set.
6041 * 2) Both were in their sets, count is 2. Nothing further should
6042 * be output, as everything that remains will be in the exhausted
6043 * list's set, hence in the union; decrementing to 1 but not 0 insures
6044 * that
6045 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6046 * Nothing further should be output because the union includes
6047 * everything from the exhausted set. Not decrementing insures that.
6048 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6049 * decrementing to 0 insures that we look at the remainder of the
6050 * non-exhausted set */
6051 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6052 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6053 {
6054 count--;
6055 }
6056
6057 /* The final length is what we've output so far, plus what else is about to
6058 * be output. (If 'count' is non-zero, then the input list we exhausted
6059 * has everything remaining up to the machine's limit in its set, and hence
6060 * in the union, so there will be no further output. */
6061 len_u = i_u;
6062 if (count == 0) {
6063 /* At most one of the subexpressions will be non-zero */
6064 len_u += (len_a - i_a) + (len_b - i_b);
6065 }
6066
6067 /* Set result to final length, which can change the pointer to array_u, so
6068 * re-find it */
6069 if (len_u != invlist_len(u)) {
6070 invlist_set_len(u, len_u);
6071 invlist_trim(u);
6072 array_u = invlist_array(u);
6073 }
6074
6075 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6076 * the other) ended with everything above it not in its set. That means
6077 * that the remaining part of the union is precisely the same as the
6078 * non-exhausted list, so can just copy it unchanged. (If both list were
6079 * exhausted at the same time, then the operations below will be both 0.)
6080 */
6081 if (count == 0) {
6082 IV copy_count; /* At most one will have a non-zero copy count */
6083 if ((copy_count = len_a - i_a) > 0) {
6084 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6085 }
6086 else if ((copy_count = len_b - i_b) > 0) {
6087 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6088 }
6089 }
6090
6091 return u;
6092}
6093
6094PERL_STATIC_INLINE HV*
6095S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6096{
6097 /* Return the intersection of two inversion lists. The basis for this
6098 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6099 * by Addison-Wesley, and explained at some length there. The preface says
6100 * to incorporate its examples into your code at your own risk.
6101 *
6102 * The algorithm is like a merge sort, and is essentially the same as the
6103 * union above
6104 */
6105
6106 UV* array_a = invlist_array(a); /* a's array */
6107 UV* array_b = invlist_array(b);
6108 UV len_a = invlist_len(a); /* length of a's array */
6109 UV len_b = invlist_len(b);
6110
6111 HV* r; /* the resulting intersection */
6112 UV* array_r;
6113 UV len_r;
6114
6115 UV i_a = 0; /* current index into a's array */
6116 UV i_b = 0;
6117 UV i_r = 0;
6118
6119 /* running count, as explained in the algorithm source book; items are
6120 * stopped accumulating and are output when the count changes to/from 2.
6121 * The count is incremented when we start a range that's in the set, and
6122 * decremented when we start a range that's not in the set. So its range
6123 * is 0 to 2. Only when the count is 2 is something in the intersection.
6124 */
6125 UV count = 0;
6126
6127 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6128
6129 /* Size the intersection for the worst case: that the intersection ends up
6130 * fragmenting everything to be completely disjoint */
6131 r= _new_invlist(len_a + len_b);
6132 array_r = invlist_array(r);
6133
6134 /* Go through each list item by item, stopping when exhausted one of
6135 * them */
6136 while (i_a < len_a && i_b < len_b) {
6137 UV cp; /* The element to potentially add to the intersection's
6138 array */
6139 bool cp_in_set; /* Is it in the input list's set or not */
6140
6141 /* We need to take one or the other of the two inputs for the union.
6142 * Since we are merging two sorted lists, we take the smaller of the
6143 * next items. In case of a tie, we take the one that is not in its
6144 * set first (a difference from the union algorithm). If we took one
6145 * in the set first, it would increment the count, possibly to 2 which
6146 * would cause it to be output as starting a range in the intersection,
6147 * and the next time through we would take that same number, and output
6148 * it again as ending the set. By doing it the opposite of this, we
6149 * there is no possibility that the count will be momentarily
6150 * incremented to 2. (In a tie and both are in the set or both not in
6151 * the set, it doesn't matter which we take first.) */
6152 if (array_a[i_a] < array_b[i_b]
6153 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6154 {
6155 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6156 cp= array_a[i_a++];
6157 }
6158 else {
6159 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6160 cp= array_b[i_b++];
6161 }
6162
6163 /* Here, have chosen which of the two inputs to look at. Only output
6164 * if the running count changes to/from 2, which marks the
6165 * beginning/end of a range that's in the intersection */
6166 if (cp_in_set) {
6167 count++;
6168 if (count == 2) {
6169 array_r[i_r++] = cp;
6170 }
6171 }
6172 else {
6173 if (count == 2) {
6174 array_r[i_r++] = cp;
6175 }
6176 count--;
6177 }
6178 }
6179
6180 /* Here, we are finished going through at least one of the sets, which
6181 * means there is something remaining in at most one. See the comments in
6182 * the union code */
6183 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6184 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6185 {
6186 count--;
6187 }
6188
6189 /* The final length is what we've output so far plus what else is in the
6190 * intersection. Only one of the subexpressions below will be non-zero */
6191 len_r = i_r;
6192 if (count == 2) {
6193 len_r += (len_a - i_a) + (len_b - i_b);
6194 }
6195
6196 /* Set result to final length, which can change the pointer to array_r, so
6197 * re-find it */
6198 if (len_r != invlist_len(r)) {
6199 invlist_set_len(r, len_r);
6200 invlist_trim(r);
6201 array_r = invlist_array(r);
6202 }
6203
6204 /* Finish outputting any remaining */
6205 if (count == 2) { /* Only one of will have a non-zero copy count */
6206 IV copy_count;
6207 if ((copy_count = len_a - i_a) > 0) {
6208 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6209 }
6210 else if ((copy_count = len_b - i_b) > 0) {
6211 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6212 }
6213 }
6214
6215 return r;
6216}
6217
6218STATIC HV*
6219S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6220{
6221 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6222 * set. A pointer to the inversion list is returned. This may actually be
6223 * a new list, in which case the passed in one has been destroyed */
6224
6225 HV* range_invlist;
6226 HV* added_invlist;
6227
6228 UV len = invlist_len(invlist);
6229
6230 PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
6231
6232 /* If comes after the final entry, can just append it to the end */
6233 if (len == 0
6234 || start >= invlist_array(invlist)
6235 [invlist_len(invlist) - 1])
6236 {
6237 _append_range_to_invlist(invlist, start, end);
6238 return invlist;
6239 }
6240
6241 /* Here, can't just append things, create and return a new inversion list
6242 * which is the union of this range and the existing inversion list */
6243 range_invlist = _new_invlist(2);
6244 _append_range_to_invlist(range_invlist, start, end);
6245
6246 added_invlist = invlist_union(invlist, range_invlist);
6247
6248 /* The passed in list can be freed, as well as our temporary */
6249 invlist_destroy(range_invlist);
6250 if (invlist != added_invlist) {
6251 invlist_destroy(invlist);
6252 }
6253
6254 return added_invlist;
6255}
6256
6257/* End of inversion list object */
6258
a687059c
LW
6259/*
6260 - reg - regular expression, i.e. main body or parenthesized thing
6261 *
6262 * Caller must absorb opening parenthesis.
6263 *
6264 * Combining parenthesis handling with the base level of regular expression
6265 * is a trifle forced, but the need to tie the tails of the branches to what
6266 * follows makes it hard to avoid.
6267 */
07be1b83
YO
6268#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6269#ifdef DEBUGGING
6270#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6271#else
6272#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6273#endif
3dab1dad 6274
76e3520e 6275STATIC regnode *
3dab1dad 6276S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 6277 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 6278{
27da23d5 6279 dVAR;
c277df42
IZ
6280 register regnode *ret; /* Will be the head of the group. */
6281 register regnode *br;
6282 register regnode *lastbr;
cbbf8932 6283 register regnode *ender = NULL;
a0d0e21e 6284 register I32 parno = 0;
cbbf8932 6285 I32 flags;
f7819f85 6286 U32 oregflags = RExC_flags;
6136c704
AL
6287 bool have_branch = 0;
6288 bool is_open = 0;
594d7033
YO
6289 I32 freeze_paren = 0;
6290 I32 after_freeze = 0;
9d1d55b5
JP
6291
6292 /* for (?g), (?gc), and (?o) warnings; warning
6293 about (?c) will warn about (?g) -- japhy */
6294
6136c704
AL
6295#define WASTED_O 0x01
6296#define WASTED_G 0x02
6297#define WASTED_C 0x04
6298#define WASTED_GC (0x02|0x04)
cbbf8932 6299 I32 wastedflags = 0x00;
9d1d55b5 6300
fac92740 6301 char * parse_start = RExC_parse; /* MJD */
a28509cc 6302 char * const oregcomp_parse = RExC_parse;
a0d0e21e 6303
3dab1dad 6304 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6305
6306 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
6307 DEBUG_PARSE("reg ");
6308
821b33a5 6309 *flagp = 0; /* Tentatively. */
a0d0e21e 6310
9d1d55b5 6311
a0d0e21e
LW
6312 /* Make an OPEN node, if parenthesized. */
6313 if (paren) {
e2e6a0f1
YO
6314 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6315 char *start_verb = RExC_parse;
6316 STRLEN verb_len = 0;
6317 char *start_arg = NULL;
6318 unsigned char op = 0;
6319 int argok = 1;
6320 int internal_argval = 0; /* internal_argval is only useful if !argok */
6321 while ( *RExC_parse && *RExC_parse != ')' ) {
6322 if ( *RExC_parse == ':' ) {
6323 start_arg = RExC_parse + 1;
6324 break;
6325 }
6326 RExC_parse++;
6327 }
6328 ++start_verb;
6329 verb_len = RExC_parse - start_verb;
6330 if ( start_arg ) {
6331 RExC_parse++;
6332 while ( *RExC_parse && *RExC_parse != ')' )
6333 RExC_parse++;
6334 if ( *RExC_parse != ')' )
6335 vFAIL("Unterminated verb pattern argument");
6336 if ( RExC_parse == start_arg )
6337 start_arg = NULL;
6338 } else {
6339 if ( *RExC_parse != ')' )
6340 vFAIL("Unterminated verb pattern");
6341 }
5d458dd8 6342
e2e6a0f1
YO
6343 switch ( *start_verb ) {
6344 case 'A': /* (*ACCEPT) */
568a785a 6345 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
6346 op = ACCEPT;
6347 internal_argval = RExC_nestroot;
6348 }
6349 break;
6350 case 'C': /* (*COMMIT) */
568a785a 6351 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 6352 op = COMMIT;
e2e6a0f1
YO
6353 break;
6354 case 'F': /* (*FAIL) */
568a785a 6355 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
6356 op = OPFAIL;
6357 argok = 0;
6358 }
6359 break;
5d458dd8
YO
6360 case ':': /* (*:NAME) */
6361 case 'M': /* (*MARK:NAME) */
568a785a 6362 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 6363 op = MARKPOINT;
5d458dd8
YO
6364 argok = -1;
6365 }
6366 break;
6367 case 'P': /* (*PRUNE) */
568a785a 6368 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 6369 op = PRUNE;
e2e6a0f1 6370 break;
5d458dd8 6371 case 'S': /* (*SKIP) */
568a785a 6372 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
6373 op = SKIP;
6374 break;
6375 case 'T': /* (*THEN) */
6376 /* [19:06] <TimToady> :: is then */
568a785a 6377 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
6378 op = CUTGROUP;
6379 RExC_seen |= REG_SEEN_CUTGROUP;
6380 }
e2e6a0f1
YO
6381 break;
6382 }
6383 if ( ! op ) {
6384 RExC_parse++;
6385 vFAIL3("Unknown verb pattern '%.*s'",
6386 verb_len, start_verb);
6387 }
6388 if ( argok ) {
6389 if ( start_arg && internal_argval ) {
6390 vFAIL3("Verb pattern '%.*s' may not have an argument",
6391 verb_len, start_verb);
6392 } else if ( argok < 0 && !start_arg ) {
6393 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6394 verb_len, start_verb);
6395 } else {
6396 ret = reganode(pRExC_state, op, internal_argval);
6397 if ( ! internal_argval && ! SIZE_ONLY ) {
6398 if (start_arg) {
6399 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6400 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6401 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
6402 ret->flags = 0;
6403 } else {
6404 ret->flags = 1;
6405 }
6406 }
6407 }
6408 if (!internal_argval)
6409 RExC_seen |= REG_SEEN_VERBARG;
6410 } else if ( start_arg ) {
6411 vFAIL3("Verb pattern '%.*s' may not have an argument",
6412 verb_len, start_verb);
6413 } else {
6414 ret = reg_node(pRExC_state, op);
6415 }
6416 nextchar(pRExC_state);
6417 return ret;
6418 } else
fac92740 6419 if (*RExC_parse == '?') { /* (?...) */
6136c704 6420 bool is_logical = 0;
a28509cc 6421 const char * const seqstart = RExC_parse;
fb85c044 6422 bool has_use_defaults = FALSE;
ca9dfc88 6423
830247a4
IZ
6424 RExC_parse++;
6425 paren = *RExC_parse++;
c277df42 6426 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 6427 switch (paren) {
894be9b7 6428
1f1031fe
YO
6429 case 'P': /* (?P...) variants for those used to PCRE/Python */
6430 paren = *RExC_parse++;
6431 if ( paren == '<') /* (?P<...>) named capture */
6432 goto named_capture;
6433 else if (paren == '>') { /* (?P>name) named recursion */
6434 goto named_recursion;
6435 }
6436 else if (paren == '=') { /* (?P=...) named backref */
6437 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6438 you change this make sure you change that */
6439 char* name_start = RExC_parse;
6440 U32 num = 0;
6441 SV *sv_dat = reg_scan_name(pRExC_state,
6442 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6443 if (RExC_parse == name_start || *RExC_parse != ')')
6444 vFAIL2("Sequence %.3s... not terminated",parse_start);
6445
6446 if (!SIZE_ONLY) {
6447 num = add_data( pRExC_state, 1, "S" );
6448 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6449 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
6450 }
6451 RExC_sawback = 1;
4444fd9f
KW
6452 ret = reganode(pRExC_state,
6453 ((! FOLD)
6454 ? NREF
6455 : (UNI_SEMANTICS)
6456 ? NREFFU
6457 : (LOC)
6458 ? NREFFL
6459 : NREFF),
6460 num);
1f1031fe
YO
6461 *flagp |= HASWIDTH;
6462
6463 Set_Node_Offset(ret, parse_start+1);
6464 Set_Node_Cur_Length(ret); /* MJD */
6465
6466 nextchar(pRExC_state);
6467 return ret;
6468 }
57b84237
YO
6469 RExC_parse++;
6470 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6471 /*NOTREACHED*/
6472 case '<': /* (?<...) */
b81d288d 6473 if (*RExC_parse == '!')
c277df42 6474 paren = ',';
0a4db386 6475 else if (*RExC_parse != '=')
1f1031fe 6476 named_capture:
0a4db386 6477 { /* (?<...>) */
81714fb9 6478 char *name_start;
894be9b7 6479 SV *svname;
81714fb9
YO
6480 paren= '>';
6481 case '\'': /* (?'...') */
6482 name_start= RExC_parse;
0a4db386
YO
6483 svname = reg_scan_name(pRExC_state,
6484 SIZE_ONLY ? /* reverse test from the others */
6485 REG_RSN_RETURN_NAME :
6486 REG_RSN_RETURN_NULL);
57b84237
YO
6487 if (RExC_parse == name_start) {
6488 RExC_parse++;
6489 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6490 /*NOTREACHED*/
6491 }
81714fb9
YO
6492 if (*RExC_parse != paren)
6493 vFAIL2("Sequence (?%c... not terminated",
6494 paren=='>' ? '<' : paren);
6495 if (SIZE_ONLY) {
e62cc96a
YO
6496 HE *he_str;
6497 SV *sv_dat = NULL;
486ec47a 6498 if (!svname) /* shouldn't happen */
894be9b7
YO
6499 Perl_croak(aTHX_
6500 "panic: reg_scan_name returned NULL");
81714fb9
YO
6501 if (!RExC_paren_names) {
6502 RExC_paren_names= newHV();
ad64d0ec 6503 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
6504#ifdef DEBUGGING
6505 RExC_paren_name_list= newAV();
ad64d0ec 6506 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 6507#endif
81714fb9
YO
6508 }
6509 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 6510 if ( he_str )
81714fb9 6511 sv_dat = HeVAL(he_str);
e62cc96a 6512 if ( ! sv_dat ) {
81714fb9 6513 /* croak baby croak */
e62cc96a
YO
6514 Perl_croak(aTHX_
6515 "panic: paren_name hash element allocation failed");
6516 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
6517 /* (?|...) can mean we have dupes so scan to check
6518 its already been stored. Maybe a flag indicating
6519 we are inside such a construct would be useful,
6520 but the arrays are likely to be quite small, so
6521 for now we punt -- dmq */
6522 IV count = SvIV(sv_dat);
6523 I32 *pv = (I32*)SvPVX(sv_dat);
6524 IV i;
6525 for ( i = 0 ; i < count ; i++ ) {
6526 if ( pv[i] == RExC_npar ) {
6527 count = 0;
6528 break;
6529 }
6530 }
6531 if ( count ) {
6532 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6533 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6534 pv[count] = RExC_npar;
3a92e6ae 6535 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 6536 }
81714fb9
YO
6537 } else {
6538 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6539 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6540 SvIOK_on(sv_dat);
3ec35e0f 6541 SvIV_set(sv_dat, 1);
e62cc96a 6542 }
1f1031fe
YO
6543#ifdef DEBUGGING
6544 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6545 SvREFCNT_dec(svname);
6546#endif
e62cc96a 6547
81714fb9
YO
6548 /*sv_dump(sv_dat);*/
6549 }
6550 nextchar(pRExC_state);
6551 paren = 1;
6552 goto capturing_parens;
6553 }
6554 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 6555 RExC_in_lookbehind++;
830247a4 6556 RExC_parse++;
fac92740 6557 case '=': /* (?=...) */
89c6a13e 6558 RExC_seen_zerolen++;
5c3fa2e7 6559 break;
fac92740 6560 case '!': /* (?!...) */
830247a4 6561 RExC_seen_zerolen++;
e2e6a0f1
YO
6562 if (*RExC_parse == ')') {
6563 ret=reg_node(pRExC_state, OPFAIL);
6564 nextchar(pRExC_state);
6565 return ret;
6566 }
594d7033
YO
6567 break;
6568 case '|': /* (?|...) */
6569 /* branch reset, behave like a (?:...) except that
6570 buffers in alternations share the same numbers */
6571 paren = ':';
6572 after_freeze = freeze_paren = RExC_npar;
6573 break;
fac92740
MJD
6574 case ':': /* (?:...) */
6575 case '>': /* (?>...) */
a0d0e21e 6576 break;
fac92740
MJD
6577 case '$': /* (?$...) */
6578 case '@': /* (?@...) */
8615cb43 6579 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 6580 break;
fac92740 6581 case '#': /* (?#...) */
830247a4
IZ
6582 while (*RExC_parse && *RExC_parse != ')')
6583 RExC_parse++;
6584 if (*RExC_parse != ')')
c277df42 6585 FAIL("Sequence (?#... not terminated");
830247a4 6586 nextchar(pRExC_state);
a0d0e21e
LW
6587 *flagp = TRYAGAIN;
6588 return NULL;
894be9b7
YO
6589 case '0' : /* (?0) */
6590 case 'R' : /* (?R) */
6591 if (*RExC_parse != ')')
6bda09f9 6592 FAIL("Sequence (?R) not terminated");
1a147d38 6593 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 6594 *flagp |= POSTPONED;
7f69552c
YO
6595 nextchar(pRExC_state);
6596 return ret;
6597 /*notreached*/
894be9b7
YO
6598 { /* named and numeric backreferences */
6599 I32 num;
894be9b7
YO
6600 case '&': /* (?&NAME) */
6601 parse_start = RExC_parse - 1;
1f1031fe 6602 named_recursion:
894be9b7 6603 {
0a4db386
YO
6604 SV *sv_dat = reg_scan_name(pRExC_state,
6605 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6606 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6607 }
6608 goto gen_recurse_regop;
6609 /* NOT REACHED */
542fa716
YO
6610 case '+':
6611 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6612 RExC_parse++;
6613 vFAIL("Illegal pattern");
6614 }
6615 goto parse_recursion;
6616 /* NOT REACHED*/
6617 case '-': /* (?-1) */
6618 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6619 RExC_parse--; /* rewind to let it be handled later */
6620 goto parse_flags;
6621 }
6622 /*FALLTHROUGH */
6bda09f9
YO
6623 case '1': case '2': case '3': case '4': /* (?1) */
6624 case '5': case '6': case '7': case '8': case '9':
6625 RExC_parse--;
542fa716 6626 parse_recursion:
894be9b7
YO
6627 num = atoi(RExC_parse);
6628 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6629 if (*RExC_parse == '-')
6630 RExC_parse++;
6bda09f9
YO
6631 while (isDIGIT(*RExC_parse))
6632 RExC_parse++;
6633 if (*RExC_parse!=')')
6634 vFAIL("Expecting close bracket");
894be9b7
YO
6635
6636 gen_recurse_regop:
542fa716
YO
6637 if ( paren == '-' ) {
6638 /*
6639 Diagram of capture buffer numbering.
6640 Top line is the normal capture buffer numbers
3b753521 6641 Bottom line is the negative indexing as from
542fa716
YO
6642 the X (the (?-2))
6643
6644 + 1 2 3 4 5 X 6 7
6645 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6646 - 5 4 3 2 1 X x x
6647
6648 */
6649 num = RExC_npar + num;
6650 if (num < 1) {
6651 RExC_parse++;
6652 vFAIL("Reference to nonexistent group");
6653 }
6654 } else if ( paren == '+' ) {
6655 num = RExC_npar + num - 1;
6656 }
6657
1a147d38 6658 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6659 if (!SIZE_ONLY) {
6660 if (num > (I32)RExC_rx->nparens) {
6661 RExC_parse++;
6662 vFAIL("Reference to nonexistent group");
6663 }
40d049e4 6664 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6665 RExC_emit++;
226de585 6666 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6667 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6668 } else {
6bda09f9 6669 RExC_size++;
6bda09f9 6670 }
0a4db386 6671 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6672 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6673 Set_Node_Offset(ret, parse_start); /* MJD */
6674
a3b492c3 6675 *flagp |= POSTPONED;
6bda09f9
YO
6676 nextchar(pRExC_state);
6677 return ret;
894be9b7
YO
6678 } /* named and numeric backreferences */
6679 /* NOT REACHED */
6680
fac92740 6681 case '?': /* (??...) */
6136c704 6682 is_logical = 1;
57b84237
YO
6683 if (*RExC_parse != '{') {
6684 RExC_parse++;
6685 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6686 /*NOTREACHED*/
6687 }
a3b492c3 6688 *flagp |= POSTPONED;
830247a4 6689 paren = *RExC_parse++;
0f5d15d6 6690 /* FALL THROUGH */
fac92740 6691 case '{': /* (?{...}) */
c277df42 6692 {
2eccd3b2
NC
6693 I32 count = 1;
6694 U32 n = 0;
c277df42 6695 char c;
830247a4 6696 char *s = RExC_parse;
c277df42 6697
830247a4
IZ
6698 RExC_seen_zerolen++;
6699 RExC_seen |= REG_SEEN_EVAL;
6700 while (count && (c = *RExC_parse)) {
6136c704
AL
6701 if (c == '\\') {
6702 if (RExC_parse[1])
6703 RExC_parse++;
6704 }
b81d288d 6705 else if (c == '{')
c277df42 6706 count++;
b81d288d 6707 else if (c == '}')
c277df42 6708 count--;
830247a4 6709 RExC_parse++;
c277df42 6710 }
6136c704 6711 if (*RExC_parse != ')') {
b81d288d 6712 RExC_parse = s;
b45f050a
JF
6713 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6714 }
c277df42 6715 if (!SIZE_ONLY) {
f3548bdc 6716 PAD *pad;
6136c704
AL
6717 OP_4tree *sop, *rop;
6718 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6719
569233ed
SB
6720 ENTER;
6721 Perl_save_re_context(aTHX);
d59a8b3e 6722 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6723 sop->op_private |= OPpREFCOUNTED;
6724 /* re_dup will OpREFCNT_inc */
6725 OpREFCNT_set(sop, 1);
569233ed 6726 LEAVE;
c277df42 6727
830247a4 6728 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6729 RExC_rxi->data->data[n] = (void*)rop;
6730 RExC_rxi->data->data[n+1] = (void*)sop;
6731 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6732 SvREFCNT_dec(sv);
a0ed51b3 6733 }
e24b16f9 6734 else { /* First pass */
830247a4 6735 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6736 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6737 /* No compiled RE interpolated, has runtime
6738 components ===> unsafe. */
6739 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6740 if (PL_tainting && PL_tainted)
cc6b7395 6741 FAIL("Eval-group in insecure regular expression");
54df2634 6742#if PERL_VERSION > 8
923e4eb5 6743 if (IN_PERL_COMPILETIME)
b5c19bd7 6744 PL_cv_has_eval = 1;
54df2634 6745#endif
c277df42 6746 }
b5c19bd7 6747
830247a4 6748 nextchar(pRExC_state);
6136c704 6749 if (is_logical) {
830247a4 6750 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6751 if (!SIZE_ONLY)
6752 ret->flags = 2;
3dab1dad 6753 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6754 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6755 return ret;
6756 }
ccb2c380
MP
6757 ret = reganode(pRExC_state, EVAL, n);
6758 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6759 Set_Node_Offset(ret, parse_start);
6760 return ret;
c277df42 6761 }
fac92740 6762 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6763 {
0a4db386 6764 int is_define= 0;
fac92740 6765 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6766 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6767 || RExC_parse[1] == '<'
830247a4 6768 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6769 I32 flag;
6770
830247a4 6771 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6772 if (!SIZE_ONLY)
6773 ret->flags = 1;
3dab1dad 6774 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6775 goto insert_if;
b81d288d 6776 }
a0ed51b3 6777 }
0a4db386
YO
6778 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6779 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6780 {
6781 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6782 char *name_start= RExC_parse++;
2eccd3b2 6783 U32 num = 0;
0a4db386
YO
6784 SV *sv_dat=reg_scan_name(pRExC_state,
6785 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6786 if (RExC_parse == name_start || *RExC_parse != ch)
6787 vFAIL2("Sequence (?(%c... not terminated",
6788 (ch == '>' ? '<' : ch));
6789 RExC_parse++;
6790 if (!SIZE_ONLY) {
6791 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6792 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6793 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6794 }
6795 ret = reganode(pRExC_state,NGROUPP,num);
6796 goto insert_if_check_paren;
6797 }
6798 else if (RExC_parse[0] == 'D' &&
6799 RExC_parse[1] == 'E' &&
6800 RExC_parse[2] == 'F' &&
6801 RExC_parse[3] == 'I' &&
6802 RExC_parse[4] == 'N' &&
6803 RExC_parse[5] == 'E')
6804 {
6805 ret = reganode(pRExC_state,DEFINEP,0);
6806 RExC_parse +=6 ;
6807 is_define = 1;
6808 goto insert_if_check_paren;
6809 }
6810 else if (RExC_parse[0] == 'R') {
6811 RExC_parse++;
6812 parno = 0;
6813 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6814 parno = atoi(RExC_parse++);
6815 while (isDIGIT(*RExC_parse))
6816 RExC_parse++;
6817 } else if (RExC_parse[0] == '&') {
6818 SV *sv_dat;
6819 RExC_parse++;
6820 sv_dat = reg_scan_name(pRExC_state,
6821 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6822 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6823 }
1a147d38 6824 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6825 goto insert_if_check_paren;
6826 }
830247a4 6827 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6828 /* (?(1)...) */
6136c704 6829 char c;
830247a4 6830 parno = atoi(RExC_parse++);
c277df42 6831
830247a4
IZ
6832 while (isDIGIT(*RExC_parse))
6833 RExC_parse++;
fac92740 6834 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6835
0a4db386 6836 insert_if_check_paren:
830247a4 6837 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6838 vFAIL("Switch condition not recognized");
c277df42 6839 insert_if:
3dab1dad
YO
6840 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6841 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6842 if (br == NULL)
830247a4 6843 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6844 else
3dab1dad 6845 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6846 c = *nextchar(pRExC_state);
d1b80229
IZ
6847 if (flags&HASWIDTH)
6848 *flagp |= HASWIDTH;
c277df42 6849 if (c == '|') {
0a4db386
YO
6850 if (is_define)
6851 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6852 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6853 regbranch(pRExC_state, &flags, 1,depth+1);
6854 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6855 if (flags&HASWIDTH)
6856 *flagp |= HASWIDTH;
830247a4 6857 c = *nextchar(pRExC_state);
a0ed51b3
LW
6858 }
6859 else
c277df42
IZ
6860 lastbr = NULL;
6861 if (c != ')')
8615cb43 6862 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6863 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6864 REGTAIL(pRExC_state, br, ender);
c277df42 6865 if (lastbr) {
3dab1dad
YO
6866 REGTAIL(pRExC_state, lastbr, ender);
6867 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6868 }
6869 else
3dab1dad 6870 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6871 RExC_size++; /* XXX WHY do we need this?!!
6872 For large programs it seems to be required
6873 but I can't figure out why. -- dmq*/
c277df42 6874 return ret;
a0ed51b3
LW
6875 }
6876 else {
830247a4 6877 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6878 }
6879 }
1b1626e4 6880 case 0:
830247a4 6881 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6882 vFAIL("Sequence (? incomplete");
1b1626e4 6883 break;
85508812
KW
6884 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6885 that follow */
fb85c044
KW
6886 has_use_defaults = TRUE;
6887 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
6888 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6889 ? REGEX_UNICODE_CHARSET
6890 : REGEX_DEPENDS_CHARSET);
fb85c044 6891 goto parse_flags;
a0d0e21e 6892 default:
cde0cee5
YO
6893 --RExC_parse;
6894 parse_flags: /* (?i) */
6895 {
6896 U32 posflags = 0, negflags = 0;
6897 U32 *flagsp = &posflags;
9de15fec 6898 bool has_charset_modifier = 0;
a62b1201 6899 regex_charset cs = REGEX_DEPENDS_CHARSET;
cde0cee5
YO
6900
6901 while (*RExC_parse) {
6902 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6903 /* (?g), (?gc) and (?o) are useless here
6904 and must be globally applied -- japhy */
cde0cee5
YO
6905 switch (*RExC_parse) {
6906 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6907 case LOCALE_PAT_MOD:
6908 if (has_charset_modifier || flagsp == &negflags) {
6909 goto fail_modifiers;
6910 }
a62b1201 6911 cs = REGEX_LOCALE_CHARSET;
9de15fec
KW
6912 has_charset_modifier = 1;
6913 break;
6914 case UNICODE_PAT_MOD:
6915 if (has_charset_modifier || flagsp == &negflags) {
6916 goto fail_modifiers;
6917 }
a62b1201 6918 cs = REGEX_UNICODE_CHARSET;
9de15fec
KW
6919 has_charset_modifier = 1;
6920 break;
cfaf538b
KW
6921 case ASCII_RESTRICT_PAT_MOD:
6922 if (has_charset_modifier || flagsp == &negflags) {
6923 goto fail_modifiers;
6924 }
6925 cs = REGEX_ASCII_RESTRICTED_CHARSET;
6926 has_charset_modifier = 1;
6927 break;
50e91148 6928 case DEPENDS_PAT_MOD:
9de15fec
KW
6929 if (has_use_defaults
6930 || has_charset_modifier
6931 || flagsp == &negflags)
6932 {
6933 goto fail_modifiers;
6934 }
7b98bc43
KW
6935
6936 /* The dual charset means unicode semantics if the
6937 * pattern (or target, not known until runtime) are
e40e74fe
KW
6938 * utf8, or something in the pattern indicates unicode
6939 * semantics */
6940 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
6941 ? REGEX_UNICODE_CHARSET
6942 : REGEX_DEPENDS_CHARSET;
9de15fec
KW
6943 has_charset_modifier = 1;
6944 break;
f7819f85
A
6945 case ONCE_PAT_MOD: /* 'o' */
6946 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6947 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6948 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6949 if (! (wastedflags & wflagbit) ) {
6950 wastedflags |= wflagbit;
6951 vWARN5(
6952 RExC_parse + 1,
6953 "Useless (%s%c) - %suse /%c modifier",
6954 flagsp == &negflags ? "?-" : "?",
6955 *RExC_parse,
6956 flagsp == &negflags ? "don't " : "",
6957 *RExC_parse
6958 );
6959 }
6960 }
cde0cee5
YO
6961 break;
6962
f7819f85 6963 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6964 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6965 if (! (wastedflags & WASTED_C) ) {
6966 wastedflags |= WASTED_GC;
9d1d55b5
JP
6967 vWARN3(
6968 RExC_parse + 1,
6969 "Useless (%sc) - %suse /gc modifier",
6970 flagsp == &negflags ? "?-" : "?",
6971 flagsp == &negflags ? "don't " : ""
6972 );
6973 }
6974 }
cde0cee5 6975 break;
f7819f85 6976 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6977 if (flagsp == &negflags) {
668c081a
NC
6978 if (SIZE_ONLY)
6979 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6980 } else {
6981 *flagsp |= RXf_PMf_KEEPCOPY;
6982 }
6983 break;
6984 case '-':
3b753521 6985 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6986 * if there is a minus, it means will be trying to
6987 * re-specify a default which is an error */
6988 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6989 fail_modifiers:
57b84237
YO
6990 RExC_parse++;
6991 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6992 /*NOTREACHED*/
6993 }
cde0cee5
YO
6994 flagsp = &negflags;
6995 wastedflags = 0; /* reset so (?g-c) warns twice */
6996 break;
6997 case ':':
6998 paren = ':';
6999 /*FALLTHROUGH*/
7000 case ')':
7001 RExC_flags |= posflags;
7002 RExC_flags &= ~negflags;
a62b1201 7003 set_regex_charset(&RExC_flags, cs);
f7819f85
A
7004 if (paren != ':') {
7005 oregflags |= posflags;
7006 oregflags &= ~negflags;
a62b1201 7007 set_regex_charset(&oregflags, cs);
f7819f85 7008 }
cde0cee5
YO
7009 nextchar(pRExC_state);
7010 if (paren != ':') {
7011 *flagp = TRYAGAIN;
7012 return NULL;
7013 } else {
7014 ret = NULL;
7015 goto parse_rest;
7016 }
7017 /*NOTREACHED*/
7018 default:
cde0cee5
YO
7019 RExC_parse++;
7020 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7021 /*NOTREACHED*/
7022 }
830247a4 7023 ++RExC_parse;
48c036b1 7024 }
cde0cee5 7025 }} /* one for the default block, one for the switch */
a0d0e21e 7026 }
fac92740 7027 else { /* (...) */
81714fb9 7028 capturing_parens:
830247a4
IZ
7029 parno = RExC_npar;
7030 RExC_npar++;
e2e6a0f1 7031
830247a4 7032 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
7033 if (!SIZE_ONLY ){
7034 if (!RExC_nestroot)
7035 RExC_nestroot = parno;
c009da3d
YO
7036 if (RExC_seen & REG_SEEN_RECURSE
7037 && !RExC_open_parens[parno-1])
7038 {
e2e6a0f1 7039 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
7040 "Setting open paren #%"IVdf" to %d\n",
7041 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
7042 RExC_open_parens[parno-1]= ret;
7043 }
6bda09f9 7044 }
fac92740
MJD
7045 Set_Node_Length(ret, 1); /* MJD */
7046 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 7047 is_open = 1;
a0d0e21e 7048 }
a0ed51b3 7049 }
fac92740 7050 else /* ! paren */
a0d0e21e 7051 ret = NULL;
cde0cee5
YO
7052
7053 parse_rest:
a0d0e21e 7054 /* Pick up the branches, linking them together. */
fac92740 7055 parse_start = RExC_parse; /* MJD */
3dab1dad 7056 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
7057
7058 if (freeze_paren) {
7059 if (RExC_npar > after_freeze)
7060 after_freeze = RExC_npar;
7061 RExC_npar = freeze_paren;
7062 }
7063
fac92740 7064 /* branch_len = (paren != 0); */
2af232bd 7065
a0d0e21e
LW
7066 if (br == NULL)
7067 return(NULL);
830247a4
IZ
7068 if (*RExC_parse == '|') {
7069 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 7070 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 7071 }
fac92740 7072 else { /* MJD */
6bda09f9 7073 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
7074 Set_Node_Length(br, paren != 0);
7075 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7076 }
c277df42
IZ
7077 have_branch = 1;
7078 if (SIZE_ONLY)
830247a4 7079 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
7080 }
7081 else if (paren == ':') {
c277df42
IZ
7082 *flagp |= flags&SIMPLE;
7083 }
6136c704 7084 if (is_open) { /* Starts with OPEN. */
3dab1dad 7085 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
7086 }
7087 else if (paren != '?') /* Not Conditional */
a0d0e21e 7088 ret = br;
8ae10a67 7089 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 7090 lastbr = br;
830247a4
IZ
7091 while (*RExC_parse == '|') {
7092 if (!SIZE_ONLY && RExC_extralen) {
7093 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 7094 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
7095 }
7096 if (SIZE_ONLY)
830247a4
IZ
7097 RExC_extralen += 2; /* Account for LONGJMP. */
7098 nextchar(pRExC_state);
594d7033
YO
7099 if (freeze_paren) {
7100 if (RExC_npar > after_freeze)
7101 after_freeze = RExC_npar;
7102 RExC_npar = freeze_paren;
7103 }
3dab1dad 7104 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 7105
a687059c 7106 if (br == NULL)
a0d0e21e 7107 return(NULL);
3dab1dad 7108 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 7109 lastbr = br;
8ae10a67 7110 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
7111 }
7112
c277df42
IZ
7113 if (have_branch || paren != ':') {
7114 /* Make a closing node, and hook it on the end. */
7115 switch (paren) {
7116 case ':':
830247a4 7117 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
7118 break;
7119 case 1:
830247a4 7120 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
7121 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7122 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7123 "Setting close paren #%"IVdf" to %d\n",
7124 (IV)parno, REG_NODE_NUM(ender)));
7125 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
7126 if (RExC_nestroot == parno)
7127 RExC_nestroot = 0;
40d049e4 7128 }
fac92740
MJD
7129 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7130 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
7131 break;
7132 case '<':
c277df42
IZ
7133 case ',':
7134 case '=':
7135 case '!':
c277df42 7136 *flagp &= ~HASWIDTH;
821b33a5
IZ
7137 /* FALL THROUGH */
7138 case '>':
830247a4 7139 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
7140 break;
7141 case 0:
830247a4 7142 ender = reg_node(pRExC_state, END);
40d049e4
YO
7143 if (!SIZE_ONLY) {
7144 assert(!RExC_opend); /* there can only be one! */
7145 RExC_opend = ender;
7146 }
c277df42
IZ
7147 break;
7148 }
eaf3ca90 7149 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 7150
9674d46a 7151 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
7152 if (depth==1)
7153 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7154
c277df42 7155 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
7156 for (br = ret; br; br = regnext(br)) {
7157 const U8 op = PL_regkind[OP(br)];
7158 if (op == BRANCH) {
07be1b83 7159 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
7160 }
7161 else if (op == BRANCHJ) {
07be1b83 7162 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 7163 }
c277df42
IZ
7164 }
7165 }
a0d0e21e 7166 }
c277df42
IZ
7167
7168 {
e1ec3a88
AL
7169 const char *p;
7170 static const char parens[] = "=!<,>";
c277df42
IZ
7171
7172 if (paren && (p = strchr(parens, paren))) {
eb160463 7173 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
7174 int flag = (p - parens) > 1;
7175
7176 if (paren == '>')
7177 node = SUSPEND, flag = 0;
6bda09f9 7178 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
7179 Set_Node_Cur_Length(ret);
7180 Set_Node_Offset(ret, parse_start + 1);
c277df42 7181 ret->flags = flag;
07be1b83 7182 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 7183 }
a0d0e21e
LW
7184 }
7185
7186 /* Check for proper termination. */
ce3e6498 7187 if (paren) {
e2509266 7188 RExC_flags = oregflags;
830247a4
IZ
7189 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7190 RExC_parse = oregcomp_parse;
380a0633 7191 vFAIL("Unmatched (");
ce3e6498 7192 }
a0ed51b3 7193 }
830247a4
IZ
7194 else if (!paren && RExC_parse < RExC_end) {
7195 if (*RExC_parse == ')') {
7196 RExC_parse++;
380a0633 7197 vFAIL("Unmatched )");
a0ed51b3
LW
7198 }
7199 else
b45f050a 7200 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
7201 /* NOTREACHED */
7202 }
b57e4118
KW
7203
7204 if (RExC_in_lookbehind) {
7205 RExC_in_lookbehind--;
7206 }
594d7033
YO
7207 if (after_freeze)
7208 RExC_npar = after_freeze;
a0d0e21e 7209 return(ret);
a687059c
LW
7210}
7211
7212/*
7213 - regbranch - one alternative of an | operator
7214 *
7215 * Implements the concatenation operator.
7216 */
76e3520e 7217STATIC regnode *
3dab1dad 7218S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 7219{
97aff369 7220 dVAR;
c277df42
IZ
7221 register regnode *ret;
7222 register regnode *chain = NULL;
7223 register regnode *latest;
7224 I32 flags = 0, c = 0;
3dab1dad 7225 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7226
7227 PERL_ARGS_ASSERT_REGBRANCH;
7228
3dab1dad 7229 DEBUG_PARSE("brnc");
02daf0ab 7230
b81d288d 7231 if (first)
c277df42
IZ
7232 ret = NULL;
7233 else {
b81d288d 7234 if (!SIZE_ONLY && RExC_extralen)
830247a4 7235 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 7236 else {
830247a4 7237 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
7238 Set_Node_Length(ret, 1);
7239 }
c277df42
IZ
7240 }
7241
b81d288d 7242 if (!first && SIZE_ONLY)
830247a4 7243 RExC_extralen += 1; /* BRANCHJ */
b81d288d 7244
c277df42 7245 *flagp = WORST; /* Tentatively. */
a0d0e21e 7246
830247a4
IZ
7247 RExC_parse--;
7248 nextchar(pRExC_state);
7249 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 7250 flags &= ~TRYAGAIN;
3dab1dad 7251 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7252 if (latest == NULL) {
7253 if (flags & TRYAGAIN)
7254 continue;
7255 return(NULL);
a0ed51b3
LW
7256 }
7257 else if (ret == NULL)
c277df42 7258 ret = latest;
8ae10a67 7259 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 7260 if (chain == NULL) /* First piece. */
a0d0e21e
LW
7261 *flagp |= flags&SPSTART;
7262 else {
830247a4 7263 RExC_naughty++;
3dab1dad 7264 REGTAIL(pRExC_state, chain, latest);
a687059c 7265 }
a0d0e21e 7266 chain = latest;
c277df42
IZ
7267 c++;
7268 }
7269 if (chain == NULL) { /* Loop ran zero times. */
830247a4 7270 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
7271 if (ret == NULL)
7272 ret = chain;
7273 }
7274 if (c == 1) {
7275 *flagp |= flags&SIMPLE;
a0d0e21e 7276 }
a687059c 7277
d4c19fe8 7278 return ret;
a687059c
LW
7279}
7280
7281/*
7282 - regpiece - something followed by possible [*+?]
7283 *
7284 * Note that the branching code sequences used for ? and the general cases
7285 * of * and + are somewhat optimized: they use the same NOTHING node as
7286 * both the endmarker for their branch list and the body of the last branch.
7287 * It might seem that this node could be dispensed with entirely, but the
7288 * endmarker role is not redundant.
7289 */
76e3520e 7290STATIC regnode *
3dab1dad 7291S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7292{
97aff369 7293 dVAR;
c277df42 7294 register regnode *ret;
a0d0e21e
LW
7295 register char op;
7296 register char *next;
7297 I32 flags;
1df70142 7298 const char * const origparse = RExC_parse;
a0d0e21e 7299 I32 min;
c277df42 7300 I32 max = REG_INFTY;
fac92740 7301 char *parse_start;
10edeb5d 7302 const char *maxpos = NULL;
3dab1dad 7303 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7304
7305 PERL_ARGS_ASSERT_REGPIECE;
7306
3dab1dad 7307 DEBUG_PARSE("piec");
a0d0e21e 7308
3dab1dad 7309 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7310 if (ret == NULL) {
7311 if (flags & TRYAGAIN)
7312 *flagp |= TRYAGAIN;
7313 return(NULL);
7314 }
7315
830247a4 7316 op = *RExC_parse;
a0d0e21e 7317
830247a4 7318 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 7319 maxpos = NULL;
fac92740 7320 parse_start = RExC_parse; /* MJD */
830247a4 7321 next = RExC_parse + 1;
a0d0e21e
LW
7322 while (isDIGIT(*next) || *next == ',') {
7323 if (*next == ',') {
7324 if (maxpos)
7325 break;
7326 else
7327 maxpos = next;
a687059c 7328 }
a0d0e21e
LW
7329 next++;
7330 }
7331 if (*next == '}') { /* got one */
7332 if (!maxpos)
7333 maxpos = next;
830247a4
IZ
7334 RExC_parse++;
7335 min = atoi(RExC_parse);
a0d0e21e
LW
7336 if (*maxpos == ',')
7337 maxpos++;
7338 else
830247a4 7339 maxpos = RExC_parse;
a0d0e21e
LW
7340 max = atoi(maxpos);
7341 if (!max && *maxpos != '0')
c277df42
IZ
7342 max = REG_INFTY; /* meaning "infinity" */
7343 else if (max >= REG_INFTY)
8615cb43 7344 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
7345 RExC_parse = next;
7346 nextchar(pRExC_state);
a0d0e21e
LW
7347
7348 do_curly:
7349 if ((flags&SIMPLE)) {
830247a4 7350 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 7351 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
7352 Set_Node_Offset(ret, parse_start+1); /* MJD */
7353 Set_Node_Cur_Length(ret);
a0d0e21e
LW
7354 }
7355 else {
3dab1dad 7356 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
7357
7358 w->flags = 0;
3dab1dad 7359 REGTAIL(pRExC_state, ret, w);
830247a4 7360 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
7361 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7362 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
7363 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7364 }
6bda09f9 7365 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
7366 /* MJD hk */
7367 Set_Node_Offset(ret, parse_start+1);
2af232bd 7368 Set_Node_Length(ret,
fac92740 7369 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 7370
830247a4 7371 if (!SIZE_ONLY && RExC_extralen)
c277df42 7372 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 7373 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 7374 if (SIZE_ONLY)
830247a4
IZ
7375 RExC_whilem_seen++, RExC_extralen += 3;
7376 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 7377 }
c277df42 7378 ret->flags = 0;
a0d0e21e
LW
7379
7380 if (min > 0)
821b33a5
IZ
7381 *flagp = WORST;
7382 if (max > 0)
7383 *flagp |= HASWIDTH;
8fa23287 7384 if (max < min)
8615cb43 7385 vFAIL("Can't do {n,m} with n > m");
c277df42 7386 if (!SIZE_ONLY) {
eb160463
GS
7387 ARG1_SET(ret, (U16)min);
7388 ARG2_SET(ret, (U16)max);
a687059c 7389 }
a687059c 7390
a0d0e21e 7391 goto nest_check;
a687059c 7392 }
a0d0e21e 7393 }
a687059c 7394
a0d0e21e
LW
7395 if (!ISMULT1(op)) {
7396 *flagp = flags;
a687059c 7397 return(ret);
a0d0e21e 7398 }
bb20fd44 7399
c277df42 7400#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
7401
7402 /* if this is reinstated, don't forget to put this back into perldiag:
7403
7404 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7405
7406 (F) The part of the regexp subject to either the * or + quantifier
7407 could match an empty string. The {#} shows in the regular
7408 expression about where the problem was discovered.
7409
7410 */
7411
bb20fd44 7412 if (!(flags&HASWIDTH) && op != '?')
b45f050a 7413 vFAIL("Regexp *+ operand could be empty");
b81d288d 7414#endif
bb20fd44 7415
fac92740 7416 parse_start = RExC_parse;
830247a4 7417 nextchar(pRExC_state);
a0d0e21e 7418
821b33a5 7419 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
7420
7421 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 7422 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 7423 ret->flags = 0;
830247a4 7424 RExC_naughty += 4;
a0d0e21e
LW
7425 }
7426 else if (op == '*') {
7427 min = 0;
7428 goto do_curly;
a0ed51b3
LW
7429 }
7430 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 7431 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 7432 ret->flags = 0;
830247a4 7433 RExC_naughty += 3;
a0d0e21e
LW
7434 }
7435 else if (op == '+') {
7436 min = 1;
7437 goto do_curly;
a0ed51b3
LW
7438 }
7439 else if (op == '?') {
a0d0e21e
LW
7440 min = 0; max = 1;
7441 goto do_curly;
7442 }
7443 nest_check:
668c081a
NC
7444 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7445 ckWARN3reg(RExC_parse,
7446 "%.*s matches null string many times",
7447 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7448 origparse);
a0d0e21e
LW
7449 }
7450
b9b4dddf 7451 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 7452 nextchar(pRExC_state);
6bda09f9 7453 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 7454 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 7455 }
b9b4dddf
YO
7456#ifndef REG_ALLOW_MINMOD_SUSPEND
7457 else
7458#endif
7459 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7460 regnode *ender;
7461 nextchar(pRExC_state);
7462 ender = reg_node(pRExC_state, SUCCEED);
7463 REGTAIL(pRExC_state, ret, ender);
7464 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7465 ret->flags = 0;
7466 ender = reg_node(pRExC_state, TAIL);
7467 REGTAIL(pRExC_state, ret, ender);
7468 /*ret= ender;*/
7469 }
7470
7471 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 7472 RExC_parse++;
b45f050a
JF
7473 vFAIL("Nested quantifiers");
7474 }
a0d0e21e
LW
7475
7476 return(ret);
a687059c
LW
7477}
7478
fc8cd66c
YO
7479
7480/* reg_namedseq(pRExC_state,UVp)
7481
7482 This is expected to be called by a parser routine that has
afefe6bf 7483 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
7484 expected to point at the first char following the N at the time
7485 of the call.
ff3f963a
KW
7486
7487 The \N may be inside (indicated by valuep not being NULL) or outside a
7488 character class.
7489
7490 \N may begin either a named sequence, or if outside a character class, mean
7491 to match a non-newline. For non single-quoted regexes, the tokenizer has
7492 attempted to decide which, and in the case of a named sequence converted it
7493 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7494 where c1... are the characters in the sequence. For single-quoted regexes,
7495 the tokenizer passes the \N sequence through unchanged; this code will not
7496 attempt to determine this nor expand those. The net effect is that if the
7497 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7498 signals that this \N occurrence means to match a non-newline.
7499
7500 Only the \N{U+...} form should occur in a character class, for the same
7501 reason that '.' inside a character class means to just match a period: it
7502 just doesn't make sense.
fc8cd66c
YO
7503
7504 If valuep is non-null then it is assumed that we are parsing inside
7505 of a charclass definition and the first codepoint in the resolved
7506 string is returned via *valuep and the routine will return NULL.
7507 In this mode if a multichar string is returned from the charnames
ff3f963a 7508 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
7509 sequence will be examined. If the string returned is zero length
7510 then the value of *valuep is undefined and NON-NULL will
7511 be returned to indicate failure. (This will NOT be a valid pointer
7512 to a regnode.)
7513
ff3f963a
KW
7514 If valuep is null then it is assumed that we are parsing normal text and a
7515 new EXACT node is inserted into the program containing the resolved string,
7516 and a pointer to the new node is returned. But if the string is zero length
7517 a NOTHING node is emitted instead.
afefe6bf 7518
fc8cd66c 7519 On success RExC_parse is set to the char following the endbrace.
ff3f963a 7520 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
7521 */
7522STATIC regnode *
afefe6bf 7523S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 7524{
c3c41406 7525 char * endbrace; /* '}' following the name */
fc8cd66c 7526 regnode *ret = NULL;
ff3f963a
KW
7527#ifdef DEBUGGING
7528 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7529#endif
c3c41406 7530 char* p;
ff3f963a
KW
7531
7532 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7533
7534 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
7535
7536 GET_RE_DEBUG_FLAGS;
c3c41406
KW
7537
7538 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7539 * modifier. The other meaning does not */
7540 p = (RExC_flags & RXf_PMf_EXTENDED)
7541 ? regwhite( pRExC_state, RExC_parse )
7542 : RExC_parse;
7918f24d 7543
ff3f963a 7544 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
7545 * [^\n]. The former is assumed when it can't be the latter. */
7546 if (*p != '{' || regcurly(p)) {
7547 RExC_parse = p;
ff3f963a 7548 if (valuep) {
afefe6bf 7549 /* no bare \N in a charclass */
ff3f963a
KW
7550 vFAIL("\\N in a character class must be a named character: \\N{...}");
7551 }
afefe6bf
RGS
7552 nextchar(pRExC_state);
7553 ret = reg_node(pRExC_state, REG_ANY);
7554 *flagp |= HASWIDTH|SIMPLE;
7555 RExC_naughty++;
7556 RExC_parse--;
7557 Set_Node_Length(ret, 1); /* MJD */
7558 return ret;
fc8cd66c 7559 }
a4893424 7560
c3c41406
KW
7561 /* Here, we have decided it should be a named sequence */
7562
7563 /* The test above made sure that the next real character is a '{', but
7564 * under the /x modifier, it could be separated by space (or a comment and
7565 * \n) and this is not allowed (for consistency with \x{...} and the
7566 * tokenizer handling of \N{NAME}). */
7567 if (*RExC_parse != '{') {
7568 vFAIL("Missing braces on \\N{}");
7569 }
7570
ff3f963a 7571 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
7572
7573 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7574 || ! (endbrace == RExC_parse /* nothing between the {} */
7575 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7576 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7577 {
7578 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7579 vFAIL("\\N{NAME} must be resolved by the lexer");
7580 }
7581
ff3f963a
KW
7582 if (endbrace == RExC_parse) { /* empty: \N{} */
7583 if (! valuep) {
7584 RExC_parse = endbrace + 1;
7585 return reg_node(pRExC_state,NOTHING);
a4893424 7586 }
fc8cd66c 7587
ff3f963a
KW
7588 if (SIZE_ONLY) {
7589 ckWARNreg(RExC_parse,
7590 "Ignoring zero length \\N{} in character class"
7591 );
7592 RExC_parse = endbrace + 1;
7593 }
7594 *valuep = 0;
7595 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7596 }
ff3f963a 7597
62fed28b 7598 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
7599 RExC_parse += 2; /* Skip past the 'U+' */
7600
7601 if (valuep) { /* In a bracketed char class */
7602 /* We only pay attention to the first char of
7603 multichar strings being returned. I kinda wonder
7604 if this makes sense as it does change the behaviour
7605 from earlier versions, OTOH that behaviour was broken
7606 as well. XXX Solution is to recharacterize as
7607 [rest-of-class]|multi1|multi2... */
7608
7609 STRLEN length_of_hex;
7610 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7611 | PERL_SCAN_DISALLOW_PREFIX
7612 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7613
37820adc
KW
7614 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7615 if (endchar < endbrace) {
ff3f963a
KW
7616 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7617 }
ff3f963a
KW
7618
7619 length_of_hex = (STRLEN)(endchar - RExC_parse);
7620 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7621
7622 /* The tokenizer should have guaranteed validity, but it's possible to
7623 * bypass it by using single quoting, so check */
c3c41406
KW
7624 if (length_of_hex == 0
7625 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7626 {
7627 RExC_parse += length_of_hex; /* Includes all the valid */
7628 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7629 ? UTF8SKIP(RExC_parse)
7630 : 1;
7631 /* Guard against malformed utf8 */
7632 if (RExC_parse >= endchar) RExC_parse = endchar;
7633 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7634 }
7635
7636 RExC_parse = endbrace + 1;
7637 if (endchar == endbrace) return NULL;
7638
7639 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7640 }
ff3f963a
KW
7641 else { /* Not a char class */
7642 char *s; /* String to put in generated EXACT node */
fda99bee 7643 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7644 char *endchar; /* Points to '.' or '}' ending cur char in the input
7645 stream */
7646
2c2b7f86
KW
7647 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7648 : (LOC)
7649 ? EXACTFL
7650 : UNI_SEMANTICS
7651 ? EXACTFU
7652 : EXACTF));
ff3f963a
KW
7653 s= STRING(ret);
7654
7655 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7656 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7657 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7658 * limit are dropped. The limit could be relaxed should it become
7659 * desirable by reparsing this as (?:\N{NAME}), so could generate
7660 * multiple EXACT nodes, as is done for just regular input. But this
7661 * is primarily a named character, and not intended to be a huge long
7662 * string, so 255 bytes should be good enough */
7663 while (1) {
c3c41406 7664 STRLEN length_of_hex;
ff3f963a
KW
7665 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7666 | PERL_SCAN_DISALLOW_PREFIX
7667 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7668 UV cp; /* Ord of current character */
7669
7670 /* Code points are separated by dots. If none, there is only one
7671 * code point, and is terminated by the brace */
37820adc 7672 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7673
7674 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7675 length_of_hex = (STRLEN)(endchar - RExC_parse);
7676 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7677 if ( length_of_hex == 0
7678 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7679 {
c3c41406
KW
7680 RExC_parse += length_of_hex; /* Includes all the valid */
7681 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7682 ? UTF8SKIP(RExC_parse)
7683 : 1;
7684 /* Guard against malformed utf8 */
7685 if (RExC_parse >= endchar) RExC_parse = endchar;
7686 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7687 }
7688
7689 if (! FOLD) { /* Not folding, just append to the string */
7690 STRLEN unilen;
7691
7692 /* Quit before adding this character if would exceed limit */
7693 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7694
ff3f963a
KW
7695 unilen = reguni(pRExC_state, cp, s);
7696 if (unilen > 0) {
7697 s += unilen;
7698 len += unilen;
7699 }
7700 } else { /* Folding, output the folded equivalent */
7701 STRLEN foldlen,numlen;
7702 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7703 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7704
7705 /* Quit before exceeding size limit */
7706 if (len + foldlen > U8_MAX) break;
7707
7708 for (foldbuf = tmpbuf;
7709 foldlen;
7710 foldlen -= numlen)
7711 {
7712 cp = utf8_to_uvchr(foldbuf, &numlen);
7713 if (numlen > 0) {
7714 const STRLEN unilen = reguni(pRExC_state, cp, s);
7715 s += unilen;
7716 len += unilen;
7717 /* In EBCDIC the numlen and unilen can differ. */
7718 foldbuf += numlen;
7719 if (numlen >= foldlen)
7720 break;
7721 }
7722 else
7723 break; /* "Can't happen." */
7724 }
7725 }
7726
7727 /* Point to the beginning of the next character in the sequence. */
7728 RExC_parse = endchar + 1;
7729
7730 /* Quit if no more characters */
7731 if (RExC_parse >= endbrace) break;
7732 }
7733
7734
7735 if (SIZE_ONLY) {
7736 if (RExC_parse < endbrace) {
7737 ckWARNreg(RExC_parse - 1,
7738 "Using just the first characters returned by \\N{}");
7739 }
7740
7741 RExC_size += STR_SZ(len);
7742 } else {
7743 STR_LEN(ret) = len;
7744 RExC_emit += STR_SZ(len);
7745 }
7746
7747 RExC_parse = endbrace + 1;
7748
7749 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7750 with malformed in t/re/pat_advanced.t */
7751 RExC_parse --;
7752 Set_Node_Cur_Length(ret); /* MJD */
7753 nextchar(pRExC_state);
7754 }
7755
7756 return ret;
fc8cd66c
YO
7757}
7758
7759
9e08bc66
TS
7760/*
7761 * reg_recode
7762 *
7763 * It returns the code point in utf8 for the value in *encp.
7764 * value: a code value in the source encoding
7765 * encp: a pointer to an Encode object
7766 *
7767 * If the result from Encode is not a single character,
7768 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7769 */
7770STATIC UV
7771S_reg_recode(pTHX_ const char value, SV **encp)
7772{
7773 STRLEN numlen = 1;
59cd0e26 7774 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7775 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7776 const STRLEN newlen = SvCUR(sv);
7777 UV uv = UNICODE_REPLACEMENT;
7778
7918f24d
NC
7779 PERL_ARGS_ASSERT_REG_RECODE;
7780
9e08bc66
TS
7781 if (newlen)
7782 uv = SvUTF8(sv)
7783 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7784 : *(U8*)s;
7785
7786 if (!newlen || numlen != newlen) {
7787 uv = UNICODE_REPLACEMENT;
c86f7df5 7788 *encp = NULL;
9e08bc66
TS
7789 }
7790 return uv;
7791}
7792
fc8cd66c 7793
a687059c
LW
7794/*
7795 - regatom - the lowest level
ee9b8eae
YO
7796
7797 Try to identify anything special at the start of the pattern. If there
7798 is, then handle it as required. This may involve generating a single regop,
7799 such as for an assertion; or it may involve recursing, such as to
7800 handle a () structure.
7801
7802 If the string doesn't start with something special then we gobble up
7803 as much literal text as we can.
7804
7805 Once we have been able to handle whatever type of thing started the
7806 sequence, we return.
7807
7808 Note: we have to be careful with escapes, as they can be both literal
7809 and special, and in the case of \10 and friends can either, depending
486ec47a 7810 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
7811 escape sequences, with the one for handling literal escapes requiring
7812 a dummy entry for all of the special escapes that are actually handled
7813 by the other.
7814*/
7815
76e3520e 7816STATIC regnode *
3dab1dad 7817S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7818{
97aff369 7819 dVAR;
cbbf8932 7820 register regnode *ret = NULL;
a0d0e21e 7821 I32 flags;
45948336 7822 char *parse_start = RExC_parse;
980866de 7823 U8 op;
3dab1dad
YO
7824 GET_RE_DEBUG_FLAGS_DECL;
7825 DEBUG_PARSE("atom");
a0d0e21e
LW
7826 *flagp = WORST; /* Tentatively. */
7827
7918f24d 7828 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7829
a0d0e21e 7830tryagain:
f9a79580 7831 switch ((U8)*RExC_parse) {
a0d0e21e 7832 case '^':
830247a4
IZ
7833 RExC_seen_zerolen++;
7834 nextchar(pRExC_state);
bbe252da 7835 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7836 ret = reg_node(pRExC_state, MBOL);
bbe252da 7837 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7838 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7839 else
830247a4 7840 ret = reg_node(pRExC_state, BOL);
fac92740 7841 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7842 break;
7843 case '$':
830247a4 7844 nextchar(pRExC_state);
b81d288d 7845 if (*RExC_parse)
830247a4 7846 RExC_seen_zerolen++;
bbe252da 7847 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7848 ret = reg_node(pRExC_state, MEOL);
bbe252da 7849 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7850 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7851 else
830247a4 7852 ret = reg_node(pRExC_state, EOL);
fac92740 7853 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7854 break;
7855 case '.':
830247a4 7856 nextchar(pRExC_state);
bbe252da 7857 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7858 ret = reg_node(pRExC_state, SANY);
7859 else
7860 ret = reg_node(pRExC_state, REG_ANY);
7861 *flagp |= HASWIDTH|SIMPLE;
830247a4 7862 RExC_naughty++;
fac92740 7863 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7864 break;
7865 case '[':
b45f050a 7866 {
3dab1dad
YO
7867 char * const oregcomp_parse = ++RExC_parse;
7868 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7869 if (*RExC_parse != ']') {
7870 RExC_parse = oregcomp_parse;
b45f050a
JF
7871 vFAIL("Unmatched [");
7872 }
830247a4 7873 nextchar(pRExC_state);
a0d0e21e 7874 *flagp |= HASWIDTH|SIMPLE;
fac92740 7875 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7876 break;
b45f050a 7877 }
a0d0e21e 7878 case '(':
830247a4 7879 nextchar(pRExC_state);
3dab1dad 7880 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7881 if (ret == NULL) {
bf93d4cc 7882 if (flags & TRYAGAIN) {
830247a4 7883 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7884 /* Make parent create an empty node if needed. */
7885 *flagp |= TRYAGAIN;
7886 return(NULL);
7887 }
a0d0e21e 7888 goto tryagain;
bf93d4cc 7889 }
a0d0e21e
LW
7890 return(NULL);
7891 }
a3b492c3 7892 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7893 break;
7894 case '|':
7895 case ')':
7896 if (flags & TRYAGAIN) {
7897 *flagp |= TRYAGAIN;
7898 return NULL;
7899 }
b45f050a 7900 vFAIL("Internal urp");
a0d0e21e
LW
7901 /* Supposed to be caught earlier. */
7902 break;
85afd4ae 7903 case '{':
830247a4
IZ
7904 if (!regcurly(RExC_parse)) {
7905 RExC_parse++;
85afd4ae
CS
7906 goto defchar;
7907 }
7908 /* FALL THROUGH */
a0d0e21e
LW
7909 case '?':
7910 case '+':
7911 case '*':
830247a4 7912 RExC_parse++;
b45f050a 7913 vFAIL("Quantifier follows nothing");
a0d0e21e 7914 break;
ced7f090
KW
7915 case LATIN_SMALL_LETTER_SHARP_S:
7916 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7917 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7918#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7919#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7920 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7921#endif
a0a388a1 7922 do_foldchar:
56d400ed 7923 if (!LOC && FOLD) {
e64b1bd1 7924 U32 len,cp;
7cf3a6a3 7925 len=0; /* silence a spurious compiler warning */
56d400ed 7926 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7927 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7928 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7929 ret = reganode(pRExC_state, FOLDCHAR, cp);
7930 Set_Node_Length(ret, 1); /* MJD */
7931 nextchar(pRExC_state); /* kill whitespace under /x */
7932 return ret;
7933 }
7934 }
7935 goto outer_default;
a0d0e21e 7936 case '\\':
ee9b8eae
YO
7937 /* Special Escapes
7938
7939 This switch handles escape sequences that resolve to some kind
7940 of special regop and not to literal text. Escape sequnces that
7941 resolve to literal text are handled below in the switch marked
7942 "Literal Escapes".
7943
7944 Every entry in this switch *must* have a corresponding entry
7945 in the literal escape switch. However, the opposite is not
7946 required, as the default for this switch is to jump to the
7947 literal text handling code.
7948 */
a0a388a1 7949 switch ((U8)*++RExC_parse) {
ced7f090
KW
7950 case LATIN_SMALL_LETTER_SHARP_S:
7951 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7952 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 7953 goto do_foldchar;
ee9b8eae 7954 /* Special Escapes */
a0d0e21e 7955 case 'A':
830247a4
IZ
7956 RExC_seen_zerolen++;
7957 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7958 *flagp |= SIMPLE;
ee9b8eae 7959 goto finish_meta_pat;
a0d0e21e 7960 case 'G':
830247a4
IZ
7961 ret = reg_node(pRExC_state, GPOS);
7962 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7963 *flagp |= SIMPLE;
ee9b8eae
YO
7964 goto finish_meta_pat;
7965 case 'K':
7966 RExC_seen_zerolen++;
7967 ret = reg_node(pRExC_state, KEEPS);
7968 *flagp |= SIMPLE;
37923168
RGS
7969 /* XXX:dmq : disabling in-place substitution seems to
7970 * be necessary here to avoid cases of memory corruption, as
7971 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7972 */
7973 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7974 goto finish_meta_pat;
a0d0e21e 7975 case 'Z':
830247a4 7976 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7977 *flagp |= SIMPLE;
a1917ab9 7978 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7979 goto finish_meta_pat;
b85d18e9 7980 case 'z':
830247a4 7981 ret = reg_node(pRExC_state, EOS);
b85d18e9 7982 *flagp |= SIMPLE;
830247a4 7983 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7984 goto finish_meta_pat;
4a2d328f 7985 case 'C':
f33976b4
DB
7986 ret = reg_node(pRExC_state, CANY);
7987 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7988 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7989 goto finish_meta_pat;
a0ed51b3 7990 case 'X':
830247a4 7991 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7992 *flagp |= HASWIDTH;
ee9b8eae 7993 goto finish_meta_pat;
a0d0e21e 7994 case 'w':
980866de
KW
7995 switch (get_regex_charset(RExC_flags)) {
7996 case REGEX_LOCALE_CHARSET:
7997 op = ALNUML;
7998 break;
7999 case REGEX_UNICODE_CHARSET:
8000 op = ALNUMU;
8001 break;
cfaf538b
KW
8002 case REGEX_ASCII_RESTRICTED_CHARSET:
8003 op = ALNUMA;
8004 break;
980866de
KW
8005 case REGEX_DEPENDS_CHARSET:
8006 op = ALNUM;
8007 break;
8008 default:
8009 goto bad_charset;
a12cf05f 8010 }
980866de 8011 ret = reg_node(pRExC_state, op);
a0d0e21e 8012 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8013 goto finish_meta_pat;
a0d0e21e 8014 case 'W':
980866de
KW
8015 switch (get_regex_charset(RExC_flags)) {
8016 case REGEX_LOCALE_CHARSET:
8017 op = NALNUML;
8018 break;
8019 case REGEX_UNICODE_CHARSET:
8020 op = NALNUMU;
8021 break;
cfaf538b
KW
8022 case REGEX_ASCII_RESTRICTED_CHARSET:
8023 op = NALNUMA;
8024 break;
980866de
KW
8025 case REGEX_DEPENDS_CHARSET:
8026 op = NALNUM;
8027 break;
8028 default:
8029 goto bad_charset;
a12cf05f 8030 }
980866de 8031 ret = reg_node(pRExC_state, op);
a0d0e21e 8032 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8033 goto finish_meta_pat;
a0d0e21e 8034 case 'b':
830247a4
IZ
8035 RExC_seen_zerolen++;
8036 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8037 switch (get_regex_charset(RExC_flags)) {
8038 case REGEX_LOCALE_CHARSET:
8039 op = BOUNDL;
8040 break;
8041 case REGEX_UNICODE_CHARSET:
8042 op = BOUNDU;
8043 break;
cfaf538b
KW
8044 case REGEX_ASCII_RESTRICTED_CHARSET:
8045 op = BOUNDA;
8046 break;
63ac0dad
KW
8047 case REGEX_DEPENDS_CHARSET:
8048 op = BOUND;
8049 break;
8050 default:
8051 goto bad_charset;
a12cf05f 8052 }
63ac0dad 8053 ret = reg_node(pRExC_state, op);
b988e673 8054 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8055 *flagp |= SIMPLE;
ee9b8eae 8056 goto finish_meta_pat;
a0d0e21e 8057 case 'B':
830247a4
IZ
8058 RExC_seen_zerolen++;
8059 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8060 switch (get_regex_charset(RExC_flags)) {
8061 case REGEX_LOCALE_CHARSET:
8062 op = NBOUNDL;
8063 break;
8064 case REGEX_UNICODE_CHARSET:
8065 op = NBOUNDU;
8066 break;
cfaf538b
KW
8067 case REGEX_ASCII_RESTRICTED_CHARSET:
8068 op = NBOUNDA;
8069 break;
63ac0dad
KW
8070 case REGEX_DEPENDS_CHARSET:
8071 op = NBOUND;
8072 break;
8073 default:
8074 goto bad_charset;
a12cf05f 8075 }
63ac0dad 8076 ret = reg_node(pRExC_state, op);
b988e673 8077 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8078 *flagp |= SIMPLE;
ee9b8eae 8079 goto finish_meta_pat;
a0d0e21e 8080 case 's':
980866de
KW
8081 switch (get_regex_charset(RExC_flags)) {
8082 case REGEX_LOCALE_CHARSET:
8083 op = SPACEL;
8084 break;
8085 case REGEX_UNICODE_CHARSET:
8086 op = SPACEU;
8087 break;
cfaf538b
KW
8088 case REGEX_ASCII_RESTRICTED_CHARSET:
8089 op = SPACEA;
8090 break;
980866de
KW
8091 case REGEX_DEPENDS_CHARSET:
8092 op = SPACE;
8093 break;
8094 default:
8095 goto bad_charset;
a12cf05f 8096 }
980866de 8097 ret = reg_node(pRExC_state, op);
a0d0e21e 8098 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8099 goto finish_meta_pat;
a0d0e21e 8100 case 'S':
980866de
KW
8101 switch (get_regex_charset(RExC_flags)) {
8102 case REGEX_LOCALE_CHARSET:
8103 op = NSPACEL;
8104 break;
8105 case REGEX_UNICODE_CHARSET:
8106 op = NSPACEU;
8107 break;
cfaf538b
KW
8108 case REGEX_ASCII_RESTRICTED_CHARSET:
8109 op = NSPACEA;
8110 break;
980866de
KW
8111 case REGEX_DEPENDS_CHARSET:
8112 op = NSPACE;
8113 break;
8114 default:
8115 goto bad_charset;
a12cf05f 8116 }
980866de 8117 ret = reg_node(pRExC_state, op);
a0d0e21e 8118 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8119 goto finish_meta_pat;
a0d0e21e 8120 case 'd':
56ae17b4
KW
8121 switch (get_regex_charset(RExC_flags)) {
8122 case REGEX_LOCALE_CHARSET:
8123 op = DIGITL;
8124 break;
cfaf538b
KW
8125 case REGEX_ASCII_RESTRICTED_CHARSET:
8126 op = DIGITA;
8127 break;
56ae17b4
KW
8128 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8129 case REGEX_UNICODE_CHARSET:
8130 op = DIGIT;
8131 break;
8132 default:
8133 goto bad_charset;
6ab9ea91 8134 }
56ae17b4 8135 ret = reg_node(pRExC_state, op);
a0d0e21e 8136 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8137 goto finish_meta_pat;
a0d0e21e 8138 case 'D':
56ae17b4
KW
8139 switch (get_regex_charset(RExC_flags)) {
8140 case REGEX_LOCALE_CHARSET:
8141 op = NDIGITL;
8142 break;
cfaf538b
KW
8143 case REGEX_ASCII_RESTRICTED_CHARSET:
8144 op = NDIGITA;
8145 break;
56ae17b4
KW
8146 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8147 case REGEX_UNICODE_CHARSET:
8148 op = NDIGIT;
8149 break;
8150 default:
8151 goto bad_charset;
6ab9ea91 8152 }
56ae17b4 8153 ret = reg_node(pRExC_state, op);
a0d0e21e 8154 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8155 goto finish_meta_pat;
e1d1eefb
YO
8156 case 'R':
8157 ret = reg_node(pRExC_state, LNBREAK);
8158 *flagp |= HASWIDTH|SIMPLE;
8159 goto finish_meta_pat;
8160 case 'h':
8161 ret = reg_node(pRExC_state, HORIZWS);
8162 *flagp |= HASWIDTH|SIMPLE;
8163 goto finish_meta_pat;
8164 case 'H':
8165 ret = reg_node(pRExC_state, NHORIZWS);
8166 *flagp |= HASWIDTH|SIMPLE;
8167 goto finish_meta_pat;
ee9b8eae 8168 case 'v':
e1d1eefb
YO
8169 ret = reg_node(pRExC_state, VERTWS);
8170 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
8171 goto finish_meta_pat;
8172 case 'V':
e1d1eefb
YO
8173 ret = reg_node(pRExC_state, NVERTWS);
8174 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8175 finish_meta_pat:
830247a4 8176 nextchar(pRExC_state);
fac92740 8177 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 8178 break;
a14b48bc
LW
8179 case 'p':
8180 case 'P':
3568d838 8181 {
3dab1dad 8182 char* const oldregxend = RExC_end;
d008bc60 8183#ifdef DEBUGGING
ccb2c380 8184 char* parse_start = RExC_parse - 2;
d008bc60 8185#endif
a14b48bc 8186
830247a4 8187 if (RExC_parse[1] == '{') {
3568d838 8188 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
8189 RExC_end = strchr(RExC_parse, '}');
8190 if (!RExC_end) {
3dab1dad 8191 const U8 c = (U8)*RExC_parse;
830247a4
IZ
8192 RExC_parse += 2;
8193 RExC_end = oldregxend;
0da60cf5 8194 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 8195 }
830247a4 8196 RExC_end++;
a14b48bc 8197 }
af6f566e 8198 else {
830247a4 8199 RExC_end = RExC_parse + 2;
af6f566e
HS
8200 if (RExC_end > oldregxend)
8201 RExC_end = oldregxend;
8202 }
830247a4 8203 RExC_parse--;
a14b48bc 8204
3dab1dad 8205 ret = regclass(pRExC_state,depth+1);
a14b48bc 8206
830247a4
IZ
8207 RExC_end = oldregxend;
8208 RExC_parse--;
ccb2c380
MP
8209
8210 Set_Node_Offset(ret, parse_start + 2);
8211 Set_Node_Cur_Length(ret);
830247a4 8212 nextchar(pRExC_state);
a14b48bc
LW
8213 *flagp |= HASWIDTH|SIMPLE;
8214 }
8215 break;
fc8cd66c 8216 case 'N':
afefe6bf 8217 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
8218 multicharacter. join_exact() will join them up later on.
8219 Also this makes sure that things like /\N{BLAH}+/ and
8220 \N{BLAH} being multi char Just Happen. dmq*/
8221 ++RExC_parse;
afefe6bf 8222 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 8223 break;
0a4db386 8224 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 8225 parse_named_seq:
81714fb9
YO
8226 {
8227 char ch= RExC_parse[1];
1f1031fe
YO
8228 if (ch != '<' && ch != '\'' && ch != '{') {
8229 RExC_parse++;
8230 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 8231 } else {
1f1031fe
YO
8232 /* this pretty much dupes the code for (?P=...) in reg(), if
8233 you change this make sure you change that */
81714fb9 8234 char* name_start = (RExC_parse += 2);
2eccd3b2 8235 U32 num = 0;
0a4db386
YO
8236 SV *sv_dat = reg_scan_name(pRExC_state,
8237 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 8238 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 8239 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
8240 vFAIL2("Sequence %.3s... not terminated",parse_start);
8241
8242 if (!SIZE_ONLY) {
8243 num = add_data( pRExC_state, 1, "S" );
8244 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8245 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
8246 }
8247
81714fb9
YO
8248 RExC_sawback = 1;
8249 ret = reganode(pRExC_state,
4444fd9f
KW
8250 ((! FOLD)
8251 ? NREF
cfaf538b 8252 : (AT_LEAST_UNI_SEMANTICS)
4444fd9f
KW
8253 ? NREFFU
8254 : (LOC)
8255 ? NREFFL
8256 : NREFF),
8257 num);
81714fb9 8258 *flagp |= HASWIDTH;
1f1031fe 8259
81714fb9
YO
8260 /* override incorrect value set in reganode MJD */
8261 Set_Node_Offset(ret, parse_start+1);
8262 Set_Node_Cur_Length(ret); /* MJD */
8263 nextchar(pRExC_state);
1f1031fe 8264
81714fb9
YO
8265 }
8266 break;
1f1031fe 8267 }
2bf803e2 8268 case 'g':
a0d0e21e
LW
8269 case '1': case '2': case '3': case '4':
8270 case '5': case '6': case '7': case '8': case '9':
8271 {
c74340f9 8272 I32 num;
2bf803e2
YO
8273 bool isg = *RExC_parse == 'g';
8274 bool isrel = 0;
8275 bool hasbrace = 0;
8276 if (isg) {
c74340f9 8277 RExC_parse++;
2bf803e2
YO
8278 if (*RExC_parse == '{') {
8279 RExC_parse++;
8280 hasbrace = 1;
8281 }
8282 if (*RExC_parse == '-') {
8283 RExC_parse++;
8284 isrel = 1;
8285 }
1f1031fe
YO
8286 if (hasbrace && !isDIGIT(*RExC_parse)) {
8287 if (isrel) RExC_parse--;
8288 RExC_parse -= 2;
8289 goto parse_named_seq;
8290 } }
c74340f9 8291 num = atoi(RExC_parse);
b72d83b2
RGS
8292 if (isg && num == 0)
8293 vFAIL("Reference to invalid group 0");
c74340f9 8294 if (isrel) {
5624f11d 8295 num = RExC_npar - num;
c74340f9
YO
8296 if (num < 1)
8297 vFAIL("Reference to nonexistent or unclosed group");
8298 }
2bf803e2 8299 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
8300 goto defchar;
8301 else {
3dab1dad 8302 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
8303 while (isDIGIT(*RExC_parse))
8304 RExC_parse++;
1f1031fe
YO
8305 if (parse_start == RExC_parse - 1)
8306 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
8307 if (hasbrace) {
8308 if (*RExC_parse != '}')
8309 vFAIL("Unterminated \\g{...} pattern");
8310 RExC_parse++;
8311 }
c74340f9
YO
8312 if (!SIZE_ONLY) {
8313 if (num > (I32)RExC_rx->nparens)
8314 vFAIL("Reference to nonexistent group");
c74340f9 8315 }
830247a4 8316 RExC_sawback = 1;
eb160463 8317 ret = reganode(pRExC_state,
4444fd9f
KW
8318 ((! FOLD)
8319 ? REF
cfaf538b 8320 : (AT_LEAST_UNI_SEMANTICS)
4444fd9f
KW
8321 ? REFFU
8322 : (LOC)
8323 ? REFFL
8324 : REFF),
8325 num);
a0d0e21e 8326 *flagp |= HASWIDTH;
2af232bd 8327
fac92740 8328 /* override incorrect value set in reganode MJD */
2af232bd 8329 Set_Node_Offset(ret, parse_start+1);
fac92740 8330 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
8331 RExC_parse--;
8332 nextchar(pRExC_state);
a0d0e21e
LW
8333 }
8334 }
8335 break;
8336 case '\0':
830247a4 8337 if (RExC_parse >= RExC_end)
b45f050a 8338 FAIL("Trailing \\");
a0d0e21e
LW
8339 /* FALL THROUGH */
8340 default:
a0288114 8341 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 8342 back into the quick-grab loop below */
45948336 8343 parse_start--;
a0d0e21e
LW
8344 goto defchar;
8345 }
8346 break;
4633a7c4
LW
8347
8348 case '#':
bbe252da 8349 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 8350 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
8351 goto tryagain;
8352 }
8353 /* FALL THROUGH */
8354
f9a79580
RGS
8355 default:
8356 outer_default:{
ba210ebe 8357 register STRLEN len;
58ae7d3f 8358 register UV ender;
a0d0e21e 8359 register char *p;
3dab1dad 8360 char *s;
80aecb99 8361 STRLEN foldlen;
89ebb4a3 8362 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
8363
8364 parse_start = RExC_parse - 1;
a0d0e21e 8365
830247a4 8366 RExC_parse++;
a0d0e21e
LW
8367
8368 defchar:
58ae7d3f 8369 ender = 0;
eb160463 8370 ret = reg_node(pRExC_state,
2c2b7f86
KW
8371 (U8) ((! FOLD) ? EXACT
8372 : (LOC)
8373 ? EXACTFL
cfaf538b 8374 : (AT_LEAST_UNI_SEMANTICS)
2c2b7f86
KW
8375 ? EXACTFU
8376 : EXACTF)
8377 );
cd439c50 8378 s = STRING(ret);
830247a4
IZ
8379 for (len = 0, p = RExC_parse - 1;
8380 len < 127 && p < RExC_end;
a0d0e21e
LW
8381 len++)
8382 {
3dab1dad 8383 char * const oldp = p;
5b5a24f7 8384
bbe252da 8385 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 8386 p = regwhite( pRExC_state, p );
f9a79580 8387 switch ((U8)*p) {
ced7f090
KW
8388 case LATIN_SMALL_LETTER_SHARP_S:
8389 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8390 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 8391 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 8392 goto normal_default;
a0d0e21e
LW
8393 case '^':
8394 case '$':
8395 case '.':
8396 case '[':
8397 case '(':
8398 case ')':
8399 case '|':
8400 goto loopdone;
8401 case '\\':
ee9b8eae
YO
8402 /* Literal Escapes Switch
8403
8404 This switch is meant to handle escape sequences that
8405 resolve to a literal character.
8406
8407 Every escape sequence that represents something
8408 else, like an assertion or a char class, is handled
8409 in the switch marked 'Special Escapes' above in this
8410 routine, but also has an entry here as anything that
8411 isn't explicitly mentioned here will be treated as
8412 an unescaped equivalent literal.
8413 */
8414
a0a388a1 8415 switch ((U8)*++p) {
ee9b8eae 8416 /* These are all the special escapes. */
ced7f090
KW
8417 case LATIN_SMALL_LETTER_SHARP_S:
8418 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8419 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
8420 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8421 goto normal_default;
ee9b8eae
YO
8422 case 'A': /* Start assertion */
8423 case 'b': case 'B': /* Word-boundary assertion*/
8424 case 'C': /* Single char !DANGEROUS! */
8425 case 'd': case 'D': /* digit class */
8426 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 8427 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
8428 case 'k': case 'K': /* named backref, keep marker */
8429 case 'N': /* named char sequence */
38a44b82 8430 case 'p': case 'P': /* Unicode property */
e1d1eefb 8431 case 'R': /* LNBREAK */
ee9b8eae 8432 case 's': case 'S': /* space class */
e1d1eefb 8433 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
8434 case 'w': case 'W': /* word class */
8435 case 'X': /* eXtended Unicode "combining character sequence" */
8436 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
8437 --p;
8438 goto loopdone;
ee9b8eae
YO
8439
8440 /* Anything after here is an escape that resolves to a
8441 literal. (Except digits, which may or may not)
8442 */
a0d0e21e
LW
8443 case 'n':
8444 ender = '\n';
8445 p++;
a687059c 8446 break;
a0d0e21e
LW
8447 case 'r':
8448 ender = '\r';
8449 p++;
a687059c 8450 break;
a0d0e21e
LW
8451 case 't':
8452 ender = '\t';
8453 p++;
a687059c 8454 break;
a0d0e21e
LW
8455 case 'f':
8456 ender = '\f';
8457 p++;
a687059c 8458 break;
a0d0e21e 8459 case 'e':
c7f1f016 8460 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 8461 p++;
a687059c 8462 break;
a0d0e21e 8463 case 'a':
c7f1f016 8464 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 8465 p++;
a687059c 8466 break;
f0a2b745
KW
8467 case 'o':
8468 {
8469 STRLEN brace_len = len;
00c0cb6d 8470 UV result;
454155d9
KW
8471 const char* error_msg;
8472
8473 bool valid = grok_bslash_o(p,
8474 &result,
8475 &brace_len,
8476 &error_msg,
8477 1);
8478 p += brace_len;
8479 if (! valid) {
8480 RExC_parse = p; /* going to die anyway; point
8481 to exact spot of failure */
f0a2b745
KW
8482 vFAIL(error_msg);
8483 }
00c0cb6d
DG
8484 else
8485 {
8486 ender = result;
8487 }
f0a2b745
KW
8488 if (PL_encoding && ender < 0x100) {
8489 goto recode_encoding;
8490 }
8491 if (ender > 0xff) {
62fed28b 8492 REQUIRE_UTF8;
f0a2b745
KW
8493 }
8494 break;
8495 }
a0d0e21e 8496 case 'x':
a0ed51b3 8497 if (*++p == '{') {
1df70142 8498 char* const e = strchr(p, '}');
b81d288d 8499
b45f050a 8500 if (!e) {
830247a4 8501 RExC_parse = p + 1;
b45f050a
JF
8502 vFAIL("Missing right brace on \\x{}");
8503 }
de5f0749 8504 else {
a4c04bdc
NC
8505 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8506 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 8507 STRLEN numlen = e - p - 1;
53305cf1 8508 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 8509 if (ender > 0xff)
62fed28b 8510 REQUIRE_UTF8;
a0ed51b3
LW
8511 p = e + 1;
8512 }
a0ed51b3
LW
8513 }
8514 else {
a4c04bdc 8515 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 8516 STRLEN numlen = 2;
53305cf1 8517 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
8518 p += numlen;
8519 }
9e08bc66
TS
8520 if (PL_encoding && ender < 0x100)
8521 goto recode_encoding;
a687059c 8522 break;
a0d0e21e
LW
8523 case 'c':
8524 p++;
f9d13529 8525 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 8526 break;
a0d0e21e
LW
8527 case '0': case '1': case '2': case '3':case '4':
8528 case '5': case '6': case '7': case '8':case '9':
8529 if (*p == '0' ||
ca67da41 8530 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
8531 {
8532 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 8533 STRLEN numlen = 3;
53305cf1 8534 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 8535 if (ender > 0xff) {
62fed28b 8536 REQUIRE_UTF8;
609122bd 8537 }
a0d0e21e
LW
8538 p += numlen;
8539 }
8540 else {
8541 --p;
8542 goto loopdone;
a687059c 8543 }
9e08bc66
TS
8544 if (PL_encoding && ender < 0x100)
8545 goto recode_encoding;
8546 break;
8547 recode_encoding:
8548 {
8549 SV* enc = PL_encoding;
8550 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
8551 if (!enc && SIZE_ONLY)
8552 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 8553 REQUIRE_UTF8;
9e08bc66 8554 }
a687059c 8555 break;
a0d0e21e 8556 case '\0':
830247a4 8557 if (p >= RExC_end)
b45f050a 8558 FAIL("Trailing \\");
a687059c 8559 /* FALL THROUGH */
a0d0e21e 8560 default:
668c081a
NC
8561 if (!SIZE_ONLY&& isALPHA(*p))
8562 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 8563 goto normal_default;
a0d0e21e
LW
8564 }
8565 break;
a687059c 8566 default:
a0ed51b3 8567 normal_default:
fd400ab9 8568 if (UTF8_IS_START(*p) && UTF) {
1df70142 8569 STRLEN numlen;
5e12f4fb 8570 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 8571 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
8572 p += numlen;
8573 }
8574 else
8575 ender = *p++;
a0d0e21e 8576 break;
a687059c 8577 }
bcdf7404
YO
8578 if ( RExC_flags & RXf_PMf_EXTENDED)
8579 p = regwhite( pRExC_state, p );
60a8b682
JH
8580 if (UTF && FOLD) {
8581 /* Prime the casefolded buffer. */
ac7e0132 8582 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 8583 }
bcdf7404 8584 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
8585 if (len)
8586 p = oldp;
16ea2a2e 8587 else if (UTF) {
80aecb99 8588 if (FOLD) {
60a8b682 8589 /* Emit all the Unicode characters. */
1df70142 8590 STRLEN numlen;
80aecb99
JH
8591 for (foldbuf = tmpbuf;
8592 foldlen;
8593 foldlen -= numlen) {
8594 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8595 if (numlen > 0) {
71207a34 8596 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8597 s += unilen;
8598 len += unilen;
8599 /* In EBCDIC the numlen
8600 * and unilen can differ. */
9dc45d57 8601 foldbuf += numlen;
47654450
JH
8602 if (numlen >= foldlen)
8603 break;
9dc45d57
JH
8604 }
8605 else
8606 break; /* "Can't happen." */
80aecb99
JH
8607 }
8608 }
8609 else {
71207a34 8610 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8611 if (unilen > 0) {
0ebc6274
JH
8612 s += unilen;
8613 len += unilen;
9dc45d57 8614 }
80aecb99 8615 }
a0ed51b3 8616 }
a0d0e21e
LW
8617 else {
8618 len++;
eb160463 8619 REGC((char)ender, s++);
a0d0e21e
LW
8620 }
8621 break;
a687059c 8622 }
16ea2a2e 8623 if (UTF) {
80aecb99 8624 if (FOLD) {
60a8b682 8625 /* Emit all the Unicode characters. */
1df70142 8626 STRLEN numlen;
80aecb99
JH
8627 for (foldbuf = tmpbuf;
8628 foldlen;
8629 foldlen -= numlen) {
8630 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8631 if (numlen > 0) {
71207a34 8632 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8633 len += unilen;
8634 s += unilen;
8635 /* In EBCDIC the numlen
8636 * and unilen can differ. */
9dc45d57 8637 foldbuf += numlen;
47654450
JH
8638 if (numlen >= foldlen)
8639 break;
9dc45d57
JH
8640 }
8641 else
8642 break;
80aecb99
JH
8643 }
8644 }
8645 else {
71207a34 8646 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8647 if (unilen > 0) {
0ebc6274
JH
8648 s += unilen;
8649 len += unilen;
9dc45d57 8650 }
80aecb99
JH
8651 }
8652 len--;
a0ed51b3
LW
8653 }
8654 else
eb160463 8655 REGC((char)ender, s++);
a0d0e21e
LW
8656 }
8657 loopdone:
830247a4 8658 RExC_parse = p - 1;
fac92740 8659 Set_Node_Cur_Length(ret); /* MJD */
830247a4 8660 nextchar(pRExC_state);
793db0cb
JH
8661 {
8662 /* len is STRLEN which is unsigned, need to copy to signed */
8663 IV iv = len;
8664 if (iv < 0)
8665 vFAIL("Internal disaster");
8666 }
a0d0e21e
LW
8667 if (len > 0)
8668 *flagp |= HASWIDTH;
090f7165 8669 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 8670 *flagp |= SIMPLE;
3dab1dad 8671
cd439c50 8672 if (SIZE_ONLY)
830247a4 8673 RExC_size += STR_SZ(len);
3dab1dad
YO
8674 else {
8675 STR_LEN(ret) = len;
830247a4 8676 RExC_emit += STR_SZ(len);
07be1b83 8677 }
3dab1dad 8678 }
a0d0e21e
LW
8679 break;
8680 }
a687059c 8681
a0d0e21e 8682 return(ret);
980866de
KW
8683
8684/* Jumped to when an unrecognized character set is encountered */
8685bad_charset:
8686 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8687 return(NULL);
a687059c
LW
8688}
8689
873ef191 8690STATIC char *
bcdf7404 8691S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 8692{
bcdf7404 8693 const char *e = RExC_end;
7918f24d
NC
8694
8695 PERL_ARGS_ASSERT_REGWHITE;
8696
5b5a24f7
CS
8697 while (p < e) {
8698 if (isSPACE(*p))
8699 ++p;
8700 else if (*p == '#') {
bcdf7404 8701 bool ended = 0;
5b5a24f7 8702 do {
bcdf7404
YO
8703 if (*p++ == '\n') {
8704 ended = 1;
8705 break;
8706 }
8707 } while (p < e);
8708 if (!ended)
8709 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8710 }
8711 else
8712 break;
8713 }
8714 return p;
8715}
8716
b8c5462f
JH
8717/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8718 Character classes ([:foo:]) can also be negated ([:^foo:]).
8719 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8720 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8721 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8722
8723#define POSIXCC_DONE(c) ((c) == ':')
8724#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8725#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8726
b8c5462f 8727STATIC I32
830247a4 8728S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8729{
97aff369 8730 dVAR;
936ed897 8731 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8732
7918f24d
NC
8733 PERL_ARGS_ASSERT_REGPPOSIXCC;
8734
830247a4 8735 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8736 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8737 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8738 const char c = UCHARAT(RExC_parse);
097eb12c 8739 char* const s = RExC_parse++;
b81d288d 8740
9a86a77b 8741 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8742 RExC_parse++;
8743 if (RExC_parse == RExC_end)
620e46c5 8744 /* Grandfather lone [:, [=, [. */
830247a4 8745 RExC_parse = s;
620e46c5 8746 else {
3dab1dad 8747 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
8748 assert(*t == c);
8749
9a86a77b 8750 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 8751 const char *posixcc = s + 1;
830247a4 8752 RExC_parse++; /* skip over the ending ] */
3dab1dad 8753
b8c5462f 8754 if (*s == ':') {
1df70142
AL
8755 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8756 const I32 skip = t - posixcc;
80916619
NC
8757
8758 /* Initially switch on the length of the name. */
8759 switch (skip) {
8760 case 4:
3dab1dad
YO
8761 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8762 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 8763 break;
80916619
NC
8764 case 5:
8765 /* Names all of length 5. */
8766 /* alnum alpha ascii blank cntrl digit graph lower
8767 print punct space upper */
8768 /* Offset 4 gives the best switch position. */
8769 switch (posixcc[4]) {
8770 case 'a':
3dab1dad
YO
8771 if (memEQ(posixcc, "alph", 4)) /* alpha */
8772 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
8773 break;
8774 case 'e':
3dab1dad
YO
8775 if (memEQ(posixcc, "spac", 4)) /* space */
8776 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
8777 break;
8778 case 'h':
3dab1dad
YO
8779 if (memEQ(posixcc, "grap", 4)) /* graph */
8780 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
8781 break;
8782 case 'i':
3dab1dad
YO
8783 if (memEQ(posixcc, "asci", 4)) /* ascii */
8784 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
8785 break;
8786 case 'k':
3dab1dad
YO
8787 if (memEQ(posixcc, "blan", 4)) /* blank */
8788 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
8789 break;
8790 case 'l':
3dab1dad
YO
8791 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8792 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
8793 break;
8794 case 'm':
3dab1dad
YO
8795 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8796 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8797 break;
8798 case 'r':
3dab1dad
YO
8799 if (memEQ(posixcc, "lowe", 4)) /* lower */
8800 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8801 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8802 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8803 break;
8804 case 't':
3dab1dad
YO
8805 if (memEQ(posixcc, "digi", 4)) /* digit */
8806 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8807 else if (memEQ(posixcc, "prin", 4)) /* print */
8808 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8809 else if (memEQ(posixcc, "punc", 4)) /* punct */
8810 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8811 break;
b8c5462f
JH
8812 }
8813 break;
80916619 8814 case 6:
3dab1dad
YO
8815 if (memEQ(posixcc, "xdigit", 6))
8816 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8817 break;
8818 }
80916619
NC
8819
8820 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8821 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8822 t - s - 1, s + 1);
80916619
NC
8823 assert (posixcc[skip] == ':');
8824 assert (posixcc[skip+1] == ']');
b45f050a 8825 } else if (!SIZE_ONLY) {
b8c5462f 8826 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8827
830247a4 8828 /* adjust RExC_parse so the warning shows after
b45f050a 8829 the class closes */
9a86a77b 8830 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8831 RExC_parse++;
b45f050a
JF
8832 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8833 }
b8c5462f
JH
8834 } else {
8835 /* Maternal grandfather:
8836 * "[:" ending in ":" but not in ":]" */
830247a4 8837 RExC_parse = s;
767d463e 8838 }
620e46c5
JH
8839 }
8840 }
8841
b8c5462f
JH
8842 return namedclass;
8843}
8844
8845STATIC void
830247a4 8846S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8847{
97aff369 8848 dVAR;
7918f24d
NC
8849
8850 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8851
3dab1dad 8852 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8853 const char *s = RExC_parse;
8854 const char c = *s++;
b8c5462f 8855
3dab1dad 8856 while (isALNUM(*s))
b8c5462f
JH
8857 s++;
8858 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8859 ckWARN3reg(s+2,
8860 "POSIX syntax [%c %c] belongs inside character classes",
8861 c, c);
b45f050a
JF
8862
8863 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8864 if (POSIXCC_NOTYET(c)) {
830247a4 8865 /* adjust RExC_parse so the error shows after
b45f050a 8866 the class closes */
9a86a77b 8867 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8868 NOOP;
b45f050a
JF
8869 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8870 }
b8c5462f
JH
8871 }
8872 }
620e46c5
JH
8873}
8874
003331de
KW
8875/* No locale test, and always Unicode semantics */
8876#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8877ANYOF_##NAME: \
8878 for (value = 0; value < 256; value++) \
8879 if (TEST) \
56ca34ca 8880 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
003331de
KW
8881 yesno = '+'; \
8882 what = WORD; \
8883 break; \
8884case ANYOF_N##NAME: \
8885 for (value = 0; value < 256; value++) \
8886 if (!TEST) \
56ca34ca 8887 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
003331de
KW
8888 yesno = '!'; \
8889 what = WORD; \
e1d1eefb 8890 break
89836f1f 8891
a12cf05f
KW
8892/* Like the above, but there are differences if we are in uni-8-bit or not, so
8893 * there are two tests passed in, to use depending on that. There aren't any
8894 * cases where the label is different from the name, so no need for that
8895 * parameter */
f952827c 8896#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
003331de
KW
8897ANYOF_##NAME: \
8898 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8899 else if (UNI_SEMANTICS) { \
8900 for (value = 0; value < 256; value++) { \
f952827c 8901 if (TEST_8(value)) stored += \
56ca34ca 8902 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
003331de
KW
8903 } \
8904 } \
8905 else { \
8906 for (value = 0; value < 128; value++) { \
f952827c 8907 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
003331de 8908 S_set_regclass_bit(aTHX_ pRExC_state, ret, \
56ca34ca 8909 (U8) UNI_TO_NATIVE(value), &nonbitmap); \
003331de
KW
8910 } \
8911 } \
8912 yesno = '+'; \
8913 what = WORD; \
8914 break; \
8915case ANYOF_N##NAME: \
8916 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8917 else if (UNI_SEMANTICS) { \
8918 for (value = 0; value < 256; value++) { \
f952827c 8919 if (! TEST_8(value)) stored += \
56ca34ca 8920 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
003331de
KW
8921 } \
8922 } \
8923 else { \
8924 for (value = 0; value < 128; value++) { \
f952827c 8925 if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit( \
56ca34ca 8926 aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
003331de 8927 } \
cfaf538b
KW
8928 if (ASCII_RESTRICTED) { \
8929 for (value = 128; value < 256; value++) { \
f952827c 8930 stored += S_set_regclass_bit( \
56ca34ca 8931 aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
cfaf538b
KW
8932 } \
8933 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
8934 } \
8935 else { \
8936 /* For a non-ut8 target string with DEPENDS semantics, all above \
8937 * ASCII Latin1 code points match the complement of any of the \
8938 * classes. But in utf8, they have their Unicode semantics, so \
8939 * can't just set them in the bitmap, or else regexec.c will think \
8940 * they matched when they shouldn't. */ \
8941 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
8942 } \
003331de
KW
8943 } \
8944 yesno = '!'; \
8945 what = WORD; \
a12cf05f
KW
8946 break
8947
da7fcca4
YO
8948/*
8949 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8950 so that it is possible to override the option here without having to
8951 rebuild the entire core. as we are required to do if we change regcomp.h
8952 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8953*/
8954#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8955#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8956#endif
8957
8958#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8959#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8960#else
8961#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8962#endif
8963
2283d326 8964STATIC U8
56ca34ca 8965S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
2283d326
KW
8966{
8967
8968 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8969 * Locale folding is done at run-time, so this function should not be
8970 * called for nodes that are for locales.
8971 *
8972 * This function simply sets the bit corresponding to the fold of the input
8973 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8974 * 'F' is 'f'.
8975 *
8976 * It also sets any necessary flags, and returns the number of bits that
8977 * actually changed from 0 to 1 */
8978
8979 U8 stored = 0;
2283d326
KW
8980 U8 fold;
8981
cfaf538b 8982 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2283d326
KW
8983 : PL_fold[value];
8984
8985 /* It assumes the bit for 'value' has already been set */
8986 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8987 ANYOF_BITMAP_SET(node, fold);
8988 stored++;
8989 }
f56b6394
KW
8990 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8991 || (! UNI_SEMANTICS
8992 && ! isASCII(value)
8993 && PL_fold_latin1[value] != value))
2283d326
KW
8994 { /* A character that has a fold outside of Latin1 matches outside the
8995 bitmap, but only when the target string is utf8. Similarly when we
8996 don't have unicode semantics for the above ASCII Latin-1 characters,
8997 and they have a fold, they should match if the target is utf8, and
8998 not otherwise */
56ca34ca
KW
8999 if (! *nonbitmap_ptr) {
9000 *nonbitmap_ptr = _new_invlist(2);
9001 }
9002 *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
2283d326
KW
9003 ANYOF_FLAGS(node) |= ANYOF_UTF8;
9004 }
9005
9006 return stored;
9007}
9008
9009
9010PERL_STATIC_INLINE U8
56ca34ca 9011S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
2283d326
KW
9012{
9013 /* This inline function sets a bit in the bitmap if not already set, and if
9014 * appropriate, its fold, returning the number of bits that actually
9015 * changed from 0 to 1 */
9016
9017 U8 stored;
9018
9019 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9020 return 0;
9021 }
9022
9023 ANYOF_BITMAP_SET(node, value);
9024 stored = 1;
9025
9026 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
56ca34ca 9027 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr);
2283d326
KW
9028 }
9029
9030 return stored;
9031}
9032
7f6f358c
YO
9033/*
9034 parse a class specification and produce either an ANYOF node that
89836f1f
YO
9035 matches the pattern or if the pattern matches a single char only and
9036 that char is < 256 and we are case insensitive then we produce an
9037 EXACT node instead.
7f6f358c 9038*/
89836f1f 9039
76e3520e 9040STATIC regnode *
3dab1dad 9041S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 9042{
97aff369 9043 dVAR;
9a86a77b 9044 register UV nextvalue;
3568d838 9045 register IV prevvalue = OOB_UNICODE;
ffc61ed2 9046 register IV range = 0;
e1d1eefb 9047 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 9048 register regnode *ret;
ba210ebe 9049 STRLEN numlen;
ffc61ed2 9050 IV namedclass;
cbbf8932 9051 char *rangebegin = NULL;
936ed897 9052 bool need_class = 0;
c445ea15 9053 SV *listsv = NULL;
ffc61ed2 9054 UV n;
56ca34ca 9055 HV* nonbitmap = NULL;
cbbf8932 9056 AV* unicode_alternate = NULL;
1b2d223b
JH
9057#ifdef EBCDIC
9058 UV literal_endpoint = 0;
9059#endif
ffc130aa 9060 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 9061
3dab1dad 9062 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 9063 case we need to change the emitted regop to an EXACT. */
07be1b83 9064 const char * orig_parse = RExC_parse;
72f13be8 9065 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9066
9067 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
9068#ifndef DEBUGGING
9069 PERL_UNUSED_ARG(depth);
9070#endif
72f13be8 9071
3dab1dad 9072 DEBUG_PARSE("clas");
7f6f358c
YO
9073
9074 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
9075 ret = reganode(pRExC_state, ANYOF, 0);
9076
56ca34ca
KW
9077
9078 if (!SIZE_ONLY) {
ffc61ed2 9079 ANYOF_FLAGS(ret) = 0;
56ca34ca 9080 }
ffc61ed2 9081
9a86a77b 9082 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
9083 RExC_naughty++;
9084 RExC_parse++;
9085 if (!SIZE_ONLY)
9086 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9087 }
a0d0e21e 9088
73060fc4 9089 if (SIZE_ONLY) {
830247a4 9090 RExC_size += ANYOF_SKIP;
3a15e693
KW
9091#ifdef ANYOF_ADD_LOC_SKIP
9092 if (LOC) {
9093 RExC_size += ANYOF_ADD_LOC_SKIP;
9094 }
9095#endif
73060fc4
JH
9096 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9097 }
936ed897 9098 else {
830247a4 9099 RExC_emit += ANYOF_SKIP;
3a15e693 9100 if (LOC) {
936ed897 9101 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693
KW
9102#ifdef ANYOF_ADD_LOC_SKIP
9103 RExC_emit += ANYOF_ADD_LOC_SKIP;
9104#endif
9105 }
ffc61ed2 9106 ANYOF_BITMAP_ZERO(ret);
396482e1 9107 listsv = newSVpvs("# comment\n");
a0d0e21e 9108 }
b8c5462f 9109
9a86a77b
JH
9110 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9111
b938889d 9112 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 9113 checkposixcc(pRExC_state);
b8c5462f 9114
f064b6ad
HS
9115 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9116 if (UCHARAT(RExC_parse) == ']')
9117 goto charclassloop;
ffc61ed2 9118
fc8cd66c 9119parseit:
9a86a77b 9120 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
9121
9122 charclassloop:
9123
9124 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9125
73b437c8 9126 if (!range)
830247a4 9127 rangebegin = RExC_parse;
ffc61ed2 9128 if (UTF) {
5e12f4fb 9129 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 9130 RExC_end - RExC_parse,
9f7f3913 9131 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9132 RExC_parse += numlen;
9133 }
9134 else
9135 value = UCHARAT(RExC_parse++);
7f6f358c 9136
9a86a77b
JH
9137 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9138 if (value == '[' && POSIXCC(nextvalue))
830247a4 9139 namedclass = regpposixcc(pRExC_state, value);
620e46c5 9140 else if (value == '\\') {
ffc61ed2 9141 if (UTF) {
5e12f4fb 9142 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 9143 RExC_end - RExC_parse,
9f7f3913 9144 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9145 RExC_parse += numlen;
9146 }
9147 else
9148 value = UCHARAT(RExC_parse++);
470c3474 9149 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 9150 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
9151 * be a problem later if we want switch on Unicode.
9152 * A similar issue a little bit later when switching on
9153 * namedclass. --jhi */
ffc61ed2 9154 switch ((I32)value) {
b8c5462f
JH
9155 case 'w': namedclass = ANYOF_ALNUM; break;
9156 case 'W': namedclass = ANYOF_NALNUM; break;
9157 case 's': namedclass = ANYOF_SPACE; break;
9158 case 'S': namedclass = ANYOF_NSPACE; break;
9159 case 'd': namedclass = ANYOF_DIGIT; break;
9160 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
9161 case 'v': namedclass = ANYOF_VERTWS; break;
9162 case 'V': namedclass = ANYOF_NVERTWS; break;
9163 case 'h': namedclass = ANYOF_HORIZWS; break;
9164 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
9165 case 'N': /* Handle \N{NAME} in class */
9166 {
9167 /* We only pay attention to the first char of
9168 multichar strings being returned. I kinda wonder
9169 if this makes sense as it does change the behaviour
9170 from earlier versions, OTOH that behaviour was broken
9171 as well. */
9172 UV v; /* value is register so we cant & it /grrr */
afefe6bf 9173 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
9174 goto parseit;
9175 }
9176 value= v;
9177 }
9178 break;
ffc61ed2
JH
9179 case 'p':
9180 case 'P':
3dab1dad
YO
9181 {
9182 char *e;
af6f566e 9183 if (RExC_parse >= RExC_end)
2a4859cd 9184 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 9185 if (*RExC_parse == '{') {
1df70142 9186 const U8 c = (U8)value;
ffc61ed2
JH
9187 e = strchr(RExC_parse++, '}');
9188 if (!e)
0da60cf5 9189 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
9190 while (isSPACE(UCHARAT(RExC_parse)))
9191 RExC_parse++;
9192 if (e == RExC_parse)
0da60cf5 9193 vFAIL2("Empty \\%c{}", c);
ffc61ed2 9194 n = e - RExC_parse;
ab13f0c7
JH
9195 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9196 n--;
ffc61ed2
JH
9197 }
9198 else {
9199 e = RExC_parse;
9200 n = 1;
9201 }
fb2e24cd
KW
9202 if (SIZE_ONLY) {
9203 if (LOC) {
9204 ckWARN2reg(RExC_parse,
9205 "\\%c uses Unicode rules, not locale rules",
9206 (int) value);
9207 }
9208 }
9209 else {
ab13f0c7
JH
9210 if (UCHARAT(RExC_parse) == '^') {
9211 RExC_parse++;
9212 n--;
9213 value = value == 'p' ? 'P' : 'p'; /* toggle */
9214 while (isSPACE(UCHARAT(RExC_parse))) {
9215 RExC_parse++;
9216 n--;
9217 }
9218 }
2f833f52
KW
9219
9220 /* Add the property name to the list. If /i matching, give
9221 * a different name which consists of the normal name
9222 * sandwiched between two underscores and '_i'. The design
9223 * is discussed in the commit message for this. */
9224 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9225 (value=='p' ? '+' : '!'),
9226 (FOLD) ? "__" : "",
9227 (int)n,
9228 RExC_parse,
9229 (FOLD) ? "_i" : ""
9230 );
ffc61ed2
JH
9231 }
9232 RExC_parse = e + 1;
08fc12dd
KW
9233
9234 /* The \p could match something in the Latin1 range, hence
9235 * something that isn't utf8 */
3ff7ceb3 9236 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f81125e2 9237 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
9238
9239 /* \p means they want Unicode semantics */
9240 RExC_uni_semantics = 1;
3dab1dad 9241 }
f81125e2 9242 break;
b8c5462f
JH
9243 case 'n': value = '\n'; break;
9244 case 'r': value = '\r'; break;
9245 case 't': value = '\t'; break;
9246 case 'f': value = '\f'; break;
9247 case 'b': value = '\b'; break;
c7f1f016
NIS
9248 case 'e': value = ASCII_TO_NATIVE('\033');break;
9249 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
9250 case 'o':
9251 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
9252 {
9253 const char* error_msg;
9254 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
9255 &value,
9256 &numlen,
454155d9
KW
9257 &error_msg,
9258 SIZE_ONLY);
9259 RExC_parse += numlen;
9260 if (! valid) {
9261 vFAIL(error_msg);
9262 }
f0a2b745 9263 }
f0a2b745
KW
9264 if (PL_encoding && value < 0x100) {
9265 goto recode_encoding;
9266 }
9267 break;
b8c5462f 9268 case 'x':
ffc61ed2 9269 if (*RExC_parse == '{') {
a4c04bdc
NC
9270 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9271 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 9272 char * const e = strchr(RExC_parse++, '}');
b81d288d 9273 if (!e)
ffc61ed2 9274 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
9275
9276 numlen = e - RExC_parse;
9277 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9278 RExC_parse = e + 1;
9279 }
9280 else {
a4c04bdc 9281 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
9282 numlen = 2;
9283 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9284 RExC_parse += numlen;
9285 }
9e08bc66
TS
9286 if (PL_encoding && value < 0x100)
9287 goto recode_encoding;
b8c5462f
JH
9288 break;
9289 case 'c':
f9d13529 9290 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
9291 break;
9292 case '0': case '1': case '2': case '3': case '4':
c99e91e9 9293 case '5': case '6': case '7':
9e08bc66 9294 {
c99e91e9
KW
9295 /* Take 1-3 octal digits */
9296 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
9297 numlen = 3;
9298 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9299 RExC_parse += numlen;
9300 if (PL_encoding && value < 0x100)
9301 goto recode_encoding;
9302 break;
9303 }
9304 recode_encoding:
9305 {
9306 SV* enc = PL_encoding;
9307 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
9308 if (!enc && SIZE_ONLY)
9309 ckWARNreg(RExC_parse,
9310 "Invalid escape in the specified encoding");
9e08bc66
TS
9311 break;
9312 }
1028017a 9313 default:
c99e91e9
KW
9314 /* Allow \_ to not give an error */
9315 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
9316 ckWARN2reg(RExC_parse,
9317 "Unrecognized escape \\%c in character class passed through",
9318 (int)value);
c99e91e9 9319 }
1028017a 9320 break;
b8c5462f 9321 }
ffc61ed2 9322 } /* end of \blah */
1b2d223b
JH
9323#ifdef EBCDIC
9324 else
9325 literal_endpoint++;
9326#endif
ffc61ed2
JH
9327
9328 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9329
2c63ecad
KW
9330 /* What matches in a locale is not known until runtime, so need to
9331 * (one time per class) allocate extra space to pass to regexec.
9332 * The space will contain a bit for each named class that is to be
9333 * matched against. This isn't needed for \p{} and pseudo-classes,
9334 * as they are not affected by locale, and hence are dealt with
9335 * separately */
9336 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9337 need_class = 1;
9338 if (SIZE_ONLY) {
3a15e693 9339#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 9340 RExC_size += ANYOF_CLASS_ADD_SKIP;
3a15e693 9341#endif
2c63ecad
KW
9342 }
9343 else {
3a15e693 9344#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 9345 RExC_emit += ANYOF_CLASS_ADD_SKIP;
3a15e693 9346#endif
2c63ecad
KW
9347 ANYOF_CLASS_ZERO(ret);
9348 }
9051cfd9 9349 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 9350 }
ffc61ed2 9351
d5788240
KW
9352 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9353 * literal */
ffc61ed2 9354 if (range) {
73b437c8 9355 if (!SIZE_ONLY) {
668c081a
NC
9356 const int w =
9357 RExC_parse >= rangebegin ?
9358 RExC_parse - rangebegin : 0;
9359 ckWARN4reg(RExC_parse,
b45f050a 9360 "False [] range \"%*.*s\"",
097eb12c 9361 w, w, rangebegin);
668c081a 9362
3568d838 9363 if (prevvalue < 256) {
2283d326 9364 stored +=
56ca34ca 9365 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap);
2283d326 9366 stored +=
56ca34ca 9367 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
ffc61ed2
JH
9368 }
9369 else {
ef87b810 9370 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 9371 Perl_sv_catpvf(aTHX_ listsv,
a1f3213b 9372 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 9373 }
b8c5462f 9374 }
ffc61ed2
JH
9375
9376 range = 0; /* this was not a true range */
73b437c8 9377 }
ffc61ed2 9378
89836f1f
YO
9379
9380
73b437c8 9381 if (!SIZE_ONLY) {
c49a72a9
NC
9382 const char *what = NULL;
9383 char yesno = 0;
9384
e2962f66
JH
9385 /* Possible truncation here but in some 64-bit environments
9386 * the compiler gets heartburn about switch on 64-bit values.
9387 * A similar issue a little earlier when switching on value.
98f323fa 9388 * --jhi */
e2962f66 9389 switch ((I32)namedclass) {
da7fcca4 9390
f952827c
KW
9391 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9392 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9393 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9394 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9395 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9396 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9397 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9398 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9399 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9400 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
da7fcca4 9401#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 9402 /* \s, \w match all unicode if utf8. */
f952827c
KW
9403 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9404 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
da7fcca4 9405#else
a12cf05f 9406 /* \s, \w match ascii and locale only */
f952827c
KW
9407 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
9408 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
da7fcca4 9409#endif
f952827c 9410 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
e1d1eefb
YO
9411 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9412 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
9413 case ANYOF_ASCII:
9414 if (LOC)
936ed897 9415 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 9416 else {
1ba5c669 9417 for (value = 0; value < 128; value++)
2283d326 9418 stored +=
56ca34ca 9419 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
73b437c8 9420 }
c49a72a9 9421 yesno = '+';
ce1c68b2
KW
9422 what = NULL; /* Doesn't match outside ascii, so
9423 don't want to add +utf8:: */
73b437c8
JH
9424 break;
9425 case ANYOF_NASCII:
9426 if (LOC)
936ed897 9427 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 9428 else {
1ba5c669 9429 for (value = 128; value < 256; value++)
2283d326 9430 stored +=
56ca34ca 9431 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
73b437c8 9432 }
cfaf538b 9433 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
c49a72a9
NC
9434 yesno = '!';
9435 what = "ASCII";
89836f1f 9436 break;
ffc61ed2
JH
9437 case ANYOF_DIGIT:
9438 if (LOC)
9439 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9440 else {
9441 /* consecutive digits assumed */
9442 for (value = '0'; value <= '9'; value++)
2283d326 9443 stored +=
56ca34ca 9444 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
ffc61ed2 9445 }
c49a72a9 9446 yesno = '+';
da7fcca4 9447 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
9448 break;
9449 case ANYOF_NDIGIT:
9450 if (LOC)
9451 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9452 else {
9453 /* consecutive digits assumed */
9454 for (value = 0; value < '0'; value++)
2283d326 9455 stored +=
56ca34ca 9456 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
ffc61ed2 9457 for (value = '9' + 1; value < 256; value++)
2283d326 9458 stored +=
56ca34ca 9459 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
ffc61ed2 9460 }
c49a72a9 9461 yesno = '!';
da7fcca4 9462 what = POSIX_CC_UNI_NAME("Digit");
cfaf538b
KW
9463 if (ASCII_RESTRICTED ) {
9464 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9465 }
89836f1f 9466 break;
f81125e2
JP
9467 case ANYOF_MAX:
9468 /* this is to handle \p and \P */
9469 break;
73b437c8 9470 default:
b45f050a 9471 vFAIL("Invalid [::] class");
73b437c8 9472 break;
b8c5462f 9473 }
cfaf538b 9474 if (what && ! (ASCII_RESTRICTED)) {
c49a72a9
NC
9475 /* Strings such as "+utf8::isWord\n" */
9476 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810
KW
9477 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9478 }
ce1c68b2 9479
73b437c8 9480 continue;
a0d0e21e 9481 }
ffc61ed2
JH
9482 } /* end of namedclass \blah */
9483
a0d0e21e 9484 if (range) {
eb160463 9485 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
9486 const int w = RExC_parse - rangebegin;
9487 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 9488 range = 0; /* not a valid range */
73b437c8 9489 }
a0d0e21e
LW
9490 }
9491 else {
3568d838 9492 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
9493 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9494 RExC_parse[1] != ']') {
9495 RExC_parse++;
ffc61ed2
JH
9496
9497 /* a bad range like \w-, [:word:]- ? */
9498 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 9499 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 9500 const int w =
afd78fd5
JH
9501 RExC_parse >= rangebegin ?
9502 RExC_parse - rangebegin : 0;
830247a4 9503 vWARN4(RExC_parse,
b45f050a 9504 "False [] range \"%*.*s\"",
097eb12c 9505 w, w, rangebegin);
afd78fd5 9506 }
73b437c8 9507 if (!SIZE_ONLY)
2283d326 9508 stored +=
56ca34ca 9509 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
73b437c8 9510 } else
ffc61ed2
JH
9511 range = 1; /* yeah, it's a range! */
9512 continue; /* but do it the next time */
a0d0e21e 9513 }
a687059c 9514 }
ffc61ed2 9515
56ca34ca
KW
9516 if (value > 255) {
9517 RExC_uni_semantics = 1;
9518 }
9519
93733859 9520 /* now is the next time */
ae5c130c 9521 if (!SIZE_ONLY) {
3568d838 9522 if (prevvalue < 256) {
1df70142 9523 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 9524 IV i;
3568d838 9525#ifdef EBCDIC
1b2d223b
JH
9526 /* In EBCDIC [\x89-\x91] should include
9527 * the \x8e but [i-j] should not. */
9528 if (literal_endpoint == 2 &&
9529 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9530 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 9531 {
3568d838
JH
9532 if (isLOWER(prevvalue)) {
9533 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9534 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9535 stored +=
56ca34ca 9536 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
2670d666 9537 }
ffc61ed2 9538 } else {
3568d838 9539 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9540 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9541 stored +=
56ca34ca 9542 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
2670d666 9543 }
ffc61ed2 9544 }
8ada0baa 9545 }
ffc61ed2 9546 else
8ada0baa 9547#endif
07be1b83 9548 for (i = prevvalue; i <= ceilvalue; i++) {
56ca34ca 9549 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
07be1b83 9550 }
3568d838 9551 }
56ca34ca
KW
9552 if (value > 255) {
9553 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
9554 const UV natvalue = NATIVE_TO_UNI(value);
9555 if (! nonbitmap) {
9556 nonbitmap = _new_invlist(2);
9557 }
9558 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9559 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9560 }
9561#if 0
ef87b810
KW
9562
9563 /* If the code point requires utf8 to represent, and we are not
9564 * folding, it can't match unless the target is in utf8. Only
9565 * a few code points above 255 fold to below it, so XXX an
9566 * optimization would be to know which ones and set the flag
9567 * appropriately. */
9568 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
9569 ? ANYOF_NONBITMAP
9570 : ANYOF_UTF8;
d5788240
KW
9571 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
9572
9573 /* The \t sets the whole range */
ffc61ed2 9574 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7 9575 prevnatvalue, natvalue);
f56b6394
KW
9576
9577 /* Currently, we don't look at every value in the range.
9578 * Therefore we have to assume the worst case: that if
b57e4118
KW
9579 * folding, it will match more than one character. But in
9580 * lookbehind patterns, can only be single character
9581 * length, so disallow those folds */
9582 if (FOLD && ! RExC_in_lookbehind) {
f56b6394
KW
9583 OP(ret) = ANYOFV;
9584 }
b08decb7
JH
9585 }
9586 else if (prevnatvalue == natvalue) {
9587 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 9588 if (FOLD) {
89ebb4a3 9589 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 9590 STRLEN foldlen;
1df70142 9591 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 9592
e294cc5d
JH
9593#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
9594 if (RExC_precomp[0] == ':' &&
9595 RExC_precomp[1] == '[' &&
9596 (f == 0xDF || f == 0x92)) {
9597 f = NATIVE_TO_UNI(f);
9598 }
9599#endif
c840d2a2
JH
9600 /* If folding and foldable and a single
9601 * character, insert also the folded version
9602 * to the charclass. */
9e55ce06 9603 if (f != value) {
e294cc5d
JH
9604#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
9605 if ((RExC_precomp[0] == ':' &&
9606 RExC_precomp[1] == '[' &&
9607 (f == 0xA2 &&
9608 (value == 0xFB05 || value == 0xFB06))) ?
9609 foldlen == ((STRLEN)UNISKIP(f) - 1) :
9610 foldlen == (STRLEN)UNISKIP(f) )
9611#else
eb160463 9612 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 9613#endif
9e55ce06
JH
9614 Perl_sv_catpvf(aTHX_ listsv,
9615 "%04"UVxf"\n", f);
b57e4118 9616 else if (! RExC_in_lookbehind) {
9e55ce06 9617 /* Any multicharacter foldings
b57e4118 9618 * (disallowed in lookbehind patterns)
9e55ce06
JH
9619 * require the following transform:
9620 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
9621 * where E folds into "pq" and F folds
9622 * into "rst", all other characters
9623 * fold to single characters. We save
9624 * away these multicharacter foldings,
9625 * to be later saved as part of the
9626 * additional "s" data. */
9627 SV *sv;
9628
9629 if (!unicode_alternate)
9630 unicode_alternate = newAV();
740cce10
NC
9631 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
9632 TRUE);
9e55ce06 9633 av_push(unicode_alternate, sv);
f56b6394 9634 OP(ret) = ANYOFV;
9e55ce06
JH
9635 }
9636 }
254ba52a 9637
60a8b682
JH
9638 /* If folding and the value is one of the Greek
9639 * sigmas insert a few more sigmas to make the
9640 * folding rules of the sigmas to work right.
9641 * Note that not all the possible combinations
9642 * are handled here: some of them are handled
9e55ce06
JH
9643 * by the standard folding rules, and some of
9644 * them (literal or EXACTF cases) are handled
9645 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
9646 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
9647 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 9648 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 9649 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 9650 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
9651 }
9652 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9653 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 9654 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
9655 }
9656 }
ffc61ed2 9657 }
56ca34ca 9658#endif
1b2d223b
JH
9659#ifdef EBCDIC
9660 literal_endpoint = 0;
9661#endif
8ada0baa 9662 }
ffc61ed2
JH
9663
9664 range = 0; /* this range (if it was one) is done now */
a0d0e21e 9665 }
ffc61ed2 9666
ffc61ed2 9667
7f6f358c
YO
9668
9669 if (SIZE_ONLY)
9670 return ret;
9671 /****** !SIZE_ONLY AFTER HERE *********/
9672
56ca34ca
KW
9673 /* Finish up the non-bitmap entries */
9674 if (nonbitmap) {
9675 UV* nonbitmap_array;
9676 UV i;
9677
9678 /* If folding, we add to the list all characters that could fold to or
9679 * from the ones already on the list */
9680 if (FOLD) {
9681 HV* fold_intersection;
9682 UV* fold_list;
9683
9684 /* This is a list of all the characters that participate in folds
9685 * (except marks, etc in multi-char folds */
9686 if (! PL_utf8_foldable) {
9687 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9688 PL_utf8_foldable = _swash_to_invlist(swash);
9689 }
9690
9691 /* This is a hash that for a particular fold gives all characters
9692 * that are involved in it */
9693 if (! PL_utf8_foldclosures) {
9694
b443038a
KW
9695 /* If we were unable to find any folds, then we likely won't be
9696 * able to find the closures. So just create an empty list.
9697 * Folding will effectively be restricted to the non-Unicode
9698 * rules hard-coded into Perl. (This case happens legitimately
9699 * during compilation of Perl itself before the Unicode tables
9700 * are generated) */
9701 if (invlist_len(PL_utf8_foldable) == 0) {
9702 PL_utf8_foldclosures = _new_invlist(0);
9703 } else {
9704 /* If the folds haven't been read in, call a fold function
9705 * to force that */
9706 if (! PL_utf8_tofold) {
9707 U8 dummy[UTF8_MAXBYTES+1];
9708 STRLEN dummy_len;
9709 to_utf8_fold((U8*) "A", dummy, &dummy_len);
9710 }
9711 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
56ca34ca 9712 }
56ca34ca
KW
9713 }
9714
9715 /* Only the characters in this class that participate in folds need
9716 * be checked. Get the intersection of this class and all the
9717 * possible characters that are foldable. This can quickly narrow
9718 * down a large class */
9719 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
9720
9721 /* Now look at the foldable characters in this class individually */
9722 fold_list = invlist_array(fold_intersection);
9723 for (i = 0; i < invlist_len(fold_intersection); i++) {
9724 UV j;
9725
9726 /* The next entry is the beginning of the range that is in the
9727 * class */
9728 UV start = fold_list[i++];
9729
9730
9731 /* The next entry is the beginning of the next range, which
9732 * isn't in the class, so the end of the current range is one
9733 * less than that */
9734 UV end = fold_list[i] - 1;
9735
9736 /* Look at every character in the range */
9737 for (j = start; j <= end; j++) {
9738
9739 /* Get its fold */
9740 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9741 STRLEN foldlen;
9742 const UV f = to_uni_fold(j, foldbuf, &foldlen);
9743
9744 if (foldlen > (STRLEN)UNISKIP(f)) {
9745
9746 /* Any multicharacter foldings (disallowed in
9747 * lookbehind patterns) require the following
9748 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
9749 * E folds into "pq" and F folds into "rst", all other
9750 * characters fold to single characters. We save away
9751 * these multicharacter foldings, to be later saved as
9752 * part of the additional "s" data. */
9753 if (! RExC_in_lookbehind) {
9754 /* XXX Discard this fold if any are latin1 and LOC */
9755 SV *sv;
9756
9757 if (!unicode_alternate) {
9758 unicode_alternate = newAV();
9759 }
9760 sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
9761 av_push(unicode_alternate, sv);
9762
9763 /* This node is variable length */
9764 OP(ret) = ANYOFV;
9765 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
9766 }
9767 }
9768 else { /* Single character fold */
9769 SV** listp;
9770
9771 /* Consider "k" =~ /[K]/i. The line above would have
9772 * just folded the 'k' to itself, and that isn't going
9773 * to match 'K'. So we look through the closure of
9774 * everything that folds to 'k'. That will find the
9775 * 'K'. Initialize the list, if necessary */
9776
9777 /* The data structure is a hash with the keys every
9778 * character that is folded to, like 'k', and the
9779 * values each an array of everything that folds to its
9780 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
9781 if ((listp = hv_fetch(PL_utf8_foldclosures,
9782 (char *) foldbuf, foldlen, FALSE)))
9783 {
9784 AV* list = (AV*) *listp;
9785 IV k;
9786 for (k = 0; k <= av_len(list); k++) {
9787 SV** c_p = av_fetch(list, k, FALSE);
9788 UV c;
9789 if (c_p == NULL) {
9790 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
9791 }
9792 c = SvUV(*c_p);
9793
9794 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
9795 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap);
9796 }
9797 /* It may be that the code point is already
9798 * in this range or already in the bitmap,
9799 * XXX THink about LOC
9800 * in which case we need do nothing */
9801 else if ((c < start || c > end)
9802 && (c > 255
9803 || ! ANYOF_BITMAP_TEST(ret, c)))
9804 {
9805 nonbitmap = add_range_to_invlist(nonbitmap, c, c);
9806 }
9807 }
9808 }
9809 }
9810 }
9811 }
9812 invlist_destroy(fold_intersection);
9813 } /* End of processing all the folds */
9814
9815 /* Here have the full list of items to match that aren't in the
9816 * bitmap. Convert to the structure that the rest of the code is
9817 * expecting. XXX That rest of the code should convert to this
9818 * structure */
9819 nonbitmap_array = invlist_array(nonbitmap);
9820 for (i = 0; i < invlist_len(nonbitmap); i++) {
9821
9822 /* The next entry is the beginning of the range that is in the
9823 * class */
9824 UV start = nonbitmap_array[i++];
9825
9826 /* The next entry is the beginning of the next range, which isn't
9827 * in the class, so the end of the current range is one less than
9828 * that */
9829 UV end = nonbitmap_array[i] - 1;
9830
9831 if (start == end) {
9832 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
9833 }
9834 else {
9835 /* The \t sets the whole range */
9836 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
9837 /* XXX EBCDIC */
9838 start, end);
9839 }
9840 }
9841 invlist_destroy(nonbitmap);
9842 }
9843
f56b6394
KW
9844 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
9845 * set the FOLD flag yet, so this this does optimize those. It doesn't
40c78556
KW
9846 * optimize locale. Doing so perhaps could be done as long as there is
9847 * nothing like \w in it; some thought also would have to be given to the
9848 * interaction with above 0x100 chars */
f56b6394 9849 if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
40c78556
KW
9850 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9851 ANYOF_BITMAP(ret)[value] ^= 0xFF;
9852 stored = 256 - stored;
9853
d5788240
KW
9854 /* The inversion means that everything above 255 is matched; and at the
9855 * same time we clear the invert flag */
40c78556
KW
9856 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9857 }
9858
f56b6394
KW
9859 if (FOLD) {
9860 SV *sv;
9861
9862 /* This is the one character in the bitmap that needs special handling
9863 * under non-locale folding, as it folds to two characters 'ss'. This
9864 * happens if it is set and not inverting, or isn't set and are
b57e4118
KW
9865 * inverting (disallowed in lookbehind patterns because they can't be
9866 * variable length) */
f56b6394 9867 if (! LOC
b57e4118 9868 && ! RExC_in_lookbehind
f56b6394
KW
9869 && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9870 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9871 {
9872 OP(ret) = ANYOFV; /* Can match more than a single char */
9873
9874 /* Under Unicode semantics), it can do this when the target string
9875 * isn't in utf8 */
9876 if (UNI_SEMANTICS) {
9877 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9878 }
9879
9880 if (!unicode_alternate) {
9881 unicode_alternate = newAV();
9882 }
9883 sv = newSVpvn_utf8("ss", 2, TRUE);
9884 av_push(unicode_alternate, sv);
9885 }
9886
9887 /* Folding in the bitmap is taken care of above, but not for locale
9888 * (for which we have to wait to see what folding is in effect at
9889 * runtime), and for things not in the bitmap. Set run-time fold flag
9890 * for these */
9891 if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9892 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9893 }
9894 }
9895
2786be71
KW
9896 /* A single character class can be "optimized" into an EXACTish node.
9897 * Note that since we don't currently count how many characters there are
9898 * outside the bitmap, we are XXX missing optimization possibilities for
9899 * them. This optimization can't happen unless this is a truly single
9900 * character class, which means that it can't be an inversion into a
9901 * many-character class, and there must be no possibility of there being
9902 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
9903 * \w, etc, so have to make a special test that they aren't present
9904 *
9905 * Similarly A 2-character class of the very special form like [bB] can be
9906 * optimized into an EXACTFish node, but only for non-locales, and for
9907 * characters which only have the two folds; so things like 'fF' and 'Ii'
9908 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9909 * FI'. */
2786be71 9910 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
9911 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9912 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9913 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9914 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9915 /* If the latest code point has a fold whose
9916 * bit is set, it must be the only other one */
2dcac756 9917 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 9918 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
9919 {
9920 /* Note that the information needed to decide to do this optimization
9921 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
9922 * used EXACTish node takes less space than the calculated ANYOF node,
9923 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
9924 * than actually used, so this optimization doesn't gain us any space.
9925 * But an EXACT node is faster than an ANYOF node, and can be combined
9926 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
9927 * gains. The speed of executing an EXACTF is similar to an ANYOF
9928 * node, so the optimization advantage comes from the ability to join
9929 * it to adjacent EXACT nodes */
2786be71 9930
07be1b83 9931 const char * cur_parse= RExC_parse;
6da63e10 9932 U8 op;
07be1b83
YO
9933 RExC_emit = (regnode *)orig_emit;
9934 RExC_parse = (char *)orig_parse;
2786be71 9935
6da63e10
KW
9936 if (stored == 1) {
9937
9938 /* A locale node with one point can be folded; all the other cases
9939 * with folding will have two points, since we calculate them above
9940 */
39065660 9941 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
9942 op = EXACTFL;
9943 }
9944 else {
9945 op = EXACT;
9946 }
9947 } /* else 2 chars in the bit map: the folds of each other */
cfaf538b 9948 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
6da63e10
KW
9949
9950 /* To join adjacent nodes, they must be the exact EXACTish type.
9951 * Try to use the most likely type, by using EXACTFU if the regex
9952 * calls for them, or is required because the character is
9953 * non-ASCII */
9954 op = EXACTFU;
9955 }
9956 else { /* Otherwise, more likely to be EXACTF type */
9957 op = EXACTF;
9958 }
9959
9960 ret = reg_node(pRExC_state, op);
07be1b83 9961 RExC_parse = (char *)cur_parse;
2786be71
KW
9962 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9963 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9964 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9965 STR_LEN(ret)= 2;
9966 RExC_emit += STR_SZ(2);
9967 }
9968 else {
9969 *STRING(ret)= (char)value;
9970 STR_LEN(ret)= 1;
9971 RExC_emit += STR_SZ(1);
9972 }
ef8d46e8 9973 SvREFCNT_dec(listsv);
7f6f358c
YO
9974 return ret;
9975 }
ffc61ed2 9976
7f6f358c 9977 {
097eb12c 9978 AV * const av = newAV();
ffc61ed2 9979 SV *rv;
9e55ce06 9980 /* The 0th element stores the character class description
6a0407ee 9981 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
9982 * to initialize the appropriate swash (which gets stored in
9983 * the 1st element), and also useful for dumping the regnode.
9984 * The 2nd element stores the multicharacter foldings,
6a0407ee 9985 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
9986 av_store(av, 0, listsv);
9987 av_store(av, 1, NULL);
ad64d0ec
NC
9988 av_store(av, 2, MUTABLE_SV(unicode_alternate));
9989 rv = newRV_noinc(MUTABLE_SV(av));
19860706 9990 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 9991 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 9992 ARG_SET(ret, n);
a0ed51b3 9993 }
a0ed51b3
LW
9994 return ret;
9995}
89836f1f
YO
9996#undef _C_C_T_
9997
a0ed51b3 9998
bcdf7404
YO
9999/* reg_skipcomment()
10000
10001 Absorbs an /x style # comments from the input stream.
10002 Returns true if there is more text remaining in the stream.
10003 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10004 terminates the pattern without including a newline.
10005
10006 Note its the callers responsibility to ensure that we are
10007 actually in /x mode
10008
10009*/
10010
10011STATIC bool
10012S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10013{
10014 bool ended = 0;
7918f24d
NC
10015
10016 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10017
bcdf7404
YO
10018 while (RExC_parse < RExC_end)
10019 if (*RExC_parse++ == '\n') {
10020 ended = 1;
10021 break;
10022 }
10023 if (!ended) {
10024 /* we ran off the end of the pattern without ending
10025 the comment, so we have to add an \n when wrapping */
10026 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10027 return 0;
10028 } else
10029 return 1;
10030}
10031
10032/* nextchar()
10033
3b753521 10034 Advances the parse position, and optionally absorbs
bcdf7404
YO
10035 "whitespace" from the inputstream.
10036
10037 Without /x "whitespace" means (?#...) style comments only,
10038 with /x this means (?#...) and # comments and whitespace proper.
10039
10040 Returns the RExC_parse point from BEFORE the scan occurs.
10041
10042 This is the /x friendly way of saying RExC_parse++.
10043*/
10044
76e3520e 10045STATIC char*
830247a4 10046S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 10047{
097eb12c 10048 char* const retval = RExC_parse++;
a0d0e21e 10049
7918f24d
NC
10050 PERL_ARGS_ASSERT_NEXTCHAR;
10051
4633a7c4 10052 for (;;) {
830247a4
IZ
10053 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10054 RExC_parse[2] == '#') {
e994fd66
AE
10055 while (*RExC_parse != ')') {
10056 if (RExC_parse == RExC_end)
10057 FAIL("Sequence (?#... not terminated");
830247a4 10058 RExC_parse++;
e994fd66 10059 }
830247a4 10060 RExC_parse++;
4633a7c4
LW
10061 continue;
10062 }
bbe252da 10063 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
10064 if (isSPACE(*RExC_parse)) {
10065 RExC_parse++;
748a9306
LW
10066 continue;
10067 }
830247a4 10068 else if (*RExC_parse == '#') {
bcdf7404
YO
10069 if ( reg_skipcomment( pRExC_state ) )
10070 continue;
748a9306 10071 }
748a9306 10072 }
4633a7c4 10073 return retval;
a0d0e21e 10074 }
a687059c
LW
10075}
10076
10077/*
c277df42 10078- reg_node - emit a node
a0d0e21e 10079*/
76e3520e 10080STATIC regnode * /* Location. */
830247a4 10081S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 10082{
97aff369 10083 dVAR;
c277df42 10084 register regnode *ptr;
504618e9 10085 regnode * const ret = RExC_emit;
07be1b83 10086 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10087
7918f24d
NC
10088 PERL_ARGS_ASSERT_REG_NODE;
10089
c277df42 10090 if (SIZE_ONLY) {
830247a4
IZ
10091 SIZE_ALIGN(RExC_size);
10092 RExC_size += 1;
a0d0e21e
LW
10093 return(ret);
10094 }
3b57cd43
YO
10095 if (RExC_emit >= RExC_emit_bound)
10096 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10097
c277df42 10098 NODE_ALIGN_FILL(ret);
a0d0e21e 10099 ptr = ret;
c277df42 10100 FILL_ADVANCE_NODE(ptr, op);
7122b237 10101#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10102 if (RExC_offsets) { /* MJD */
07be1b83 10103 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 10104 "reg_node", __LINE__,
13d6edb4 10105 PL_reg_name[op],
07be1b83
YO
10106 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10107 ? "Overwriting end of array!\n" : "OK",
10108 (UV)(RExC_emit - RExC_emit_start),
10109 (UV)(RExC_parse - RExC_start),
10110 (UV)RExC_offsets[0]));
ccb2c380 10111 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 10112 }
7122b237 10113#endif
830247a4 10114 RExC_emit = ptr;
a0d0e21e 10115 return(ret);
a687059c
LW
10116}
10117
10118/*
a0d0e21e
LW
10119- reganode - emit a node with an argument
10120*/
76e3520e 10121STATIC regnode * /* Location. */
830247a4 10122S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 10123{
97aff369 10124 dVAR;
c277df42 10125 register regnode *ptr;
504618e9 10126 regnode * const ret = RExC_emit;
07be1b83 10127 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 10128
7918f24d
NC
10129 PERL_ARGS_ASSERT_REGANODE;
10130
c277df42 10131 if (SIZE_ONLY) {
830247a4
IZ
10132 SIZE_ALIGN(RExC_size);
10133 RExC_size += 2;
6bda09f9
YO
10134 /*
10135 We can't do this:
10136
10137 assert(2==regarglen[op]+1);
10138
10139 Anything larger than this has to allocate the extra amount.
10140 If we changed this to be:
10141
10142 RExC_size += (1 + regarglen[op]);
10143
10144 then it wouldn't matter. Its not clear what side effect
10145 might come from that so its not done so far.
10146 -- dmq
10147 */
a0d0e21e
LW
10148 return(ret);
10149 }
3b57cd43
YO
10150 if (RExC_emit >= RExC_emit_bound)
10151 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10152
c277df42 10153 NODE_ALIGN_FILL(ret);
a0d0e21e 10154 ptr = ret;
c277df42 10155 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 10156#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10157 if (RExC_offsets) { /* MJD */
07be1b83 10158 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10159 "reganode",
ccb2c380 10160 __LINE__,
13d6edb4 10161 PL_reg_name[op],
07be1b83 10162 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 10163 "Overwriting end of array!\n" : "OK",
07be1b83
YO
10164 (UV)(RExC_emit - RExC_emit_start),
10165 (UV)(RExC_parse - RExC_start),
10166 (UV)RExC_offsets[0]));
ccb2c380 10167 Set_Cur_Node_Offset;
fac92740 10168 }
7122b237 10169#endif
830247a4 10170 RExC_emit = ptr;
a0d0e21e 10171 return(ret);
fe14fcc3
LW
10172}
10173
10174/*
cd439c50 10175- reguni - emit (if appropriate) a Unicode character
a0ed51b3 10176*/
71207a34
AL
10177STATIC STRLEN
10178S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 10179{
97aff369 10180 dVAR;
7918f24d
NC
10181
10182 PERL_ARGS_ASSERT_REGUNI;
10183
71207a34 10184 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
10185}
10186
10187/*
a0d0e21e
LW
10188- reginsert - insert an operator in front of already-emitted operand
10189*
10190* Means relocating the operand.
10191*/
76e3520e 10192STATIC void
6bda09f9 10193S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 10194{
97aff369 10195 dVAR;
c277df42
IZ
10196 register regnode *src;
10197 register regnode *dst;
10198 register regnode *place;
504618e9 10199 const int offset = regarglen[(U8)op];
6bda09f9 10200 const int size = NODE_STEP_REGNODE + offset;
07be1b83 10201 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10202
10203 PERL_ARGS_ASSERT_REGINSERT;
def51078 10204 PERL_UNUSED_ARG(depth);
22c35a8c 10205/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 10206 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 10207 if (SIZE_ONLY) {
6bda09f9 10208 RExC_size += size;
a0d0e21e
LW
10209 return;
10210 }
a687059c 10211
830247a4 10212 src = RExC_emit;
6bda09f9 10213 RExC_emit += size;
830247a4 10214 dst = RExC_emit;
40d049e4 10215 if (RExC_open_parens) {
6bda09f9 10216 int paren;
3b57cd43 10217 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 10218 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 10219 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 10220 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
10221 RExC_open_parens[paren] += size;
10222 } else {
3b57cd43 10223 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
10224 }
10225 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 10226 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
10227 RExC_close_parens[paren] += size;
10228 } else {
3b57cd43 10229 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
10230 }
10231 }
6bda09f9 10232 }
40d049e4 10233
fac92740 10234 while (src > opnd) {
c277df42 10235 StructCopy(--src, --dst, regnode);
7122b237 10236#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10237 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 10238 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 10239 "reg_insert",
ccb2c380 10240 __LINE__,
13d6edb4 10241 PL_reg_name[op],
07be1b83
YO
10242 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10243 ? "Overwriting end of array!\n" : "OK",
10244 (UV)(src - RExC_emit_start),
10245 (UV)(dst - RExC_emit_start),
10246 (UV)RExC_offsets[0]));
ccb2c380
MP
10247 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10248 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 10249 }
7122b237 10250#endif
fac92740
MJD
10251 }
10252
a0d0e21e
LW
10253
10254 place = opnd; /* Op node, where operand used to be. */
7122b237 10255#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10256 if (RExC_offsets) { /* MJD */
07be1b83 10257 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10258 "reginsert",
ccb2c380 10259 __LINE__,
13d6edb4 10260 PL_reg_name[op],
07be1b83 10261 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 10262 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
10263 (UV)(place - RExC_emit_start),
10264 (UV)(RExC_parse - RExC_start),
786e8c11 10265 (UV)RExC_offsets[0]));
ccb2c380 10266 Set_Node_Offset(place, RExC_parse);
45948336 10267 Set_Node_Length(place, 1);
fac92740 10268 }
7122b237 10269#endif
c277df42
IZ
10270 src = NEXTOPER(place);
10271 FILL_ADVANCE_NODE(place, op);
10272 Zero(src, offset, regnode);
a687059c
LW
10273}
10274
10275/*
c277df42 10276- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 10277- SEE ALSO: regtail_study
a0d0e21e 10278*/
097eb12c 10279/* TODO: All three parms should be const */
76e3520e 10280STATIC void
3dab1dad 10281S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 10282{
97aff369 10283 dVAR;
c277df42 10284 register regnode *scan;
72f13be8 10285 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10286
10287 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
10288#ifndef DEBUGGING
10289 PERL_UNUSED_ARG(depth);
10290#endif
a0d0e21e 10291
c277df42 10292 if (SIZE_ONLY)
a0d0e21e
LW
10293 return;
10294
10295 /* Find last node. */
10296 scan = p;
10297 for (;;) {
504618e9 10298 regnode * const temp = regnext(scan);
3dab1dad
YO
10299 DEBUG_PARSE_r({
10300 SV * const mysv=sv_newmortal();
10301 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10302 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
10303 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10304 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10305 (temp == NULL ? "->" : ""),
13d6edb4 10306 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 10307 );
3dab1dad
YO
10308 });
10309 if (temp == NULL)
10310 break;
10311 scan = temp;
10312 }
10313
10314 if (reg_off_by_arg[OP(scan)]) {
10315 ARG_SET(scan, val - scan);
10316 }
10317 else {
10318 NEXT_OFF(scan) = val - scan;
10319 }
10320}
10321
07be1b83 10322#ifdef DEBUGGING
3dab1dad
YO
10323/*
10324- regtail_study - set the next-pointer at the end of a node chain of p to val.
10325- Look for optimizable sequences at the same time.
10326- currently only looks for EXACT chains.
07be1b83 10327
486ec47a 10328This is experimental code. The idea is to use this routine to perform
07be1b83
YO
10329in place optimizations on branches and groups as they are constructed,
10330with the long term intention of removing optimization from study_chunk so
10331that it is purely analytical.
10332
10333Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10334to control which is which.
10335
3dab1dad
YO
10336*/
10337/* TODO: All four parms should be const */
07be1b83 10338
3dab1dad
YO
10339STATIC U8
10340S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10341{
10342 dVAR;
10343 register regnode *scan;
07be1b83
YO
10344 U8 exact = PSEUDO;
10345#ifdef EXPERIMENTAL_INPLACESCAN
10346 I32 min = 0;
10347#endif
3dab1dad
YO
10348 GET_RE_DEBUG_FLAGS_DECL;
10349
7918f24d
NC
10350 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10351
07be1b83 10352
3dab1dad
YO
10353 if (SIZE_ONLY)
10354 return exact;
10355
10356 /* Find last node. */
10357
10358 scan = p;
10359 for (;;) {
10360 regnode * const temp = regnext(scan);
07be1b83
YO
10361#ifdef EXPERIMENTAL_INPLACESCAN
10362 if (PL_regkind[OP(scan)] == EXACT)
10363 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10364 return EXACT;
10365#endif
3dab1dad
YO
10366 if ( exact ) {
10367 switch (OP(scan)) {
10368 case EXACT:
10369 case EXACTF:
2c2b7f86 10370 case EXACTFU:
3dab1dad
YO
10371 case EXACTFL:
10372 if( exact == PSEUDO )
10373 exact= OP(scan);
07be1b83
YO
10374 else if ( exact != OP(scan) )
10375 exact= 0;
3dab1dad
YO
10376 case NOTHING:
10377 break;
10378 default:
10379 exact= 0;
10380 }
10381 }
10382 DEBUG_PARSE_r({
10383 SV * const mysv=sv_newmortal();
10384 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10385 regprop(RExC_rx, mysv, scan);
eaf3ca90 10386 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 10387 SvPV_nolen_const(mysv),
eaf3ca90 10388 REG_NODE_NUM(scan),
13d6edb4 10389 PL_reg_name[exact]);
3dab1dad 10390 });
a0d0e21e
LW
10391 if (temp == NULL)
10392 break;
10393 scan = temp;
10394 }
07be1b83
YO
10395 DEBUG_PARSE_r({
10396 SV * const mysv_val=sv_newmortal();
10397 DEBUG_PARSE_MSG("");
10398 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
10399 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10400 SvPV_nolen_const(mysv_val),
10401 (IV)REG_NODE_NUM(val),
10402 (IV)(val - scan)
07be1b83
YO
10403 );
10404 });
c277df42
IZ
10405 if (reg_off_by_arg[OP(scan)]) {
10406 ARG_SET(scan, val - scan);
a0ed51b3
LW
10407 }
10408 else {
c277df42
IZ
10409 NEXT_OFF(scan) = val - scan;
10410 }
3dab1dad
YO
10411
10412 return exact;
a687059c 10413}
07be1b83 10414#endif
a687059c
LW
10415
10416/*
fd181c75 10417 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 10418 */
f7819f85 10419#ifdef DEBUGGING
c33269f7 10420static void
7918f24d
NC
10421S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10422{
f7819f85
A
10423 int bit;
10424 int set=0;
a62b1201 10425 regex_charset cs;
7918f24d 10426
f7819f85
A
10427 for (bit=0; bit<32; bit++) {
10428 if (flags & (1<<bit)) {
a62b1201
KW
10429 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10430 continue;
10431 }
f7819f85
A
10432 if (!set++ && lead)
10433 PerlIO_printf(Perl_debug_log, "%s",lead);
10434 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10435 }
10436 }
a62b1201
KW
10437 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10438 if (!set++ && lead) {
10439 PerlIO_printf(Perl_debug_log, "%s",lead);
10440 }
10441 switch (cs) {
10442 case REGEX_UNICODE_CHARSET:
10443 PerlIO_printf(Perl_debug_log, "UNICODE");
10444 break;
10445 case REGEX_LOCALE_CHARSET:
10446 PerlIO_printf(Perl_debug_log, "LOCALE");
10447 break;
cfaf538b
KW
10448 case REGEX_ASCII_RESTRICTED_CHARSET:
10449 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10450 break;
a62b1201
KW
10451 default:
10452 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10453 break;
10454 }
10455 }
f7819f85
A
10456 if (lead) {
10457 if (set)
10458 PerlIO_printf(Perl_debug_log, "\n");
10459 else
10460 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10461 }
10462}
10463#endif
10464
a687059c 10465void
097eb12c 10466Perl_regdump(pTHX_ const regexp *r)
a687059c 10467{
35ff7856 10468#ifdef DEBUGGING
97aff369 10469 dVAR;
c445ea15 10470 SV * const sv = sv_newmortal();
ab3bbdeb 10471 SV *dsv= sv_newmortal();
f8fc2ecf 10472 RXi_GET_DECL(r,ri);
f7819f85 10473 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10474
7918f24d
NC
10475 PERL_ARGS_ASSERT_REGDUMP;
10476
f8fc2ecf 10477 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
10478
10479 /* Header fields of interest. */
ab3bbdeb
YO
10480 if (r->anchored_substr) {
10481 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10482 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 10483 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10484 "anchored %s%s at %"IVdf" ",
10485 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 10486 (IV)r->anchored_offset);
ab3bbdeb
YO
10487 } else if (r->anchored_utf8) {
10488 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10489 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 10490 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10491 "anchored utf8 %s%s at %"IVdf" ",
10492 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 10493 (IV)r->anchored_offset);
ab3bbdeb
YO
10494 }
10495 if (r->float_substr) {
10496 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10497 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 10498 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10499 "floating %s%s at %"IVdf"..%"UVuf" ",
10500 s, RE_SV_TAIL(r->float_substr),
7b0972df 10501 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
10502 } else if (r->float_utf8) {
10503 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10504 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 10505 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10506 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10507 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 10508 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 10509 }
33b8afdf 10510 if (r->check_substr || r->check_utf8)
b81d288d 10511 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
10512 (const char *)
10513 (r->check_substr == r->float_substr
10514 && r->check_utf8 == r->float_utf8
10515 ? "(checking floating" : "(checking anchored"));
bbe252da 10516 if (r->extflags & RXf_NOSCAN)
c277df42 10517 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 10518 if (r->extflags & RXf_CHECK_ALL)
c277df42 10519 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 10520 if (r->check_substr || r->check_utf8)
c277df42
IZ
10521 PerlIO_printf(Perl_debug_log, ") ");
10522
f8fc2ecf
YO
10523 if (ri->regstclass) {
10524 regprop(r, sv, ri->regstclass);
1de06328 10525 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 10526 }
bbe252da 10527 if (r->extflags & RXf_ANCH) {
774d564b 10528 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 10529 if (r->extflags & RXf_ANCH_BOL)
774d564b 10530 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 10531 if (r->extflags & RXf_ANCH_MBOL)
c277df42 10532 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 10533 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 10534 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 10535 if (r->extflags & RXf_ANCH_GPOS)
774d564b 10536 PerlIO_printf(Perl_debug_log, "(GPOS)");
10537 PerlIO_putc(Perl_debug_log, ' ');
10538 }
bbe252da 10539 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 10540 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 10541 if (r->intflags & PREGf_SKIP)
760ac839 10542 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 10543 if (r->intflags & PREGf_IMPLICIT)
760ac839 10544 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 10545 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 10546 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 10547 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 10548 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 10549 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 10550#else
7918f24d 10551 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 10552 PERL_UNUSED_CONTEXT;
65e66c80 10553 PERL_UNUSED_ARG(r);
17c3b450 10554#endif /* DEBUGGING */
a687059c
LW
10555}
10556
10557/*
a0d0e21e
LW
10558- regprop - printable representation of opcode
10559*/
3339dfd8
YO
10560#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10561STMT_START { \
10562 if (do_sep) { \
10563 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10564 if (flags & ANYOF_INVERT) \
10565 /*make sure the invert info is in each */ \
10566 sv_catpvs(sv, "^"); \
10567 do_sep = 0; \
10568 } \
10569} STMT_END
10570
46fc3d4c 10571void
32fc9b6a 10572Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 10573{
35ff7856 10574#ifdef DEBUGGING
97aff369 10575 dVAR;
9b155405 10576 register int k;
f8fc2ecf 10577 RXi_GET_DECL(prog,progi);
1de06328 10578 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 10579
7918f24d 10580 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 10581
76f68e9b 10582 sv_setpvs(sv, "");
8aa23a47 10583
03363afd 10584 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
10585 /* It would be nice to FAIL() here, but this may be called from
10586 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 10587 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 10588 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 10589
3dab1dad 10590 k = PL_regkind[OP(o)];
9b155405 10591
2a782b5b 10592 if (k == EXACT) {
f92a2122 10593 sv_catpvs(sv, " ");
ab3bbdeb
YO
10594 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10595 * is a crude hack but it may be the best for now since
10596 * we have no flag "this EXACTish node was UTF-8"
10597 * --jhi */
f92a2122
NC
10598 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10599 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 10600 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
10601 PERL_PV_PRETTY_ELLIPSES |
10602 PERL_PV_PRETTY_LTGT |
10603 PERL_PV_PRETTY_NOCLEAR
10604 );
bb263b4e 10605 } else if (k == TRIE) {
3dab1dad 10606 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 10607 * progi->data isn't available here */
1de06328 10608 const char op = OP(o);
647f639f 10609 const U32 n = ARG(o);
1de06328 10610 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 10611 (reg_ac_data *)progi->data->data[n] :
1de06328 10612 NULL;
3251b653
NC
10613 const reg_trie_data * const trie
10614 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 10615
13d6edb4 10616 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
10617 DEBUG_TRIE_COMPILE_r(
10618 Perl_sv_catpvf(aTHX_ sv,
10619 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10620 (UV)trie->startstate,
1e2e3d02 10621 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
10622 (UV)trie->wordcount,
10623 (UV)trie->minlen,
10624 (UV)trie->maxlen,
10625 (UV)TRIE_CHARCOUNT(trie),
10626 (UV)trie->uniquecharcount
10627 )
10628 );
10629 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10630 int i;
10631 int rangestart = -1;
f46cb337 10632 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 10633 sv_catpvs(sv, "[");
1de06328
YO
10634 for (i = 0; i <= 256; i++) {
10635 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10636 if (rangestart == -1)
10637 rangestart = i;
10638 } else if (rangestart != -1) {
10639 if (i <= rangestart + 3)
10640 for (; rangestart < i; rangestart++)
10641 put_byte(sv, rangestart);
10642 else {
10643 put_byte(sv, rangestart);
10644 sv_catpvs(sv, "-");
10645 put_byte(sv, i - 1);
10646 }
10647 rangestart = -1;
10648 }
10649 }
f3a2811a 10650 sv_catpvs(sv, "]");
1de06328
YO
10651 }
10652
a3621e74 10653 } else if (k == CURLY) {
cb434fcc 10654 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
10655 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
10656 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 10657 }
2c2d71f5
JH
10658 else if (k == WHILEM && o->flags) /* Ordinal/of */
10659 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 10660 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 10661 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 10662 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 10663 if ( k != REF || (OP(o) < NREF)) {
502c6561 10664 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
10665 SV **name= av_fetch(list, ARG(o), 0 );
10666 if (name)
10667 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10668 }
10669 else {
502c6561 10670 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 10671 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
10672 I32 *nums=(I32*)SvPVX(sv_dat);
10673 SV **name= av_fetch(list, nums[0], 0 );
10674 I32 n;
10675 if (name) {
10676 for ( n=0; n<SvIVX(sv_dat); n++ ) {
10677 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
10678 (n ? "," : ""), (IV)nums[n]);
10679 }
10680 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 10681 }
1f1031fe 10682 }
ee9b8eae 10683 }
1f1031fe 10684 } else if (k == GOSUB)
6bda09f9 10685 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
10686 else if (k == VERB) {
10687 if (!o->flags)
10688 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 10689 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 10690 } else if (k == LOGICAL)
04ebc1ab 10691 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 10692 else if (k == FOLDCHAR)
df44d732 10693 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
10694 else if (k == ANYOF) {
10695 int i, rangestart = -1;
2d03de9c 10696 const U8 flags = ANYOF_FLAGS(o);
24d786f4 10697 int do_sep = 0;
0bd48802
AL
10698
10699 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
10700 static const char * const anyofs[] = {
653099ff
GS
10701 "\\w",
10702 "\\W",
10703 "\\s",
10704 "\\S",
10705 "\\d",
10706 "\\D",
10707 "[:alnum:]",
10708 "[:^alnum:]",
10709 "[:alpha:]",
10710 "[:^alpha:]",
10711 "[:ascii:]",
10712 "[:^ascii:]",
24d786f4
YO
10713 "[:cntrl:]",
10714 "[:^cntrl:]",
653099ff
GS
10715 "[:graph:]",
10716 "[:^graph:]",
10717 "[:lower:]",
10718 "[:^lower:]",
10719 "[:print:]",
10720 "[:^print:]",
10721 "[:punct:]",
10722 "[:^punct:]",
10723 "[:upper:]",
aaa51d5e 10724 "[:^upper:]",
653099ff 10725 "[:xdigit:]",
aaa51d5e
JF
10726 "[:^xdigit:]",
10727 "[:space:]",
10728 "[:^space:]",
10729 "[:blank:]",
10730 "[:^blank:]"
653099ff
GS
10731 };
10732
19860706 10733 if (flags & ANYOF_LOCALE)
396482e1 10734 sv_catpvs(sv, "{loc}");
39065660 10735 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 10736 sv_catpvs(sv, "{i}");
653099ff 10737 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 10738 if (flags & ANYOF_INVERT)
396482e1 10739 sv_catpvs(sv, "^");
3339dfd8
YO
10740
10741 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
10742 for (i = 0; i <= 256; i++) {
10743 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
10744 if (rangestart == -1)
10745 rangestart = i;
10746 } else if (rangestart != -1) {
10747 if (i <= rangestart + 3)
10748 for (; rangestart < i; rangestart++)
653099ff 10749 put_byte(sv, rangestart);
ffc61ed2
JH
10750 else {
10751 put_byte(sv, rangestart);
396482e1 10752 sv_catpvs(sv, "-");
ffc61ed2 10753 put_byte(sv, i - 1);
653099ff 10754 }
24d786f4 10755 do_sep = 1;
ffc61ed2 10756 rangestart = -1;
653099ff 10757 }
847a199f 10758 }
3339dfd8
YO
10759
10760 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
10761 /* output any special charclass tests (used entirely under use locale) */
10762 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 10763 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 10764 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 10765 sv_catpv(sv, anyofs[i]);
24d786f4
YO
10766 do_sep = 1;
10767 }
10768
3339dfd8
YO
10769 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
10770
11454c59
KW
10771 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
10772 sv_catpvs(sv, "{non-utf8-latin1-all}");
10773 }
10774
3339dfd8 10775 /* output information about the unicode matching */
ef87b810 10776 if (flags & ANYOF_UNICODE_ALL)
396482e1 10777 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
10778 else if (flags & ANYOF_UTF8)
10779 sv_catpvs(sv, "{unicode}");
f5ecd18d 10780 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 10781 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
10782
10783 {
10784 SV *lv;
32fc9b6a 10785 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 10786
ffc61ed2
JH
10787 if (lv) {
10788 if (sw) {
89ebb4a3 10789 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 10790
ffc61ed2 10791 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 10792 uvchr_to_utf8(s, i);
ffc61ed2 10793
3568d838 10794 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
10795 if (rangestart == -1)
10796 rangestart = i;
10797 } else if (rangestart != -1) {
ffc61ed2
JH
10798 if (i <= rangestart + 3)
10799 for (; rangestart < i; rangestart++) {
2d03de9c
AL
10800 const U8 * const e = uvchr_to_utf8(s,rangestart);
10801 U8 *p;
10802 for(p = s; p < e; p++)
ffc61ed2
JH
10803 put_byte(sv, *p);
10804 }
10805 else {
2d03de9c
AL
10806 const U8 *e = uvchr_to_utf8(s,rangestart);
10807 U8 *p;
10808 for (p = s; p < e; p++)
ffc61ed2 10809 put_byte(sv, *p);
396482e1 10810 sv_catpvs(sv, "-");
2d03de9c
AL
10811 e = uvchr_to_utf8(s, i-1);
10812 for (p = s; p < e; p++)
1df70142 10813 put_byte(sv, *p);
ffc61ed2
JH
10814 }
10815 rangestart = -1;
10816 }
19860706 10817 }
ffc61ed2 10818
396482e1 10819 sv_catpvs(sv, "..."); /* et cetera */
19860706 10820 }
fde631ed 10821
ffc61ed2 10822 {
2e0de35c 10823 char *s = savesvpv(lv);
c445ea15 10824 char * const origs = s;
b81d288d 10825
3dab1dad
YO
10826 while (*s && *s != '\n')
10827 s++;
b81d288d 10828
ffc61ed2 10829 if (*s == '\n') {
2d03de9c 10830 const char * const t = ++s;
ffc61ed2
JH
10831
10832 while (*s) {
10833 if (*s == '\n')
10834 *s = ' ';
10835 s++;
10836 }
10837 if (s[-1] == ' ')
10838 s[-1] = 0;
10839
10840 sv_catpv(sv, t);
fde631ed 10841 }
b81d288d 10842
ffc61ed2 10843 Safefree(origs);
fde631ed
JH
10844 }
10845 }
653099ff 10846 }
ffc61ed2 10847
653099ff
GS
10848 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10849 }
9b155405 10850 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 10851 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 10852#else
96a5add6 10853 PERL_UNUSED_CONTEXT;
65e66c80
SP
10854 PERL_UNUSED_ARG(sv);
10855 PERL_UNUSED_ARG(o);
f9049ba1 10856 PERL_UNUSED_ARG(prog);
17c3b450 10857#endif /* DEBUGGING */
35ff7856 10858}
a687059c 10859
cad2e5aa 10860SV *
288b8c02 10861Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 10862{ /* Assume that RE_INTUIT is set */
97aff369 10863 dVAR;
288b8c02 10864 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 10865 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10866
10867 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
10868 PERL_UNUSED_CONTEXT;
10869
a3621e74 10870 DEBUG_COMPILE_r(
cfd0369c 10871 {
2d03de9c 10872 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 10873 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
10874
10875 if (!PL_colorset) reginitcolors();
10876 PerlIO_printf(Perl_debug_log,
a0288114 10877 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
10878 PL_colors[4],
10879 prog->check_substr ? "" : "utf8 ",
10880 PL_colors[5],PL_colors[0],
cad2e5aa
JH
10881 s,
10882 PL_colors[1],
10883 (strlen(s) > 60 ? "..." : ""));
10884 } );
10885
33b8afdf 10886 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
10887}
10888
84da74a7 10889/*
f8149455 10890 pregfree()
84da74a7 10891
f8149455
YO
10892 handles refcounting and freeing the perl core regexp structure. When
10893 it is necessary to actually free the structure the first thing it
3b753521 10894 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
10895 the regexp, allowing the handling of the void *pprivate; member
10896 first. (This routine is not overridable by extensions, which is why
10897 the extensions free is called first.)
10898
10899 See regdupe and regdupe_internal if you change anything here.
84da74a7 10900*/
f8149455 10901#ifndef PERL_IN_XSUB_RE
2b69d0c2 10902void
84679df5 10903Perl_pregfree(pTHX_ REGEXP *r)
a687059c 10904{
288b8c02
NC
10905 SvREFCNT_dec(r);
10906}
10907
10908void
10909Perl_pregfree2(pTHX_ REGEXP *rx)
10910{
27da23d5 10911 dVAR;
288b8c02 10912 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 10913 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 10914
7918f24d
NC
10915 PERL_ARGS_ASSERT_PREGFREE2;
10916
28d8d7f4
YO
10917 if (r->mother_re) {
10918 ReREFCNT_dec(r->mother_re);
10919 } else {
288b8c02 10920 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 10921 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
10922 }
10923 if (r->substrs) {
ef8d46e8
VP
10924 SvREFCNT_dec(r->anchored_substr);
10925 SvREFCNT_dec(r->anchored_utf8);
10926 SvREFCNT_dec(r->float_substr);
10927 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
10928 Safefree(r->substrs);
10929 }
288b8c02 10930 RX_MATCH_COPY_FREE(rx);
f8c7b90f 10931#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 10932 SvREFCNT_dec(r->saved_copy);
ed252734 10933#endif
f0ab9afb 10934 Safefree(r->offs);
f8149455 10935}
28d8d7f4
YO
10936
10937/* reg_temp_copy()
10938
10939 This is a hacky workaround to the structural issue of match results
10940 being stored in the regexp structure which is in turn stored in
10941 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10942 could be PL_curpm in multiple contexts, and could require multiple
10943 result sets being associated with the pattern simultaneously, such
10944 as when doing a recursive match with (??{$qr})
10945
10946 The solution is to make a lightweight copy of the regexp structure
10947 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 10948 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
10949 the starp/end and the actual regexp structure itself.
10950
10951*/
10952
10953
84679df5 10954REGEXP *
f0826785 10955Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 10956{
f0826785 10957 struct regexp *ret;
288b8c02 10958 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 10959 register const I32 npar = r->nparens+1;
7918f24d
NC
10960
10961 PERL_ARGS_ASSERT_REG_TEMP_COPY;
10962
f0826785
BM
10963 if (!ret_x)
10964 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10965 ret = (struct regexp *)SvANY(ret_x);
10966
288b8c02 10967 (void)ReREFCNT_inc(rx);
f7c278bf
NC
10968 /* We can take advantage of the existing "copied buffer" mechanism in SVs
10969 by pointing directly at the buffer, but flagging that the allocated
10970 space in the copy is zero. As we've just done a struct copy, it's now
10971 a case of zero-ing that, rather than copying the current length. */
10972 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 10973 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
10974 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10975 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 10976 SvLEN_set(ret_x, 0);
b9ad13ac 10977 SvSTASH_set(ret_x, NULL);
703c388d 10978 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
10979 Newx(ret->offs, npar, regexp_paren_pair);
10980 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 10981 if (r->substrs) {
28d8d7f4 10982 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
10983 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10984
10985 SvREFCNT_inc_void(ret->anchored_substr);
10986 SvREFCNT_inc_void(ret->anchored_utf8);
10987 SvREFCNT_inc_void(ret->float_substr);
10988 SvREFCNT_inc_void(ret->float_utf8);
10989
10990 /* check_substr and check_utf8, if non-NULL, point to either their
10991 anchored or float namesakes, and don't hold a second reference. */
486913e4 10992 }
288b8c02 10993 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 10994#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 10995 ret->saved_copy = NULL;
28d8d7f4 10996#endif
288b8c02 10997 ret->mother_re = rx;
28d8d7f4 10998
288b8c02 10999 return ret_x;
28d8d7f4 11000}
f8149455
YO
11001#endif
11002
11003/* regfree_internal()
11004
11005 Free the private data in a regexp. This is overloadable by
11006 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 11007 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
11008 know about, however of course we have to handle the
11009 regexp_internal structure when no extension is in use.
11010
11011 Note this is called before freeing anything in the regexp
11012 structure.
11013 */
11014
11015void
288b8c02 11016Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
11017{
11018 dVAR;
288b8c02 11019 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11020 RXi_GET_DECL(r,ri);
11021 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11022
11023 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11024
f8149455
YO
11025 DEBUG_COMPILE_r({
11026 if (!PL_colorset)
11027 reginitcolors();
11028 {
11029 SV *dsv= sv_newmortal();
3c8556c3 11030 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 11031 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
11032 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11033 PL_colors[4],PL_colors[5],s);
11034 }
11035 });
7122b237
YO
11036#ifdef RE_TRACK_PATTERN_OFFSETS
11037 if (ri->u.offsets)
11038 Safefree(ri->u.offsets); /* 20010421 MJD */
11039#endif
f8fc2ecf
YO
11040 if (ri->data) {
11041 int n = ri->data->count;
f3548bdc
DM
11042 PAD* new_comppad = NULL;
11043 PAD* old_comppad;
4026c95a 11044 PADOFFSET refcnt;
dfad63ad 11045
c277df42 11046 while (--n >= 0) {
261faec3 11047 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 11048 switch (ri->data->what[n]) {
af534a04 11049 case 'a':
c277df42 11050 case 's':
81714fb9 11051 case 'S':
55eed653 11052 case 'u':
ad64d0ec 11053 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 11054 break;
653099ff 11055 case 'f':
f8fc2ecf 11056 Safefree(ri->data->data[n]);
653099ff 11057 break;
dfad63ad 11058 case 'p':
502c6561 11059 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 11060 break;
c277df42 11061 case 'o':
dfad63ad 11062 if (new_comppad == NULL)
cea2e8a9 11063 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
11064 PAD_SAVE_LOCAL(old_comppad,
11065 /* Watch out for global destruction's random ordering. */
c445ea15 11066 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 11067 );
b34c0dd4 11068 OP_REFCNT_LOCK;
f8fc2ecf 11069 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
11070 OP_REFCNT_UNLOCK;
11071 if (!refcnt)
f8fc2ecf 11072 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 11073
f3548bdc 11074 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 11075 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 11076 new_comppad = NULL;
c277df42
IZ
11077 break;
11078 case 'n':
9e55ce06 11079 break;
07be1b83 11080 case 'T':
be8e71aa
YO
11081 { /* Aho Corasick add-on structure for a trie node.
11082 Used in stclass optimization only */
07be1b83 11083 U32 refcount;
f8fc2ecf 11084 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
11085 OP_REFCNT_LOCK;
11086 refcount = --aho->refcount;
11087 OP_REFCNT_UNLOCK;
11088 if ( !refcount ) {
446bd890
NC
11089 PerlMemShared_free(aho->states);
11090 PerlMemShared_free(aho->fail);
446bd890
NC
11091 /* do this last!!!! */
11092 PerlMemShared_free(ri->data->data[n]);
11093 PerlMemShared_free(ri->regstclass);
07be1b83
YO
11094 }
11095 }
11096 break;
a3621e74 11097 case 't':
07be1b83 11098 {
be8e71aa 11099 /* trie structure. */
07be1b83 11100 U32 refcount;
f8fc2ecf 11101 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
11102 OP_REFCNT_LOCK;
11103 refcount = --trie->refcount;
11104 OP_REFCNT_UNLOCK;
11105 if ( !refcount ) {
446bd890 11106 PerlMemShared_free(trie->charmap);
446bd890
NC
11107 PerlMemShared_free(trie->states);
11108 PerlMemShared_free(trie->trans);
07be1b83 11109 if (trie->bitmap)
446bd890 11110 PerlMemShared_free(trie->bitmap);
786e8c11 11111 if (trie->jump)
446bd890 11112 PerlMemShared_free(trie->jump);
2e64971a 11113 PerlMemShared_free(trie->wordinfo);
446bd890
NC
11114 /* do this last!!!! */
11115 PerlMemShared_free(ri->data->data[n]);
a3621e74 11116 }
07be1b83
YO
11117 }
11118 break;
c277df42 11119 default:
f8fc2ecf 11120 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
11121 }
11122 }
f8fc2ecf
YO
11123 Safefree(ri->data->what);
11124 Safefree(ri->data);
a0d0e21e 11125 }
28d8d7f4 11126
f8fc2ecf 11127 Safefree(ri);
a687059c 11128}
c277df42 11129
a09252eb
NC
11130#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11131#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
11132#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11133
11134/*
32cd70f6 11135 re_dup - duplicate a regexp.
84da74a7 11136
8233f606
DM
11137 This routine is expected to clone a given regexp structure. It is only
11138 compiled under USE_ITHREADS.
32cd70f6 11139
f8149455
YO
11140 After all of the core data stored in struct regexp is duplicated
11141 the regexp_engine.dupe method is used to copy any private data
11142 stored in the *pprivate pointer. This allows extensions to handle
11143 any duplication it needs to do.
11144
11145 See pregfree() and regfree_internal() if you change anything here.
84da74a7 11146*/
a3c0e9ca 11147#if defined(USE_ITHREADS)
f8149455 11148#ifndef PERL_IN_XSUB_RE
288b8c02
NC
11149void
11150Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 11151{
84da74a7 11152 dVAR;
a86a1ca7 11153 I32 npar;
288b8c02
NC
11154 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11155 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 11156
7918f24d
NC
11157 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11158
84da74a7 11159 npar = r->nparens+1;
f0ab9afb
NC
11160 Newx(ret->offs, npar, regexp_paren_pair);
11161 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 11162 if(ret->swap) {
28d8d7f4 11163 /* no need to copy these */
f0ab9afb 11164 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 11165 }
84da74a7 11166
6057429f 11167 if (ret->substrs) {
32cd70f6
NC
11168 /* Do it this way to avoid reading from *r after the StructCopy().
11169 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11170 cache, it doesn't matter. */
66b1de87
NC
11171 const bool anchored = r->check_substr
11172 ? r->check_substr == r->anchored_substr
11173 : r->check_utf8 == r->anchored_utf8;
785a26d5 11174 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
11175 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11176
32cd70f6
NC
11177 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11178 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11179 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11180 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 11181
32cd70f6
NC
11182 /* check_substr and check_utf8, if non-NULL, point to either their
11183 anchored or float namesakes, and don't hold a second reference. */
11184
11185 if (ret->check_substr) {
11186 if (anchored) {
11187 assert(r->check_utf8 == r->anchored_utf8);
11188 ret->check_substr = ret->anchored_substr;
11189 ret->check_utf8 = ret->anchored_utf8;
11190 } else {
11191 assert(r->check_substr == r->float_substr);
11192 assert(r->check_utf8 == r->float_utf8);
11193 ret->check_substr = ret->float_substr;
11194 ret->check_utf8 = ret->float_utf8;
11195 }
66b1de87
NC
11196 } else if (ret->check_utf8) {
11197 if (anchored) {
11198 ret->check_utf8 = ret->anchored_utf8;
11199 } else {
11200 ret->check_utf8 = ret->float_utf8;
11201 }
32cd70f6 11202 }
6057429f 11203 }
f8149455 11204
5daac39c 11205 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 11206
6057429f 11207 if (ret->pprivate)
288b8c02 11208 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 11209
288b8c02 11210 if (RX_MATCH_COPIED(dstr))
6057429f 11211 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
11212 else
11213 ret->subbeg = NULL;
11214#ifdef PERL_OLD_COPY_ON_WRITE
11215 ret->saved_copy = NULL;
11216#endif
6057429f 11217
c2123ae3
NC
11218 if (ret->mother_re) {
11219 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11220 /* Our storage points directly to our mother regexp, but that's
11221 1: a buffer in a different thread
11222 2: something we no longer hold a reference on
11223 so we need to copy it locally. */
11224 /* Note we need to sue SvCUR() on our mother_re, because it, in
11225 turn, may well be pointing to its own mother_re. */
11226 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11227 SvCUR(ret->mother_re)+1));
11228 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11229 }
11230 ret->mother_re = NULL;
11231 }
6057429f 11232 ret->gofs = 0;
f8149455
YO
11233}
11234#endif /* PERL_IN_XSUB_RE */
11235
11236/*
11237 regdupe_internal()
11238
11239 This is the internal complement to regdupe() which is used to copy
11240 the structure pointed to by the *pprivate pointer in the regexp.
11241 This is the core version of the extension overridable cloning hook.
11242 The regexp structure being duplicated will be copied by perl prior
11243 to this and will be provided as the regexp *r argument, however
11244 with the /old/ structures pprivate pointer value. Thus this routine
11245 may override any copying normally done by perl.
11246
11247 It returns a pointer to the new regexp_internal structure.
11248*/
11249
11250void *
288b8c02 11251Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
11252{
11253 dVAR;
288b8c02 11254 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11255 regexp_internal *reti;
11256 int len, npar;
11257 RXi_GET_DECL(r,ri);
7918f24d
NC
11258
11259 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
11260
11261 npar = r->nparens+1;
7122b237 11262 len = ProgLen(ri);
f8149455 11263
45cf4570 11264 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
11265 Copy(ri->program, reti->program, len+1, regnode);
11266
f8149455 11267
f8fc2ecf 11268 reti->regstclass = NULL;
bcdf7404 11269
f8fc2ecf 11270 if (ri->data) {
84da74a7 11271 struct reg_data *d;
f8fc2ecf 11272 const int count = ri->data->count;
84da74a7
YO
11273 int i;
11274
11275 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11276 char, struct reg_data);
11277 Newx(d->what, count, U8);
11278
11279 d->count = count;
11280 for (i = 0; i < count; i++) {
f8fc2ecf 11281 d->what[i] = ri->data->what[i];
84da74a7 11282 switch (d->what[i]) {
af534a04 11283 /* legal options are one of: sSfpontTua
84da74a7 11284 see also regcomp.h and pregfree() */
af534a04 11285 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 11286 case 's':
81714fb9 11287 case 'S':
0536c0a7 11288 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 11289 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 11290 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 11291 break;
84da74a7
YO
11292 case 'f':
11293 /* This is cheating. */
11294 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 11295 StructCopy(ri->data->data[i], d->data[i],
84da74a7 11296 struct regnode_charclass_class);
f8fc2ecf 11297 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
11298 break;
11299 case 'o':
bbe252da
YO
11300 /* Compiled op trees are readonly and in shared memory,
11301 and can thus be shared without duplication. */
84da74a7 11302 OP_REFCNT_LOCK;
f8fc2ecf 11303 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
11304 OP_REFCNT_UNLOCK;
11305 break;
23eab42c
NC
11306 case 'T':
11307 /* Trie stclasses are readonly and can thus be shared
11308 * without duplication. We free the stclass in pregfree
11309 * when the corresponding reg_ac_data struct is freed.
11310 */
11311 reti->regstclass= ri->regstclass;
11312 /* Fall through */
84da74a7 11313 case 't':
84da74a7 11314 OP_REFCNT_LOCK;
0536c0a7 11315 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 11316 OP_REFCNT_UNLOCK;
0536c0a7
NC
11317 /* Fall through */
11318 case 'n':
11319 d->data[i] = ri->data->data[i];
84da74a7 11320 break;
84da74a7 11321 default:
f8fc2ecf 11322 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
11323 }
11324 }
11325
f8fc2ecf 11326 reti->data = d;
84da74a7
YO
11327 }
11328 else
f8fc2ecf 11329 reti->data = NULL;
84da74a7 11330
cde0cee5
YO
11331 reti->name_list_idx = ri->name_list_idx;
11332
7122b237
YO
11333#ifdef RE_TRACK_PATTERN_OFFSETS
11334 if (ri->u.offsets) {
11335 Newx(reti->u.offsets, 2*len+1, U32);
11336 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11337 }
11338#else
11339 SetProgLen(reti,len);
11340#endif
11341
f8149455 11342 return (void*)reti;
84da74a7 11343}
f8149455
YO
11344
11345#endif /* USE_ITHREADS */
84da74a7 11346
f8149455 11347#ifndef PERL_IN_XSUB_RE
bcdf7404 11348
c277df42
IZ
11349/*
11350 - regnext - dig the "next" pointer out of a node
c277df42
IZ
11351 */
11352regnode *
864dbfa3 11353Perl_regnext(pTHX_ register regnode *p)
c277df42 11354{
97aff369 11355 dVAR;
c277df42
IZ
11356 register I32 offset;
11357
f8fc2ecf 11358 if (!p)
c277df42
IZ
11359 return(NULL);
11360
35db910f
KW
11361 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11362 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11363 }
11364
c277df42
IZ
11365 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11366 if (offset == 0)
11367 return(NULL);
11368
c277df42 11369 return(p+offset);
c277df42 11370}
76234dfb 11371#endif
c277df42 11372
01f988be 11373STATIC void
cea2e8a9 11374S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
11375{
11376 va_list args;
11377 STRLEN l1 = strlen(pat1);
11378 STRLEN l2 = strlen(pat2);
11379 char buf[512];
06bf62c7 11380 SV *msv;
73d840c0 11381 const char *message;
c277df42 11382
7918f24d
NC
11383 PERL_ARGS_ASSERT_RE_CROAK2;
11384
c277df42
IZ
11385 if (l1 > 510)
11386 l1 = 510;
11387 if (l1 + l2 > 510)
11388 l2 = 510 - l1;
11389 Copy(pat1, buf, l1 , char);
11390 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
11391 buf[l1 + l2] = '\n';
11392 buf[l1 + l2 + 1] = '\0';
8736538c
AS
11393#ifdef I_STDARG
11394 /* ANSI variant takes additional second argument */
c277df42 11395 va_start(args, pat2);
8736538c
AS
11396#else
11397 va_start(args);
11398#endif
5a844595 11399 msv = vmess(buf, &args);
c277df42 11400 va_end(args);
cfd0369c 11401 message = SvPV_const(msv,l1);
c277df42
IZ
11402 if (l1 > 512)
11403 l1 = 512;
11404 Copy(message, buf, l1 , char);
197cf9b9 11405 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 11406 Perl_croak(aTHX_ "%s", buf);
c277df42 11407}
a0ed51b3
LW
11408
11409/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11410
76234dfb 11411#ifndef PERL_IN_XSUB_RE
a0ed51b3 11412void
864dbfa3 11413Perl_save_re_context(pTHX)
b81d288d 11414{
97aff369 11415 dVAR;
1ade1aa1
NC
11416
11417 struct re_save_state *state;
11418
11419 SAVEVPTR(PL_curcop);
11420 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11421
11422 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11423 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 11424 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 11425
46ab3289 11426 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 11427
a0ed51b3 11428 PL_reg_start_tmp = 0;
a0ed51b3 11429 PL_reg_start_tmpl = 0;
c445ea15 11430 PL_reg_oldsaved = NULL;
a5db57d6 11431 PL_reg_oldsavedlen = 0;
a5db57d6 11432 PL_reg_maxiter = 0;
a5db57d6 11433 PL_reg_leftiter = 0;
c445ea15 11434 PL_reg_poscache = NULL;
a5db57d6 11435 PL_reg_poscache_size = 0;
1ade1aa1
NC
11436#ifdef PERL_OLD_COPY_ON_WRITE
11437 PL_nrs = NULL;
11438#endif
ada6e8a9 11439
c445ea15
AL
11440 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11441 if (PL_curpm) {
11442 const REGEXP * const rx = PM_GETRE(PL_curpm);
11443 if (rx) {
1df70142 11444 U32 i;
07bc277f 11445 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 11446 char digits[TYPE_CHARS(long)];
d9fad198 11447 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
11448 GV *const *const gvp
11449 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11450
b37c2d43
AL
11451 if (gvp) {
11452 GV * const gv = *gvp;
11453 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11454 save_scalar(gv);
49f27e4b 11455 }
ada6e8a9
AMS
11456 }
11457 }
11458 }
a0ed51b3 11459}
76234dfb 11460#endif
51371543 11461
51371543 11462static void
acfe0abc 11463clear_re(pTHX_ void *r)
51371543 11464{
97aff369 11465 dVAR;
84679df5 11466 ReREFCNT_dec((REGEXP *)r);
51371543 11467}
ffbc6a93 11468
a28509cc
AL
11469#ifdef DEBUGGING
11470
11471STATIC void
11472S_put_byte(pTHX_ SV *sv, int c)
11473{
7918f24d
NC
11474 PERL_ARGS_ASSERT_PUT_BYTE;
11475
7fddd944
NC
11476 /* Our definition of isPRINT() ignores locales, so only bytes that are
11477 not part of UTF-8 are considered printable. I assume that the same
11478 holds for UTF-EBCDIC.
11479 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11480 which Wikipedia says:
11481
11482 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11483 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11484 identical, to the ASCII delete (DEL) or rubout control character.
11485 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
11486 if (!isPRINT(c)) {
11487 if (c < 256) {
11488 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11489 }
11490 else {
11491 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11492 }
11493 }
5e7aa789 11494 else {
88c9ea1e 11495 const char string = c;
5e7aa789
NC
11496 if (c == '-' || c == ']' || c == '\\' || c == '^')
11497 sv_catpvs(sv, "\\");
11498 sv_catpvn(sv, &string, 1);
11499 }
a28509cc
AL
11500}
11501
786e8c11 11502
3dab1dad
YO
11503#define CLEAR_OPTSTART \
11504 if (optstart) STMT_START { \
70685ca0 11505 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
11506 optstart=NULL; \
11507 } STMT_END
11508
786e8c11 11509#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 11510
b5a2f8d8
NC
11511STATIC const regnode *
11512S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
11513 const regnode *last, const regnode *plast,
11514 SV* sv, I32 indent, U32 depth)
a28509cc 11515{
97aff369 11516 dVAR;
786e8c11 11517 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 11518 register const regnode *next;
3dab1dad 11519 const regnode *optstart= NULL;
1f1031fe 11520
f8fc2ecf 11521 RXi_GET_DECL(r,ri);
3dab1dad 11522 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11523
11524 PERL_ARGS_ASSERT_DUMPUNTIL;
11525
786e8c11
YO
11526#ifdef DEBUG_DUMPUNTIL
11527 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11528 last ? last-start : 0,plast ? plast-start : 0);
11529#endif
11530
11531 if (plast && plast < last)
11532 last= plast;
11533
11534 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 11535 /* While that wasn't END last time... */
a28509cc
AL
11536 NODE_ALIGN(node);
11537 op = OP(node);
de734bd5 11538 if (op == CLOSE || op == WHILEM)
786e8c11 11539 indent--;
b5a2f8d8 11540 next = regnext((regnode *)node);
1f1031fe 11541
a28509cc 11542 /* Where, what. */
8e11feef 11543 if (OP(node) == OPTIMIZED) {
e68ec53f 11544 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 11545 optstart = node;
3dab1dad 11546 else
8e11feef 11547 goto after_print;
3dab1dad
YO
11548 } else
11549 CLEAR_OPTSTART;
1f1031fe 11550
32fc9b6a 11551 regprop(r, sv, node);
a28509cc 11552 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 11553 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
11554
11555 if (OP(node) != OPTIMIZED) {
11556 if (next == NULL) /* Next ptr. */
11557 PerlIO_printf(Perl_debug_log, " (0)");
11558 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11559 PerlIO_printf(Perl_debug_log, " (FAIL)");
11560 else
11561 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11562 (void)PerlIO_putc(Perl_debug_log, '\n');
11563 }
11564
a28509cc
AL
11565 after_print:
11566 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
11567 assert(next);
11568 {
11569 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
11570 ? regnext((regnode *)next)
11571 : next);
be8e71aa
YO
11572 if (last && nnode > last)
11573 nnode = last;
786e8c11 11574 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 11575 }
a28509cc
AL
11576 }
11577 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 11578 assert(next);
786e8c11 11579 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
11580 }
11581 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 11582 const regnode *this_trie = node;
1de06328 11583 const char op = OP(node);
647f639f 11584 const U32 n = ARG(node);
1de06328 11585 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 11586 (reg_ac_data *)ri->data->data[n] :
1de06328 11587 NULL;
3251b653
NC
11588 const reg_trie_data * const trie =
11589 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 11590#ifdef DEBUGGING
502c6561 11591 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 11592#endif
786e8c11 11593 const regnode *nextbranch= NULL;
a28509cc 11594 I32 word_idx;
76f68e9b 11595 sv_setpvs(sv, "");
786e8c11 11596 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 11597 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
11598
11599 PerlIO_printf(Perl_debug_log, "%*s%s ",
11600 (int)(2*(indent+3)), "",
11601 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
11602 PL_colors[0], PL_colors[1],
11603 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 11604 PERL_PV_PRETTY_ELLIPSES |
7f69552c 11605 PERL_PV_PRETTY_LTGT
786e8c11
YO
11606 )
11607 : "???"
11608 );
11609 if (trie->jump) {
40d049e4 11610 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
11611 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11612 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
11613 if (dist) {
11614 if (!nextbranch)
24b23f37 11615 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
11616 DUMPUNTIL(this_trie + dist, nextbranch);
11617 }
786e8c11
YO
11618 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11619 nextbranch= regnext((regnode *)nextbranch);
11620 } else {
11621 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 11622 }
786e8c11
YO
11623 }
11624 if (last && next > last)
11625 node= last;
11626 else
11627 node= next;
a28509cc 11628 }
786e8c11
YO
11629 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
11630 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11631 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
11632 }
11633 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 11634 assert(next);
786e8c11 11635 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
11636 }
11637 else if ( op == PLUS || op == STAR) {
786e8c11 11638 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 11639 }
f56b6394 11640 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 11641 /* arglen 1 + class block */
4a3ee7a8 11642 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
11643 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11644 node = NEXTOPER(node);
11645 }
11646 else if (PL_regkind[(U8)op] == EXACT) {
11647 /* Literal string, where present. */
11648 node += NODE_SZ_STR(node) - 1;
11649 node = NEXTOPER(node);
11650 }
11651 else {
11652 node = NEXTOPER(node);
11653 node += regarglen[(U8)op];
11654 }
11655 if (op == CURLYX || op == OPEN)
786e8c11 11656 indent++;
a28509cc 11657 }
3dab1dad 11658 CLEAR_OPTSTART;
786e8c11 11659#ifdef DEBUG_DUMPUNTIL
70685ca0 11660 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 11661#endif
1de06328 11662 return node;
a28509cc
AL
11663}
11664
11665#endif /* DEBUGGING */
11666
241d1a3b
NC
11667/*
11668 * Local variables:
11669 * c-indentation-style: bsd
11670 * c-basic-offset: 4
11671 * indent-tabs-mode: t
11672 * End:
11673 *
37442d52
RGS
11674 * ex: set ts=8 sts=4 sw=4 noet:
11675 */