This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlcall: subjunctive, not indicative
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 203 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
486ec47a 283 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
486ec47a 319 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 324 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97 371#define UTF cBOOL(RExC_utf8)
a62b1201
KW
372#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
373#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
374#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
375#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
376#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 377
43fead97 378#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 379
ffc61ed2 380#define OOB_UNICODE 12345678
93733859 381#define OOB_NAMEDCLASS -1
b8c5462f 382
a0ed51b3
LW
383#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
384#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
385
8615cb43 386
b45f050a
JF
387/* length of regex to show in messages that don't mark a position within */
388#define RegexLengthToShowInErrorMessages 127
389
390/*
391 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
392 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
393 * op/pragma/warn/regcomp.
394 */
7253e4e3
RK
395#define MARKER1 "<-- HERE" /* marker as it appears in the description */
396#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 397
7253e4e3 398#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
399
400/*
401 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
402 * arg. Show regex, up to a maximum length. If it's too long, chop and add
403 * "...".
404 */
58e23c8d 405#define _FAIL(code) STMT_START { \
bfed75c6 406 const char *ellipses = ""; \
ccb2c380
MP
407 IV len = RExC_end - RExC_precomp; \
408 \
409 if (!SIZE_ONLY) \
288b8c02 410 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
411 if (len > RegexLengthToShowInErrorMessages) { \
412 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
413 len = RegexLengthToShowInErrorMessages - 10; \
414 ellipses = "..."; \
415 } \
58e23c8d 416 code; \
ccb2c380 417} STMT_END
8615cb43 418
58e23c8d
YO
419#define FAIL(msg) _FAIL( \
420 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
421 msg, (int)len, RExC_precomp, ellipses))
422
423#define FAIL2(msg,arg) _FAIL( \
424 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
425 arg, (int)len, RExC_precomp, ellipses))
426
b45f050a 427/*
b45f050a
JF
428 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
429 */
ccb2c380 430#define Simple_vFAIL(m) STMT_START { \
a28509cc 431 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
432 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
433 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
434} STMT_END
b45f050a
JF
435
436/*
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
438 */
ccb2c380
MP
439#define vFAIL(m) STMT_START { \
440 if (!SIZE_ONLY) \
288b8c02 441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
442 Simple_vFAIL(m); \
443} STMT_END
b45f050a
JF
444
445/*
446 * Like Simple_vFAIL(), but accepts two arguments.
447 */
ccb2c380 448#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 449 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
450 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
451 (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
b45f050a
JF
453
454/*
455 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
456 */
ccb2c380
MP
457#define vFAIL2(m,a1) STMT_START { \
458 if (!SIZE_ONLY) \
288b8c02 459 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
460 Simple_vFAIL2(m, a1); \
461} STMT_END
b45f050a
JF
462
463
464/*
465 * Like Simple_vFAIL(), but accepts three arguments.
466 */
ccb2c380 467#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 468 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
b45f050a
JF
472
473/*
474 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
475 */
ccb2c380
MP
476#define vFAIL3(m,a1,a2) STMT_START { \
477 if (!SIZE_ONLY) \
288b8c02 478 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
479 Simple_vFAIL3(m, a1, a2); \
480} STMT_END
b45f050a
JF
481
482/*
483 * Like Simple_vFAIL(), but accepts four arguments.
484 */
ccb2c380 485#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 486 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
487 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
b45f050a 490
668c081a 491#define ckWARNreg(loc,m) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
495} STMT_END
496
668c081a 497#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 498 const IV offset = loc - RExC_precomp; \
d1d15184 499 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
500 m REPORT_LOCATION, \
501 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
502} STMT_END
503
668c081a 504#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 505 const IV offset = loc - RExC_precomp; \
668c081a 506 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
507 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
508} STMT_END
509
510#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 511 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
512 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
513 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
514} STMT_END
515
668c081a
NC
516#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
520} STMT_END
521
ccb2c380 522#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 523 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
524 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
526} STMT_END
527
668c081a
NC
528#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
529 const IV offset = loc - RExC_precomp; \
530 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
532} STMT_END
533
ccb2c380 534#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 535 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
536 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
538} STMT_END
9d1d55b5 539
8615cb43 540
cd439c50 541/* Allow for side effects in s */
ccb2c380
MP
542#define REGC(c,s) STMT_START { \
543 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
544} STMT_END
cd439c50 545
fac92740
MJD
546/* Macros for recording node offsets. 20001227 mjd@plover.com
547 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
548 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
549 * Element 0 holds the number n.
07be1b83 550 * Position is 1 indexed.
fac92740 551 */
7122b237
YO
552#ifndef RE_TRACK_PATTERN_OFFSETS
553#define Set_Node_Offset_To_R(node,byte)
554#define Set_Node_Offset(node,byte)
555#define Set_Cur_Node_Offset
556#define Set_Node_Length_To_R(node,len)
557#define Set_Node_Length(node,len)
558#define Set_Node_Cur_Length(node)
559#define Node_Offset(n)
560#define Node_Length(n)
561#define Set_Node_Offset_Length(node,offset,len)
562#define ProgLen(ri) ri->u.proglen
563#define SetProgLen(ri,x) ri->u.proglen = x
564#else
565#define ProgLen(ri) ri->u.offsets[0]
566#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
567#define Set_Node_Offset_To_R(node,byte) STMT_START { \
568 if (! SIZE_ONLY) { \
569 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 570 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 571 if((node) < 0) { \
551405c4 572 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
573 } else { \
574 RExC_offsets[2*(node)-1] = (byte); \
575 } \
576 } \
577} STMT_END
578
579#define Set_Node_Offset(node,byte) \
580 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
581#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
582
583#define Set_Node_Length_To_R(node,len) STMT_START { \
584 if (! SIZE_ONLY) { \
585 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 586 __LINE__, (int)(node), (int)(len))); \
ccb2c380 587 if((node) < 0) { \
551405c4 588 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
589 } else { \
590 RExC_offsets[2*(node)] = (len); \
591 } \
592 } \
593} STMT_END
594
595#define Set_Node_Length(node,len) \
596 Set_Node_Length_To_R((node)-RExC_emit_start, len)
597#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
598#define Set_Node_Cur_Length(node) \
599 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
600
601/* Get offsets and lengths */
602#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
603#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
604
07be1b83
YO
605#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
606 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
607 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
608} STMT_END
7122b237 609#endif
07be1b83
YO
610
611#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
612#define EXPERIMENTAL_INPLACESCAN
f427392e 613#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 614
304ee84b
YO
615#define DEBUG_STUDYDATA(str,data,depth) \
616DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 617 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
618 "%*s" str "Pos:%"IVdf"/%"IVdf \
619 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
620 (int)(depth)*2, "", \
621 (IV)((data)->pos_min), \
622 (IV)((data)->pos_delta), \
304ee84b 623 (UV)((data)->flags), \
1de06328 624 (IV)((data)->whilem_c), \
304ee84b
YO
625 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
626 is_inf ? "INF " : "" \
1de06328
YO
627 ); \
628 if ((data)->last_found) \
629 PerlIO_printf(Perl_debug_log, \
630 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
631 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
632 SvPVX_const((data)->last_found), \
633 (IV)((data)->last_end), \
634 (IV)((data)->last_start_min), \
635 (IV)((data)->last_start_max), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
638 SvPVX_const((data)->longest_fixed), \
639 (IV)((data)->offset_fixed), \
640 ((data)->longest && \
641 (data)->longest==&((data)->longest_float)) ? "*" : "", \
642 SvPVX_const((data)->longest_float), \
643 (IV)((data)->offset_float_min), \
644 (IV)((data)->offset_float_max) \
645 ); \
646 PerlIO_printf(Perl_debug_log,"\n"); \
647});
648
acfe0abc 649static void clear_re(pTHX_ void *r);
4327152a 650
653099ff 651/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 652 Update the longest found anchored substring and the longest found
653099ff
GS
653 floating substrings if needed. */
654
4327152a 655STATIC void
304ee84b 656S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 657{
e1ec3a88
AL
658 const STRLEN l = CHR_SVLEN(data->last_found);
659 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 660 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 661
7918f24d
NC
662 PERL_ARGS_ASSERT_SCAN_COMMIT;
663
c277df42 664 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 665 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
666 if (*data->longest == data->longest_fixed) {
667 data->offset_fixed = l ? data->last_start_min : data->pos_min;
668 if (data->flags & SF_BEFORE_EOL)
b81d288d 669 data->flags
c277df42
IZ
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
671 else
672 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
673 data->minlen_fixed=minlenp;
674 data->lookbehind_fixed=0;
a0ed51b3 675 }
304ee84b 676 else { /* *data->longest == data->longest_float */
c277df42 677 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
678 data->offset_float_max = (l
679 ? data->last_start_max
c277df42 680 : data->pos_min + data->pos_delta);
304ee84b 681 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 682 data->offset_float_max = I32_MAX;
c277df42 683 if (data->flags & SF_BEFORE_EOL)
b81d288d 684 data->flags
c277df42
IZ
685 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
686 else
687 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
688 data->minlen_float=minlenp;
689 data->lookbehind_float=0;
c277df42
IZ
690 }
691 }
692 SvCUR_set(data->last_found, 0);
0eda9292 693 {
a28509cc 694 SV * const sv = data->last_found;
097eb12c
AL
695 if (SvUTF8(sv) && SvMAGICAL(sv)) {
696 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
697 if (mg)
698 mg->mg_len = 0;
699 }
0eda9292 700 }
c277df42
IZ
701 data->last_end = -1;
702 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 703 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
704}
705
653099ff
GS
706/* Can match anything (initialization) */
707STATIC void
097eb12c 708S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 709{
7918f24d
NC
710 PERL_ARGS_ASSERT_CL_ANYTHING;
711
653099ff 712 ANYOF_CLASS_ZERO(cl);
f8bef550 713 ANYOF_BITMAP_SETALL(cl);
11454c59 714 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
715 if (LOC)
716 cl->flags |= ANYOF_LOCALE;
717}
718
719/* Can match anything (initialization) */
720STATIC int
5f66b61c 721S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
722{
723 int value;
724
7918f24d
NC
725 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
726
aaa51d5e 727 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
728 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
729 return 1;
1aa99e6b
IH
730 if (!(cl->flags & ANYOF_UNICODE_ALL))
731 return 0;
10edeb5d 732 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 733 return 0;
653099ff
GS
734 return 1;
735}
736
737/* Can match anything (initialization) */
738STATIC void
097eb12c 739S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 740{
7918f24d
NC
741 PERL_ARGS_ASSERT_CL_INIT;
742
8ecf7187 743 Zero(cl, 1, struct regnode_charclass_class);
653099ff 744 cl->type = ANYOF;
830247a4 745 cl_anything(pRExC_state, cl);
653099ff
GS
746}
747
748STATIC void
097eb12c 749S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 750{
7918f24d
NC
751 PERL_ARGS_ASSERT_CL_INIT_ZERO;
752
8ecf7187 753 Zero(cl, 1, struct regnode_charclass_class);
653099ff 754 cl->type = ANYOF;
830247a4 755 cl_anything(pRExC_state, cl);
653099ff
GS
756 if (LOC)
757 cl->flags |= ANYOF_LOCALE;
758}
759
760/* 'And' a given class with another one. Can create false positives */
761/* We assume that cl is not inverted */
762STATIC void
5f66b61c 763S_cl_and(struct regnode_charclass_class *cl,
a28509cc 764 const struct regnode_charclass_class *and_with)
653099ff 765{
7918f24d 766 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
767
768 assert(and_with->type == ANYOF);
1e6ade67
KW
769
770 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
771 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 772 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
773 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
774 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
775 int i;
776
777 if (and_with->flags & ANYOF_INVERT)
778 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
779 cl->bitmap[i] &= ~and_with->bitmap[i];
780 else
781 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
782 cl->bitmap[i] &= and_with->bitmap[i];
783 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
784 if (!(and_with->flags & ANYOF_EOS))
785 cl->flags &= ~ANYOF_EOS;
1aa99e6b 786
39065660
KW
787 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
788 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
789 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
790 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 791
3ff7ceb3 792 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 793 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 794 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
795 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
796 only the one(s)
797 actually set */
1aa99e6b
IH
798 ARG_SET(cl, ARG(and_with));
799 }
14ebb1a2
JH
800 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
801 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 802 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 803 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 804 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 805 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
806}
807
808/* 'OR' a given class with another one. Can create false positives */
809/* We assume that cl is not inverted */
810STATIC void
097eb12c 811S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 812{
7918f24d
NC
813 PERL_ARGS_ASSERT_CL_OR;
814
653099ff
GS
815 if (or_with->flags & ANYOF_INVERT) {
816 /* We do not use
817 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
818 * <= (B1 | !B2) | (CL1 | !CL2)
819 * which is wasteful if CL2 is small, but we ignore CL2:
820 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
821 * XXXX Can we handle case-fold? Unclear:
822 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
823 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
824 */
825 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
826 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
827 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
828 int i;
829
830 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
831 cl->bitmap[i] |= ~or_with->bitmap[i];
832 } /* XXXX: logic is complicated otherwise */
833 else {
830247a4 834 cl_anything(pRExC_state, cl);
653099ff
GS
835 }
836 } else {
837 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
838 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
839 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
840 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
841 int i;
842
843 /* OR char bitmap and class bitmap separately */
844 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
845 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 846 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
847 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
848 cl->classflags[i] |= or_with->classflags[i];
849 cl->flags |= ANYOF_CLASS;
850 }
851 }
852 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 853 cl_anything(pRExC_state, cl);
653099ff
GS
854 }
855 }
856 if (or_with->flags & ANYOF_EOS)
857 cl->flags |= ANYOF_EOS;
11454c59
KW
858 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
859 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 860
39065660
KW
861 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
862 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 863
9826f543
KW
864 /* If both nodes match something outside the bitmap, but what they match
865 * outside is not the same pointer, and hence not easily compared, give up
866 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 867 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
868 ARG(cl) != ARG(or_with)) {
869 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 870 }
9826f543 871
1aa99e6b
IH
872 if (or_with->flags & ANYOF_UNICODE_ALL) {
873 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 874 }
653099ff
GS
875}
876
a3621e74
YO
877#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
878#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
879#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
880#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
881
3dab1dad
YO
882
883#ifdef DEBUGGING
07be1b83 884/*
2b8b4781
NC
885 dump_trie(trie,widecharmap,revcharmap)
886 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
887 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
888
889 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
890 The _interim_ variants are used for debugging the interim
891 tables that are used to generate the final compressed
892 representation which is what dump_trie expects.
893
486ec47a 894 Part of the reason for their existence is to provide a form
3dab1dad 895 of documentation as to how the different representations function.
07be1b83
YO
896
897*/
3dab1dad
YO
898
899/*
3dab1dad
YO
900 Dumps the final compressed table form of the trie to Perl_debug_log.
901 Used for debugging make_trie().
902*/
b9a59e08 903
3dab1dad 904STATIC void
2b8b4781
NC
905S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
906 AV *revcharmap, U32 depth)
3dab1dad
YO
907{
908 U32 state;
ab3bbdeb 909 SV *sv=sv_newmortal();
55eed653 910 int colwidth= widecharmap ? 6 : 4;
2e64971a 911 U16 word;
3dab1dad
YO
912 GET_RE_DEBUG_FLAGS_DECL;
913
7918f24d 914 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 915
3dab1dad
YO
916 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
917 (int)depth * 2 + 2,"",
918 "Match","Base","Ofs" );
919
920 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 921 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 922 if ( tmp ) {
ab3bbdeb
YO
923 PerlIO_printf( Perl_debug_log, "%*s",
924 colwidth,
ddc5bc0f 925 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
926 PL_colors[0], PL_colors[1],
927 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
928 PERL_PV_ESCAPE_FIRSTCHAR
929 )
930 );
3dab1dad
YO
931 }
932 }
933 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
934 (int)depth * 2 + 2,"");
935
936 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 937 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
938 PerlIO_printf( Perl_debug_log, "\n");
939
1e2e3d02 940 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 941 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
942
943 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
944
945 if ( trie->states[ state ].wordnum ) {
946 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
947 } else {
948 PerlIO_printf( Perl_debug_log, "%6s", "" );
949 }
950
951 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
952
953 if ( base ) {
954 U32 ofs = 0;
955
956 while( ( base + ofs < trie->uniquecharcount ) ||
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans
958 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
959 ofs++;
960
961 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
962
963 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
964 if ( ( base + ofs >= trie->uniquecharcount ) &&
965 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
966 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
967 {
ab3bbdeb
YO
968 PerlIO_printf( Perl_debug_log, "%*"UVXf,
969 colwidth,
3dab1dad
YO
970 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
971 } else {
ab3bbdeb 972 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
973 }
974 }
975
976 PerlIO_printf( Perl_debug_log, "]");
977
978 }
979 PerlIO_printf( Perl_debug_log, "\n" );
980 }
2e64971a
DM
981 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
982 for (word=1; word <= trie->wordcount; word++) {
983 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
984 (int)word, (int)(trie->wordinfo[word].prev),
985 (int)(trie->wordinfo[word].len));
986 }
987 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
988}
989/*
3dab1dad
YO
990 Dumps a fully constructed but uncompressed trie in list form.
991 List tries normally only are used for construction when the number of
992 possible chars (trie->uniquecharcount) is very high.
993 Used for debugging make_trie().
994*/
995STATIC void
55eed653 996S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
997 HV *widecharmap, AV *revcharmap, U32 next_alloc,
998 U32 depth)
3dab1dad
YO
999{
1000 U32 state;
ab3bbdeb 1001 SV *sv=sv_newmortal();
55eed653 1002 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1003 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1004
1005 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1006
3dab1dad 1007 /* print out the table precompression. */
ab3bbdeb
YO
1008 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1009 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1010 "------:-----+-----------------\n" );
3dab1dad
YO
1011
1012 for( state=1 ; state < next_alloc ; state ++ ) {
1013 U16 charid;
1014
ab3bbdeb 1015 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1016 (int)depth * 2 + 2,"", (UV)state );
1017 if ( ! trie->states[ state ].wordnum ) {
1018 PerlIO_printf( Perl_debug_log, "%5s| ","");
1019 } else {
1020 PerlIO_printf( Perl_debug_log, "W%4x| ",
1021 trie->states[ state ].wordnum
1022 );
1023 }
1024 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1025 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1026 if ( tmp ) {
1027 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1028 colwidth,
ddc5bc0f 1029 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1030 PL_colors[0], PL_colors[1],
1031 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1032 PERL_PV_ESCAPE_FIRSTCHAR
1033 ) ,
1e2e3d02
YO
1034 TRIE_LIST_ITEM(state,charid).forid,
1035 (UV)TRIE_LIST_ITEM(state,charid).newstate
1036 );
1037 if (!(charid % 10))
664e119d
RGS
1038 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1039 (int)((depth * 2) + 14), "");
1e2e3d02 1040 }
ab3bbdeb
YO
1041 }
1042 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1043 }
1044}
1045
1046/*
3dab1dad
YO
1047 Dumps a fully constructed but uncompressed trie in table form.
1048 This is the normal DFA style state transition table, with a few
1049 twists to facilitate compression later.
1050 Used for debugging make_trie().
1051*/
1052STATIC void
55eed653 1053S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1054 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1055 U32 depth)
3dab1dad
YO
1056{
1057 U32 state;
1058 U16 charid;
ab3bbdeb 1059 SV *sv=sv_newmortal();
55eed653 1060 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1061 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1062
1063 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1064
1065 /*
1066 print out the table precompression so that we can do a visual check
1067 that they are identical.
1068 */
1069
1070 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1071
1072 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1073 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1074 if ( tmp ) {
ab3bbdeb
YO
1075 PerlIO_printf( Perl_debug_log, "%*s",
1076 colwidth,
ddc5bc0f 1077 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1078 PL_colors[0], PL_colors[1],
1079 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1080 PERL_PV_ESCAPE_FIRSTCHAR
1081 )
1082 );
3dab1dad
YO
1083 }
1084 }
1085
1086 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1087
1088 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1089 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1090 }
1091
1092 PerlIO_printf( Perl_debug_log, "\n" );
1093
1094 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1095
1096 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1097 (int)depth * 2 + 2,"",
1098 (UV)TRIE_NODENUM( state ) );
1099
1100 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1101 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1102 if (v)
1103 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1104 else
1105 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1106 }
1107 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1108 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1109 } else {
1110 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1111 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1112 }
1113 }
07be1b83 1114}
3dab1dad
YO
1115
1116#endif
1117
2e64971a 1118
786e8c11
YO
1119/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1120 startbranch: the first branch in the whole branch sequence
1121 first : start branch of sequence of branch-exact nodes.
1122 May be the same as startbranch
1123 last : Thing following the last branch.
1124 May be the same as tail.
1125 tail : item following the branch sequence
1126 count : words in the sequence
1127 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1128 depth : indent depth
3dab1dad 1129
786e8c11 1130Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1131
786e8c11
YO
1132A trie is an N'ary tree where the branches are determined by digital
1133decomposition of the key. IE, at the root node you look up the 1st character and
1134follow that branch repeat until you find the end of the branches. Nodes can be
1135marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1136
786e8c11 1137 /he|she|his|hers/
72f13be8 1138
786e8c11
YO
1139would convert into the following structure. Numbers represent states, letters
1140following numbers represent valid transitions on the letter from that state, if
1141the number is in square brackets it represents an accepting state, otherwise it
1142will be in parenthesis.
07be1b83 1143
786e8c11
YO
1144 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1145 | |
1146 | (2)
1147 | |
1148 (1) +-i->(6)-+-s->[7]
1149 |
1150 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1151
786e8c11
YO
1152 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1153
1154This shows that when matching against the string 'hers' we will begin at state 1
1155read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1156then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1157is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1158single traverse. We store a mapping from accepting to state to which word was
1159matched, and then when we have multiple possibilities we try to complete the
1160rest of the regex in the order in which they occured in the alternation.
1161
1162The only prior NFA like behaviour that would be changed by the TRIE support is
1163the silent ignoring of duplicate alternations which are of the form:
1164
1165 / (DUPE|DUPE) X? (?{ ... }) Y /x
1166
4b714af6 1167Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1168and without the optimisation. With the optimisations dupes will be silently
486ec47a 1169ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1170the following demonstrates:
1171
1172 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1173
1174which prints out 'word' three times, but
1175
1176 'words'=~/(word|word|word)(?{ print $1 })S/
1177
1178which doesnt print it out at all. This is due to other optimisations kicking in.
1179
1180Example of what happens on a structural level:
1181
486ec47a 1182The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1183
1184 1: CURLYM[1] {1,32767}(18)
1185 5: BRANCH(8)
1186 6: EXACT <ac>(16)
1187 8: BRANCH(11)
1188 9: EXACT <ad>(16)
1189 11: BRANCH(14)
1190 12: EXACT <ab>(16)
1191 16: SUCCEED(0)
1192 17: NOTHING(18)
1193 18: END(0)
1194
1195This would be optimizable with startbranch=5, first=5, last=16, tail=16
1196and should turn into:
1197
1198 1: CURLYM[1] {1,32767}(18)
1199 5: TRIE(16)
1200 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1201 <ac>
1202 <ad>
1203 <ab>
1204 16: SUCCEED(0)
1205 17: NOTHING(18)
1206 18: END(0)
1207
1208Cases where tail != last would be like /(?foo|bar)baz/:
1209
1210 1: BRANCH(4)
1211 2: EXACT <foo>(8)
1212 4: BRANCH(7)
1213 5: EXACT <bar>(8)
1214 7: TAIL(8)
1215 8: EXACT <baz>(10)
1216 10: END(0)
1217
1218which would be optimizable with startbranch=1, first=1, last=7, tail=8
1219and would end up looking like:
1220
1221 1: TRIE(8)
1222 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1223 <foo>
1224 <bar>
1225 7: TAIL(8)
1226 8: EXACT <baz>(10)
1227 10: END(0)
1228
1229 d = uvuni_to_utf8_flags(d, uv, 0);
1230
1231is the recommended Unicode-aware way of saying
1232
1233 *(d++) = uv;
1234*/
1235
1e2e3d02 1236#define TRIE_STORE_REVCHAR \
786e8c11 1237 STMT_START { \
73031816
NC
1238 if (UTF) { \
1239 SV *zlopp = newSV(2); \
88c9ea1e
CB
1240 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1241 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1242 SvCUR_set(zlopp, kapow - flrbbbbb); \
1243 SvPOK_on(zlopp); \
1244 SvUTF8_on(zlopp); \
1245 av_push(revcharmap, zlopp); \
1246 } else { \
6bdeddd2 1247 char ooooff = (char)uvc; \
73031816
NC
1248 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1249 } \
1250 } STMT_END
786e8c11
YO
1251
1252#define TRIE_READ_CHAR STMT_START { \
1253 wordlen++; \
1254 if ( UTF ) { \
1255 if ( folder ) { \
1256 if ( foldlen > 0 ) { \
1257 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1258 foldlen -= len; \
1259 scan += len; \
1260 len = 0; \
1261 } else { \
1262 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1263 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1264 foldlen -= UNISKIP( uvc ); \
1265 scan = foldbuf + UNISKIP( uvc ); \
1266 } \
1267 } else { \
1268 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1269 } \
1270 } else { \
1271 uvc = (U32)*uc; \
1272 len = 1; \
1273 } \
1274} STMT_END
1275
1276
1277
1278#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1279 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1280 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1281 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1282 } \
1283 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1284 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1285 TRIE_LIST_CUR( state )++; \
1286} STMT_END
07be1b83 1287
786e8c11
YO
1288#define TRIE_LIST_NEW(state) STMT_START { \
1289 Newxz( trie->states[ state ].trans.list, \
1290 4, reg_trie_trans_le ); \
1291 TRIE_LIST_CUR( state ) = 1; \
1292 TRIE_LIST_LEN( state ) = 4; \
1293} STMT_END
07be1b83 1294
786e8c11
YO
1295#define TRIE_HANDLE_WORD(state) STMT_START { \
1296 U16 dupe= trie->states[ state ].wordnum; \
1297 regnode * const noper_next = regnext( noper ); \
1298 \
786e8c11
YO
1299 DEBUG_r({ \
1300 /* store the word for dumping */ \
1301 SV* tmp; \
1302 if (OP(noper) != NOTHING) \
740cce10 1303 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1304 else \
740cce10 1305 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1306 av_push( trie_words, tmp ); \
786e8c11
YO
1307 }); \
1308 \
1309 curword++; \
2e64971a
DM
1310 trie->wordinfo[curword].prev = 0; \
1311 trie->wordinfo[curword].len = wordlen; \
1312 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1313 \
1314 if ( noper_next < tail ) { \
1315 if (!trie->jump) \
c944940b 1316 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1317 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1318 if (!jumper) \
1319 jumper = noper_next; \
1320 if (!nextbranch) \
1321 nextbranch= regnext(cur); \
1322 } \
1323 \
1324 if ( dupe ) { \
2e64971a
DM
1325 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1326 /* chain, so that when the bits of chain are later */\
1327 /* linked together, the dups appear in the chain */\
1328 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1329 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1330 } else { \
1331 /* we haven't inserted this word yet. */ \
1332 trie->states[ state ].wordnum = curword; \
1333 } \
1334} STMT_END
07be1b83 1335
3dab1dad 1336
786e8c11
YO
1337#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1338 ( ( base + charid >= ucharcount \
1339 && base + charid < ubound \
1340 && state == trie->trans[ base - ucharcount + charid ].check \
1341 && trie->trans[ base - ucharcount + charid ].next ) \
1342 ? trie->trans[ base - ucharcount + charid ].next \
1343 : ( state==1 ? special : 0 ) \
1344 )
3dab1dad 1345
786e8c11
YO
1346#define MADE_TRIE 1
1347#define MADE_JUMP_TRIE 2
1348#define MADE_EXACT_TRIE 4
3dab1dad 1349
a3621e74 1350STATIC I32
786e8c11 1351S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1352{
27da23d5 1353 dVAR;
a3621e74
YO
1354 /* first pass, loop through and scan words */
1355 reg_trie_data *trie;
55eed653 1356 HV *widecharmap = NULL;
2b8b4781 1357 AV *revcharmap = newAV();
a3621e74 1358 regnode *cur;
9f7f3913 1359 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1360 STRLEN len = 0;
1361 UV uvc = 0;
1362 U16 curword = 0;
1363 U32 next_alloc = 0;
786e8c11
YO
1364 regnode *jumper = NULL;
1365 regnode *nextbranch = NULL;
7f69552c 1366 regnode *convert = NULL;
2e64971a 1367 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1368 /* we just use folder as a flag in utf8 */
1e696034 1369 const U8 * folder = NULL;
a3621e74 1370
2b8b4781
NC
1371#ifdef DEBUGGING
1372 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1373 AV *trie_words = NULL;
1374 /* along with revcharmap, this only used during construction but both are
1375 * useful during debugging so we store them in the struct when debugging.
8e11feef 1376 */
2b8b4781
NC
1377#else
1378 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1379 STRLEN trie_charcount=0;
3dab1dad 1380#endif
2b8b4781 1381 SV *re_trie_maxbuff;
a3621e74 1382 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1383
1384 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1385#ifndef DEBUGGING
1386 PERL_UNUSED_ARG(depth);
1387#endif
a3621e74 1388
1e696034
KW
1389 switch (flags) {
1390 case EXACTFU: folder = PL_fold_latin1; break;
1391 case EXACTF: folder = PL_fold; break;
1392 case EXACTFL: folder = PL_fold_locale; break;
1393 }
1394
c944940b 1395 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1396 trie->refcount = 1;
3dab1dad 1397 trie->startstate = 1;
786e8c11 1398 trie->wordcount = word_count;
f8fc2ecf 1399 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1400 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1401 if (!(UTF && folder))
c944940b 1402 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1403 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1404 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1405
a3621e74 1406 DEBUG_r({
2b8b4781 1407 trie_words = newAV();
a3621e74 1408 });
a3621e74 1409
0111c4fd 1410 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1411 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1412 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1413 }
3dab1dad
YO
1414 DEBUG_OPTIMISE_r({
1415 PerlIO_printf( Perl_debug_log,
786e8c11 1416 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1417 (int)depth * 2 + 2, "",
1418 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1419 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1420 (int)depth);
3dab1dad 1421 });
7f69552c
YO
1422
1423 /* Find the node we are going to overwrite */
1424 if ( first == startbranch && OP( last ) != BRANCH ) {
1425 /* whole branch chain */
1426 convert = first;
1427 } else {
1428 /* branch sub-chain */
1429 convert = NEXTOPER( first );
1430 }
1431
a3621e74
YO
1432 /* -- First loop and Setup --
1433
1434 We first traverse the branches and scan each word to determine if it
1435 contains widechars, and how many unique chars there are, this is
1436 important as we have to build a table with at least as many columns as we
1437 have unique chars.
1438
1439 We use an array of integers to represent the character codes 0..255
38a44b82 1440 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1441 native representation of the character value as the key and IV's for the
1442 coded index.
1443
1444 *TODO* If we keep track of how many times each character is used we can
1445 remap the columns so that the table compression later on is more
3b753521 1446 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1447 middle and the least common are on the outside. IMO this would be better
1448 than a most to least common mapping as theres a decent chance the most
1449 common letter will share a node with the least common, meaning the node
486ec47a 1450 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1451 case is when we have the least common nodes twice.
1452
1453 */
1454
a3621e74 1455 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1456 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1457 const U8 *uc = (U8*)STRING( noper );
a28509cc 1458 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1459 STRLEN foldlen = 0;
1460 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1461 const U8 *scan = (U8*)NULL;
07be1b83 1462 U32 wordlen = 0; /* required init */
02daf0ab
YO
1463 STRLEN chars = 0;
1464 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1465
3dab1dad
YO
1466 if (OP(noper) == NOTHING) {
1467 trie->minlen= 0;
1468 continue;
1469 }
02daf0ab
YO
1470 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1471 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1472 regardless of encoding */
1473
a3621e74 1474 for ( ; uc < e ; uc += len ) {
3dab1dad 1475 TRIE_CHARCOUNT(trie)++;
a3621e74 1476 TRIE_READ_CHAR;
3dab1dad 1477 chars++;
a3621e74
YO
1478 if ( uvc < 256 ) {
1479 if ( !trie->charmap[ uvc ] ) {
1480 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1481 if ( folder )
1482 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1483 TRIE_STORE_REVCHAR;
a3621e74 1484 }
02daf0ab 1485 if ( set_bit ) {
62012aee
KW
1486 /* store the codepoint in the bitmap, and its folded
1487 * equivalent. */
02daf0ab 1488 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1489
1490 /* store the folded codepoint */
1491 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1492
1493 if ( !UTF ) {
1494 /* store first byte of utf8 representation of
acdf4139
KW
1495 variant codepoints */
1496 if (! UNI_IS_INVARIANT(uvc)) {
1497 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1498 }
1499 }
02daf0ab
YO
1500 set_bit = 0; /* We've done our bit :-) */
1501 }
a3621e74
YO
1502 } else {
1503 SV** svpp;
55eed653
NC
1504 if ( !widecharmap )
1505 widecharmap = newHV();
a3621e74 1506
55eed653 1507 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1508
1509 if ( !svpp )
e4584336 1510 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1511
1512 if ( !SvTRUE( *svpp ) ) {
1513 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1514 TRIE_STORE_REVCHAR;
a3621e74
YO
1515 }
1516 }
1517 }
3dab1dad
YO
1518 if( cur == first ) {
1519 trie->minlen=chars;
1520 trie->maxlen=chars;
1521 } else if (chars < trie->minlen) {
1522 trie->minlen=chars;
1523 } else if (chars > trie->maxlen) {
1524 trie->maxlen=chars;
1525 }
1526
a3621e74
YO
1527 } /* end first pass */
1528 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1529 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1530 (int)depth * 2 + 2,"",
55eed653 1531 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1532 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1533 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1534 );
a3621e74
YO
1535
1536 /*
1537 We now know what we are dealing with in terms of unique chars and
1538 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1539 representation using a flat table will take. If it's over a reasonable
1540 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1541 conservative but potentially much slower representation using an array
1542 of lists.
1543
1544 At the end we convert both representations into the same compressed
1545 form that will be used in regexec.c for matching with. The latter
1546 is a form that cannot be used to construct with but has memory
1547 properties similar to the list form and access properties similar
1548 to the table form making it both suitable for fast searches and
1549 small enough that its feasable to store for the duration of a program.
1550
1551 See the comment in the code where the compressed table is produced
1552 inplace from the flat tabe representation for an explanation of how
1553 the compression works.
1554
1555 */
1556
1557
2e64971a
DM
1558 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1559 prev_states[1] = 0;
1560
3dab1dad 1561 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1562 /*
1563 Second Pass -- Array Of Lists Representation
1564
1565 Each state will be represented by a list of charid:state records
1566 (reg_trie_trans_le) the first such element holds the CUR and LEN
1567 points of the allocated array. (See defines above).
1568
1569 We build the initial structure using the lists, and then convert
1570 it into the compressed table form which allows faster lookups
1571 (but cant be modified once converted).
a3621e74
YO
1572 */
1573
a3621e74
YO
1574 STRLEN transcount = 1;
1575
1e2e3d02
YO
1576 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1577 "%*sCompiling trie using list compiler\n",
1578 (int)depth * 2 + 2, ""));
446bd890 1579
c944940b
JH
1580 trie->states = (reg_trie_state *)
1581 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1582 sizeof(reg_trie_state) );
a3621e74
YO
1583 TRIE_LIST_NEW(1);
1584 next_alloc = 2;
1585
1586 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1587
c445ea15
AL
1588 regnode * const noper = NEXTOPER( cur );
1589 U8 *uc = (U8*)STRING( noper );
1590 const U8 * const e = uc + STR_LEN( noper );
1591 U32 state = 1; /* required init */
1592 U16 charid = 0; /* sanity init */
1593 U8 *scan = (U8*)NULL; /* sanity init */
1594 STRLEN foldlen = 0; /* required init */
07be1b83 1595 U32 wordlen = 0; /* required init */
c445ea15
AL
1596 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1597
3dab1dad 1598 if (OP(noper) != NOTHING) {
786e8c11 1599 for ( ; uc < e ; uc += len ) {
c445ea15 1600
786e8c11 1601 TRIE_READ_CHAR;
c445ea15 1602
786e8c11
YO
1603 if ( uvc < 256 ) {
1604 charid = trie->charmap[ uvc ];
c445ea15 1605 } else {
55eed653 1606 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1607 if ( !svpp ) {
1608 charid = 0;
1609 } else {
1610 charid=(U16)SvIV( *svpp );
1611 }
c445ea15 1612 }
786e8c11
YO
1613 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1614 if ( charid ) {
a3621e74 1615
786e8c11
YO
1616 U16 check;
1617 U32 newstate = 0;
a3621e74 1618
786e8c11
YO
1619 charid--;
1620 if ( !trie->states[ state ].trans.list ) {
1621 TRIE_LIST_NEW( state );
c445ea15 1622 }
786e8c11
YO
1623 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1624 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1625 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1626 break;
1627 }
1628 }
1629 if ( ! newstate ) {
1630 newstate = next_alloc++;
2e64971a 1631 prev_states[newstate] = state;
786e8c11
YO
1632 TRIE_LIST_PUSH( state, charid, newstate );
1633 transcount++;
1634 }
1635 state = newstate;
1636 } else {
1637 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1638 }
a28509cc 1639 }
c445ea15 1640 }
3dab1dad 1641 TRIE_HANDLE_WORD(state);
a3621e74
YO
1642
1643 } /* end second pass */
1644
1e2e3d02
YO
1645 /* next alloc is the NEXT state to be allocated */
1646 trie->statecount = next_alloc;
c944940b
JH
1647 trie->states = (reg_trie_state *)
1648 PerlMemShared_realloc( trie->states,
1649 next_alloc
1650 * sizeof(reg_trie_state) );
a3621e74 1651
3dab1dad 1652 /* and now dump it out before we compress it */
2b8b4781
NC
1653 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1654 revcharmap, next_alloc,
1655 depth+1)
1e2e3d02 1656 );
a3621e74 1657
c944940b
JH
1658 trie->trans = (reg_trie_trans *)
1659 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1660 {
1661 U32 state;
a3621e74
YO
1662 U32 tp = 0;
1663 U32 zp = 0;
1664
1665
1666 for( state=1 ; state < next_alloc ; state ++ ) {
1667 U32 base=0;
1668
1669 /*
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1672 );
1673 */
1674
1675 if (trie->states[state].trans.list) {
1676 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1677 U16 maxid=minid;
a28509cc 1678 U16 idx;
a3621e74
YO
1679
1680 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1681 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1682 if ( forid < minid ) {
1683 minid=forid;
1684 } else if ( forid > maxid ) {
1685 maxid=forid;
1686 }
a3621e74
YO
1687 }
1688 if ( transcount < tp + maxid - minid + 1) {
1689 transcount *= 2;
c944940b
JH
1690 trie->trans = (reg_trie_trans *)
1691 PerlMemShared_realloc( trie->trans,
446bd890
NC
1692 transcount
1693 * sizeof(reg_trie_trans) );
a3621e74
YO
1694 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1695 }
1696 base = trie->uniquecharcount + tp - minid;
1697 if ( maxid == minid ) {
1698 U32 set = 0;
1699 for ( ; zp < tp ; zp++ ) {
1700 if ( ! trie->trans[ zp ].next ) {
1701 base = trie->uniquecharcount + zp - minid;
1702 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703 trie->trans[ zp ].check = state;
1704 set = 1;
1705 break;
1706 }
1707 }
1708 if ( !set ) {
1709 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1710 trie->trans[ tp ].check = state;
1711 tp++;
1712 zp = tp;
1713 }
1714 } else {
1715 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1716 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1717 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1718 trie->trans[ tid ].check = state;
1719 }
1720 tp += ( maxid - minid + 1 );
1721 }
1722 Safefree(trie->states[ state ].trans.list);
1723 }
1724 /*
1725 DEBUG_TRIE_COMPILE_MORE_r(
1726 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1727 );
1728 */
1729 trie->states[ state ].trans.base=base;
1730 }
cc601c31 1731 trie->lasttrans = tp + 1;
a3621e74
YO
1732 }
1733 } else {
1734 /*
1735 Second Pass -- Flat Table Representation.
1736
1737 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1738 We know that we will need Charcount+1 trans at most to store the data
1739 (one row per char at worst case) So we preallocate both structures
1740 assuming worst case.
1741
1742 We then construct the trie using only the .next slots of the entry
1743 structs.
1744
3b753521 1745 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1746 make compression both faster and easier by keeping track of how many non
1747 zero fields are in the node.
1748
1749 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1750 transition.
1751
1752 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1753 number representing the first entry of the node, and state as a
1754 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1755 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1756 are 2 entrys per node. eg:
1757
1758 A B A B
1759 1. 2 4 1. 3 7
1760 2. 0 3 3. 0 5
1761 3. 0 0 5. 0 0
1762 4. 0 0 7. 0 0
1763
1764 The table is internally in the right hand, idx form. However as we also
1765 have to deal with the states array which is indexed by nodenum we have to
1766 use TRIE_NODENUM() to convert.
1767
1768 */
1e2e3d02
YO
1769 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1770 "%*sCompiling trie using table compiler\n",
1771 (int)depth * 2 + 2, ""));
3dab1dad 1772
c944940b
JH
1773 trie->trans = (reg_trie_trans *)
1774 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1775 * trie->uniquecharcount + 1,
1776 sizeof(reg_trie_trans) );
1777 trie->states = (reg_trie_state *)
1778 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1779 sizeof(reg_trie_state) );
a3621e74
YO
1780 next_alloc = trie->uniquecharcount + 1;
1781
3dab1dad 1782
a3621e74
YO
1783 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1784
c445ea15 1785 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1786 const U8 *uc = (U8*)STRING( noper );
1787 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1788
1789 U32 state = 1; /* required init */
1790
1791 U16 charid = 0; /* sanity init */
1792 U32 accept_state = 0; /* sanity init */
1793 U8 *scan = (U8*)NULL; /* sanity init */
1794
1795 STRLEN foldlen = 0; /* required init */
07be1b83 1796 U32 wordlen = 0; /* required init */
a3621e74
YO
1797 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1798
3dab1dad 1799 if ( OP(noper) != NOTHING ) {
786e8c11 1800 for ( ; uc < e ; uc += len ) {
a3621e74 1801
786e8c11 1802 TRIE_READ_CHAR;
a3621e74 1803
786e8c11
YO
1804 if ( uvc < 256 ) {
1805 charid = trie->charmap[ uvc ];
1806 } else {
55eed653 1807 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1808 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1809 }
786e8c11
YO
1810 if ( charid ) {
1811 charid--;
1812 if ( !trie->trans[ state + charid ].next ) {
1813 trie->trans[ state + charid ].next = next_alloc;
1814 trie->trans[ state ].check++;
2e64971a
DM
1815 prev_states[TRIE_NODENUM(next_alloc)]
1816 = TRIE_NODENUM(state);
786e8c11
YO
1817 next_alloc += trie->uniquecharcount;
1818 }
1819 state = trie->trans[ state + charid ].next;
1820 } else {
1821 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1822 }
1823 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1824 }
a3621e74 1825 }
3dab1dad
YO
1826 accept_state = TRIE_NODENUM( state );
1827 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1828
1829 } /* end second pass */
1830
3dab1dad 1831 /* and now dump it out before we compress it */
2b8b4781
NC
1832 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1833 revcharmap,
1834 next_alloc, depth+1));
a3621e74 1835
a3621e74
YO
1836 {
1837 /*
1838 * Inplace compress the table.*
1839
1840 For sparse data sets the table constructed by the trie algorithm will
1841 be mostly 0/FAIL transitions or to put it another way mostly empty.
1842 (Note that leaf nodes will not contain any transitions.)
1843
1844 This algorithm compresses the tables by eliminating most such
1845 transitions, at the cost of a modest bit of extra work during lookup:
1846
1847 - Each states[] entry contains a .base field which indicates the
1848 index in the state[] array wheres its transition data is stored.
1849
3b753521 1850 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1851
1852 - If .base is nonzero then charid is added to it to find an entry in
1853 the trans array.
1854
1855 -If trans[states[state].base+charid].check!=state then the
1856 transition is taken to be a 0/Fail transition. Thus if there are fail
1857 transitions at the front of the node then the .base offset will point
1858 somewhere inside the previous nodes data (or maybe even into a node
1859 even earlier), but the .check field determines if the transition is
1860 valid.
1861
786e8c11 1862 XXX - wrong maybe?
a3621e74 1863 The following process inplace converts the table to the compressed
3b753521 1864 table: We first do not compress the root node 1,and mark all its
a3621e74 1865 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1866 allows us to do a DFA construction from the compressed table later,
1867 and ensures that any .base pointers we calculate later are greater
1868 than 0.
a3621e74
YO
1869
1870 - We set 'pos' to indicate the first entry of the second node.
1871
1872 - We then iterate over the columns of the node, finding the first and
1873 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1874 and set the .check pointers accordingly, and advance pos
1875 appropriately and repreat for the next node. Note that when we copy
1876 the next pointers we have to convert them from the original
1877 NODEIDX form to NODENUM form as the former is not valid post
1878 compression.
1879
1880 - If a node has no transitions used we mark its base as 0 and do not
1881 advance the pos pointer.
1882
1883 - If a node only has one transition we use a second pointer into the
1884 structure to fill in allocated fail transitions from other states.
1885 This pointer is independent of the main pointer and scans forward
1886 looking for null transitions that are allocated to a state. When it
1887 finds one it writes the single transition into the "hole". If the
786e8c11 1888 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1889
1890 - Once compressed we can Renew/realloc the structures to release the
1891 excess space.
1892
1893 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1894 specifically Fig 3.47 and the associated pseudocode.
1895
1896 demq
1897 */
a3b680e6 1898 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1899 U32 state, charid;
a3621e74 1900 U32 pos = 0, zp=0;
1e2e3d02 1901 trie->statecount = laststate;
a3621e74
YO
1902
1903 for ( state = 1 ; state < laststate ; state++ ) {
1904 U8 flag = 0;
a28509cc
AL
1905 const U32 stateidx = TRIE_NODEIDX( state );
1906 const U32 o_used = trie->trans[ stateidx ].check;
1907 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1908 trie->trans[ stateidx ].check = 0;
1909
1910 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1911 if ( flag || trie->trans[ stateidx + charid ].next ) {
1912 if ( trie->trans[ stateidx + charid ].next ) {
1913 if (o_used == 1) {
1914 for ( ; zp < pos ; zp++ ) {
1915 if ( ! trie->trans[ zp ].next ) {
1916 break;
1917 }
1918 }
1919 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1920 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1921 trie->trans[ zp ].check = state;
1922 if ( ++zp > pos ) pos = zp;
1923 break;
1924 }
1925 used--;
1926 }
1927 if ( !flag ) {
1928 flag = 1;
1929 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1930 }
1931 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1932 trie->trans[ pos ].check = state;
1933 pos++;
1934 }
1935 }
1936 }
cc601c31 1937 trie->lasttrans = pos + 1;
c944940b
JH
1938 trie->states = (reg_trie_state *)
1939 PerlMemShared_realloc( trie->states, laststate
1940 * sizeof(reg_trie_state) );
a3621e74 1941 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1942 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1943 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1944 (int)depth * 2 + 2,"",
1945 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1946 (IV)next_alloc,
1947 (IV)pos,
a3621e74
YO
1948 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1949 );
1950
1951 } /* end table compress */
1952 }
1e2e3d02
YO
1953 DEBUG_TRIE_COMPILE_MORE_r(
1954 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1955 (int)depth * 2 + 2, "",
1956 (UV)trie->statecount,
1957 (UV)trie->lasttrans)
1958 );
cc601c31 1959 /* resize the trans array to remove unused space */
c944940b
JH
1960 trie->trans = (reg_trie_trans *)
1961 PerlMemShared_realloc( trie->trans, trie->lasttrans
1962 * sizeof(reg_trie_trans) );
a3621e74 1963
3b753521 1964 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1965 U8 nodetype =(U8)(flags & 0xFF);
1966 char *str=NULL;
786e8c11 1967
07be1b83 1968#ifdef DEBUGGING
e62cc96a 1969 regnode *optimize = NULL;
7122b237
YO
1970#ifdef RE_TRACK_PATTERN_OFFSETS
1971
b57a0404
JH
1972 U32 mjd_offset = 0;
1973 U32 mjd_nodelen = 0;
7122b237
YO
1974#endif /* RE_TRACK_PATTERN_OFFSETS */
1975#endif /* DEBUGGING */
a3621e74 1976 /*
3dab1dad
YO
1977 This means we convert either the first branch or the first Exact,
1978 depending on whether the thing following (in 'last') is a branch
1979 or not and whther first is the startbranch (ie is it a sub part of
1980 the alternation or is it the whole thing.)
3b753521 1981 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1982 the whole branch sequence, including the first.
a3621e74 1983 */
3dab1dad 1984 /* Find the node we are going to overwrite */
7f69552c 1985 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1986 /* branch sub-chain */
3dab1dad 1987 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1988#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1989 DEBUG_r({
1990 mjd_offset= Node_Offset((convert));
1991 mjd_nodelen= Node_Length((convert));
1992 });
7122b237 1993#endif
7f69552c 1994 /* whole branch chain */
7122b237
YO
1995 }
1996#ifdef RE_TRACK_PATTERN_OFFSETS
1997 else {
7f69552c
YO
1998 DEBUG_r({
1999 const regnode *nop = NEXTOPER( convert );
2000 mjd_offset= Node_Offset((nop));
2001 mjd_nodelen= Node_Length((nop));
2002 });
07be1b83
YO
2003 }
2004 DEBUG_OPTIMISE_r(
2005 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2006 (int)depth * 2 + 2, "",
786e8c11 2007 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2008 );
7122b237 2009#endif
3dab1dad
YO
2010 /* But first we check to see if there is a common prefix we can
2011 split out as an EXACT and put in front of the TRIE node. */
2012 trie->startstate= 1;
55eed653 2013 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2014 U32 state;
1e2e3d02 2015 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2016 U32 ofs = 0;
8e11feef
RGS
2017 I32 idx = -1;
2018 U32 count = 0;
2019 const U32 base = trie->states[ state ].trans.base;
a3621e74 2020
3dab1dad 2021 if ( trie->states[state].wordnum )
8e11feef 2022 count = 1;
a3621e74 2023
8e11feef 2024 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2025 if ( ( base + ofs >= trie->uniquecharcount ) &&
2026 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2027 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2028 {
3dab1dad 2029 if ( ++count > 1 ) {
2b8b4781 2030 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2031 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2032 if ( state == 1 ) break;
3dab1dad
YO
2033 if ( count == 2 ) {
2034 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2035 DEBUG_OPTIMISE_r(
8e11feef
RGS
2036 PerlIO_printf(Perl_debug_log,
2037 "%*sNew Start State=%"UVuf" Class: [",
2038 (int)depth * 2 + 2, "",
786e8c11 2039 (UV)state));
be8e71aa 2040 if (idx >= 0) {
2b8b4781 2041 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2042 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2043
3dab1dad 2044 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2045 if ( folder )
2046 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2047 DEBUG_OPTIMISE_r(
f1f66076 2048 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2049 );
8e11feef
RGS
2050 }
2051 }
2052 TRIE_BITMAP_SET(trie,*ch);
2053 if ( folder )
2054 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2055 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2056 }
2057 idx = ofs;
2058 }
3dab1dad
YO
2059 }
2060 if ( count == 1 ) {
2b8b4781 2061 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2062 STRLEN len;
2063 char *ch = SvPV( *tmp, len );
de734bd5
A
2064 DEBUG_OPTIMISE_r({
2065 SV *sv=sv_newmortal();
8e11feef
RGS
2066 PerlIO_printf( Perl_debug_log,
2067 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2068 (int)depth * 2 + 2, "",
de734bd5
A
2069 (UV)state, (UV)idx,
2070 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2071 PL_colors[0], PL_colors[1],
2072 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2073 PERL_PV_ESCAPE_FIRSTCHAR
2074 )
2075 );
2076 });
3dab1dad
YO
2077 if ( state==1 ) {
2078 OP( convert ) = nodetype;
2079 str=STRING(convert);
2080 STR_LEN(convert)=0;
2081 }
c490c714
YO
2082 STR_LEN(convert) += len;
2083 while (len--)
de734bd5 2084 *str++ = *ch++;
8e11feef 2085 } else {
f9049ba1 2086#ifdef DEBUGGING
8e11feef
RGS
2087 if (state>1)
2088 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2089#endif
8e11feef
RGS
2090 break;
2091 }
2092 }
2e64971a 2093 trie->prefixlen = (state-1);
3dab1dad 2094 if (str) {
8e11feef 2095 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2096 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2097 trie->startstate = state;
07be1b83
YO
2098 trie->minlen -= (state - 1);
2099 trie->maxlen -= (state - 1);
33809eae
JH
2100#ifdef DEBUGGING
2101 /* At least the UNICOS C compiler choked on this
2102 * being argument to DEBUG_r(), so let's just have
2103 * it right here. */
2104 if (
2105#ifdef PERL_EXT_RE_BUILD
2106 1
2107#else
2108 DEBUG_r_TEST
2109#endif
2110 ) {
2111 regnode *fix = convert;
2112 U32 word = trie->wordcount;
2113 mjd_nodelen++;
2114 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2115 while( ++fix < n ) {
2116 Set_Node_Offset_Length(fix, 0, 0);
2117 }
2118 while (word--) {
2119 SV ** const tmp = av_fetch( trie_words, word, 0 );
2120 if (tmp) {
2121 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2122 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2123 else
2124 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2125 }
2126 }
2127 }
2128#endif
8e11feef
RGS
2129 if (trie->maxlen) {
2130 convert = n;
2131 } else {
3dab1dad 2132 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2133 DEBUG_r(optimize= n);
3dab1dad
YO
2134 }
2135 }
2136 }
a5ca303d
YO
2137 if (!jumper)
2138 jumper = last;
3dab1dad 2139 if ( trie->maxlen ) {
8e11feef
RGS
2140 NEXT_OFF( convert ) = (U16)(tail - convert);
2141 ARG_SET( convert, data_slot );
786e8c11
YO
2142 /* Store the offset to the first unabsorbed branch in
2143 jump[0], which is otherwise unused by the jump logic.
2144 We use this when dumping a trie and during optimisation. */
2145 if (trie->jump)
7f69552c 2146 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2147
6c48061a
YO
2148 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2149 * and there is a bitmap
2150 * and the first "jump target" node we found leaves enough room
2151 * then convert the TRIE node into a TRIEC node, with the bitmap
2152 * embedded inline in the opcode - this is hypothetically faster.
2153 */
2154 if ( !trie->states[trie->startstate].wordnum
2155 && trie->bitmap
2156 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2157 {
2158 OP( convert ) = TRIEC;
2159 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2160 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2161 trie->bitmap= NULL;
2162 } else
2163 OP( convert ) = TRIE;
a3621e74 2164
3dab1dad
YO
2165 /* store the type in the flags */
2166 convert->flags = nodetype;
a5ca303d
YO
2167 DEBUG_r({
2168 optimize = convert
2169 + NODE_STEP_REGNODE
2170 + regarglen[ OP( convert ) ];
2171 });
2172 /* XXX We really should free up the resource in trie now,
2173 as we won't use them - (which resources?) dmq */
3dab1dad 2174 }
a3621e74 2175 /* needed for dumping*/
e62cc96a 2176 DEBUG_r(if (optimize) {
07be1b83 2177 regnode *opt = convert;
bcdf7404 2178
e62cc96a 2179 while ( ++opt < optimize) {
07be1b83
YO
2180 Set_Node_Offset_Length(opt,0,0);
2181 }
786e8c11
YO
2182 /*
2183 Try to clean up some of the debris left after the
2184 optimisation.
a3621e74 2185 */
786e8c11 2186 while( optimize < jumper ) {
07be1b83 2187 mjd_nodelen += Node_Length((optimize));
a3621e74 2188 OP( optimize ) = OPTIMIZED;
07be1b83 2189 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2190 optimize++;
2191 }
07be1b83 2192 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2193 });
2194 } /* end node insert */
2e64971a
DM
2195
2196 /* Finish populating the prev field of the wordinfo array. Walk back
2197 * from each accept state until we find another accept state, and if
2198 * so, point the first word's .prev field at the second word. If the
2199 * second already has a .prev field set, stop now. This will be the
2200 * case either if we've already processed that word's accept state,
3b753521
FN
2201 * or that state had multiple words, and the overspill words were
2202 * already linked up earlier.
2e64971a
DM
2203 */
2204 {
2205 U16 word;
2206 U32 state;
2207 U16 prev;
2208
2209 for (word=1; word <= trie->wordcount; word++) {
2210 prev = 0;
2211 if (trie->wordinfo[word].prev)
2212 continue;
2213 state = trie->wordinfo[word].accept;
2214 while (state) {
2215 state = prev_states[state];
2216 if (!state)
2217 break;
2218 prev = trie->states[state].wordnum;
2219 if (prev)
2220 break;
2221 }
2222 trie->wordinfo[word].prev = prev;
2223 }
2224 Safefree(prev_states);
2225 }
2226
2227
2228 /* and now dump out the compressed format */
2229 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2230
55eed653 2231 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2232#ifdef DEBUGGING
2233 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2234 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2235#else
2236 SvREFCNT_dec(revcharmap);
07be1b83 2237#endif
786e8c11
YO
2238 return trie->jump
2239 ? MADE_JUMP_TRIE
2240 : trie->startstate>1
2241 ? MADE_EXACT_TRIE
2242 : MADE_TRIE;
2243}
2244
2245STATIC void
2246S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2247{
3b753521 2248/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2249
2250 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2251 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2252 ISBN 0-201-10088-6
2253
2254 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2255 suffix of the current state's 'word' that is also a proper prefix of another word in our
2256 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2257 the DFA not to have to restart after its tried and failed a word at a given point, it
2258 simply continues as though it had been matching the other word in the first place.
2259 Consider
2260 'abcdgu'=~/abcdefg|cdgu/
2261 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2262 fail, which would bring us to the state representing 'd' in the second word where we would
2263 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2264 */
2265 /* add a fail transition */
3251b653
NC
2266 const U32 trie_offset = ARG(source);
2267 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2268 U32 *q;
2269 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2270 const U32 numstates = trie->statecount;
786e8c11
YO
2271 const U32 ubound = trie->lasttrans + ucharcount;
2272 U32 q_read = 0;
2273 U32 q_write = 0;
2274 U32 charid;
2275 U32 base = trie->states[ 1 ].trans.base;
2276 U32 *fail;
2277 reg_ac_data *aho;
2278 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2279 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2280
2281 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2282#ifndef DEBUGGING
2283 PERL_UNUSED_ARG(depth);
2284#endif
2285
2286
2287 ARG_SET( stclass, data_slot );
c944940b 2288 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2289 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2290 aho->trie=trie_offset;
446bd890
NC
2291 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2292 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2293 Newxz( q, numstates, U32);
c944940b 2294 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2295 aho->refcount = 1;
2296 fail = aho->fail;
2297 /* initialize fail[0..1] to be 1 so that we always have
2298 a valid final fail state */
2299 fail[ 0 ] = fail[ 1 ] = 1;
2300
2301 for ( charid = 0; charid < ucharcount ; charid++ ) {
2302 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2303 if ( newstate ) {
2304 q[ q_write ] = newstate;
2305 /* set to point at the root */
2306 fail[ q[ q_write++ ] ]=1;
2307 }
2308 }
2309 while ( q_read < q_write) {
2310 const U32 cur = q[ q_read++ % numstates ];
2311 base = trie->states[ cur ].trans.base;
2312
2313 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2314 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2315 if (ch_state) {
2316 U32 fail_state = cur;
2317 U32 fail_base;
2318 do {
2319 fail_state = fail[ fail_state ];
2320 fail_base = aho->states[ fail_state ].trans.base;
2321 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2322
2323 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2324 fail[ ch_state ] = fail_state;
2325 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2326 {
2327 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2328 }
2329 q[ q_write++ % numstates] = ch_state;
2330 }
2331 }
2332 }
2333 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2334 when we fail in state 1, this allows us to use the
2335 charclass scan to find a valid start char. This is based on the principle
2336 that theres a good chance the string being searched contains lots of stuff
2337 that cant be a start char.
2338 */
2339 fail[ 0 ] = fail[ 1 ] = 0;
2340 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2341 PerlIO_printf(Perl_debug_log,
2342 "%*sStclass Failtable (%"UVuf" states): 0",
2343 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2344 );
786e8c11
YO
2345 for( q_read=1; q_read<numstates; q_read++ ) {
2346 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2347 }
2348 PerlIO_printf(Perl_debug_log, "\n");
2349 });
2350 Safefree(q);
2351 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2352}
2353
786e8c11 2354
a3621e74 2355/*
5d1c421c
JH
2356 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2357 * These need to be revisited when a newer toolchain becomes available.
2358 */
2359#if defined(__sparc64__) && defined(__GNUC__)
2360# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2361# undef SPARC64_GCC_WORKAROUND
2362# define SPARC64_GCC_WORKAROUND 1
2363# endif
2364#endif
2365
07be1b83 2366#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2367 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2368 SV * const mysv=sv_newmortal(); \
2369 regnode *Next = regnext(scan); \
2370 regprop(RExC_rx, mysv, scan); \
7f69552c 2371 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2372 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2373 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2374 }});
07be1b83 2375
1de06328
YO
2376
2377
2378
2379
07be1b83
YO
2380#define JOIN_EXACT(scan,min,flags) \
2381 if (PL_regkind[OP(scan)] == EXACT) \
2382 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2383
be8e71aa 2384STATIC U32
07be1b83
YO
2385S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2386 /* Merge several consecutive EXACTish nodes into one. */
2387 regnode *n = regnext(scan);
2388 U32 stringok = 1;
2389 regnode *next = scan + NODE_SZ_STR(scan);
2390 U32 merged = 0;
2391 U32 stopnow = 0;
2392#ifdef DEBUGGING
2393 regnode *stop = scan;
72f13be8 2394 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2395#else
d47053eb
RGS
2396 PERL_UNUSED_ARG(depth);
2397#endif
7918f24d
NC
2398
2399 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2400#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2401 PERL_UNUSED_ARG(flags);
2402 PERL_UNUSED_ARG(val);
07be1b83 2403#endif
07be1b83
YO
2404 DEBUG_PEEP("join",scan,depth);
2405
2406 /* Skip NOTHING, merge EXACT*. */
2407 while (n &&
2408 ( PL_regkind[OP(n)] == NOTHING ||
2409 (stringok && (OP(n) == OP(scan))))
2410 && NEXT_OFF(n)
2411 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2412
2413 if (OP(n) == TAIL || n > next)
2414 stringok = 0;
2415 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2416 DEBUG_PEEP("skip:",n,depth);
2417 NEXT_OFF(scan) += NEXT_OFF(n);
2418 next = n + NODE_STEP_REGNODE;
2419#ifdef DEBUGGING
2420 if (stringok)
2421 stop = n;
2422#endif
2423 n = regnext(n);
2424 }
2425 else if (stringok) {
786e8c11 2426 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2427 regnode * const nnext = regnext(n);
2428
2429 DEBUG_PEEP("merg",n,depth);
2430
2431 merged++;
2432 if (oldl + STR_LEN(n) > U8_MAX)
2433 break;
2434 NEXT_OFF(scan) += NEXT_OFF(n);
2435 STR_LEN(scan) += STR_LEN(n);
2436 next = n + NODE_SZ_STR(n);
2437 /* Now we can overwrite *n : */
2438 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2439#ifdef DEBUGGING
2440 stop = next - 1;
2441#endif
2442 n = nnext;
2443 if (stopnow) break;
2444 }
2445
d47053eb
RGS
2446#ifdef EXPERIMENTAL_INPLACESCAN
2447 if (flags && !NEXT_OFF(n)) {
2448 DEBUG_PEEP("atch", val, depth);
2449 if (reg_off_by_arg[OP(n)]) {
2450 ARG_SET(n, val - n);
2451 }
2452 else {
2453 NEXT_OFF(n) = val - n;
2454 }
2455 stopnow = 1;
2456 }
07be1b83
YO
2457#endif
2458 }
ced7f090
KW
2459#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2460#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2461#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2462#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2463
2464 if (UTF
2465 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2466 && ( STR_LEN(scan) >= 6 ) )
2467 {
07be1b83
YO
2468 /*
2469 Two problematic code points in Unicode casefolding of EXACT nodes:
2470
2471 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2472 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2473
2474 which casefold to
2475
2476 Unicode UTF-8
2477
2478 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2479 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2480
2481 This means that in case-insensitive matching (or "loose matching",
2482 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2483 length of the above casefolded versions) can match a target string
2484 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2485 This would rather mess up the minimum length computation.
2486
2487 What we'll do is to look for the tail four bytes, and then peek
2488 at the preceding two bytes to see whether we need to decrease
2489 the minimum length by four (six minus two).
2490
2491 Thanks to the design of UTF-8, there cannot be false matches:
2492 A sequence of valid UTF-8 bytes cannot be a subsequence of
2493 another valid sequence of UTF-8 bytes.
2494
2495 */
2496 char * const s0 = STRING(scan), *s, *t;
2497 char * const s1 = s0 + STR_LEN(scan) - 1;
2498 char * const s2 = s1 - 4;
e294cc5d
JH
2499#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2500 const char t0[] = "\xaf\x49\xaf\x42";
2501#else
07be1b83 2502 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2503#endif
07be1b83
YO
2504 const char * const t1 = t0 + 3;
2505
2506 for (s = s0 + 2;
2507 s < s2 && (t = ninstr(s, s1, t0, t1));
2508 s = t + 4) {
e294cc5d
JH
2509#ifdef EBCDIC
2510 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2511 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2512#else
07be1b83
YO
2513 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2514 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2515#endif
07be1b83
YO
2516 *min -= 4;
2517 }
2518 }
2519
2520#ifdef DEBUGGING
2521 /* Allow dumping */
2522 n = scan + NODE_SZ_STR(scan);
2523 while (n <= stop) {
2524 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2525 OP(n) = OPTIMIZED;
2526 NEXT_OFF(n) = 0;
2527 }
2528 n++;
2529 }
2530#endif
2531 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2532 return stopnow;
2533}
2534
486ec47a 2535/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2536 Finds fixed substrings. */
2537
a0288114 2538/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2539 to the position after last scanned or to NULL. */
2540
40d049e4
YO
2541#define INIT_AND_WITHP \
2542 assert(!and_withp); \
2543 Newx(and_withp,1,struct regnode_charclass_class); \
2544 SAVEFREEPV(and_withp)
07be1b83 2545
b515a41d 2546/* this is a chain of data about sub patterns we are processing that
486ec47a 2547 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2548 we can simulate recursion without losing state. */
2549struct scan_frame;
2550typedef struct scan_frame {
2551 regnode *last; /* last node to process in this frame */
2552 regnode *next; /* next node to process when last is reached */
2553 struct scan_frame *prev; /*previous frame*/
2554 I32 stop; /* what stopparen do we use */
2555} scan_frame;
2556
304ee84b
YO
2557
2558#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2559
e1d1eefb
YO
2560#define CASE_SYNST_FNC(nAmE) \
2561case nAmE: \
2562 if (flags & SCF_DO_STCLASS_AND) { \
2563 for (value = 0; value < 256; value++) \
2564 if (!is_ ## nAmE ## _cp(value)) \
2565 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2566 } \
2567 else { \
2568 for (value = 0; value < 256; value++) \
2569 if (is_ ## nAmE ## _cp(value)) \
2570 ANYOF_BITMAP_SET(data->start_class, value); \
2571 } \
2572 break; \
2573case N ## nAmE: \
2574 if (flags & SCF_DO_STCLASS_AND) { \
2575 for (value = 0; value < 256; value++) \
2576 if (is_ ## nAmE ## _cp(value)) \
2577 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2578 } \
2579 else { \
2580 for (value = 0; value < 256; value++) \
2581 if (!is_ ## nAmE ## _cp(value)) \
2582 ANYOF_BITMAP_SET(data->start_class, value); \
2583 } \
2584 break
2585
2586
2587
76e3520e 2588STATIC I32
40d049e4 2589S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2590 I32 *minlenp, I32 *deltap,
40d049e4
YO
2591 regnode *last,
2592 scan_data_t *data,
2593 I32 stopparen,
2594 U8* recursed,
2595 struct regnode_charclass_class *and_withp,
2596 U32 flags, U32 depth)
c277df42
IZ
2597 /* scanp: Start here (read-write). */
2598 /* deltap: Write maxlen-minlen here. */
2599 /* last: Stop before this one. */
40d049e4
YO
2600 /* data: string data about the pattern */
2601 /* stopparen: treat close N as END */
2602 /* recursed: which subroutines have we recursed into */
2603 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2604{
97aff369 2605 dVAR;
c277df42
IZ
2606 I32 min = 0, pars = 0, code;
2607 regnode *scan = *scanp, *next;
2608 I32 delta = 0;
2609 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2610 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2611 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2612 scan_data_t data_fake;
a3621e74 2613 SV *re_trie_maxbuff = NULL;
786e8c11 2614 regnode *first_non_open = scan;
e2e6a0f1 2615 I32 stopmin = I32_MAX;
8aa23a47 2616 scan_frame *frame = NULL;
a3621e74 2617 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2618
7918f24d
NC
2619 PERL_ARGS_ASSERT_STUDY_CHUNK;
2620
13a24bad 2621#ifdef DEBUGGING
40d049e4 2622 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2623#endif
40d049e4 2624
786e8c11 2625 if ( depth == 0 ) {
40d049e4 2626 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2627 first_non_open=regnext(first_non_open);
2628 }
2629
b81d288d 2630
8aa23a47
YO
2631 fake_study_recurse:
2632 while ( scan && OP(scan) != END && scan < last ){
2633 /* Peephole optimizer: */
304ee84b 2634 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2635 DEBUG_PEEP("Peep",scan,depth);
2636 JOIN_EXACT(scan,&min,0);
2637
2638 /* Follow the next-chain of the current node and optimize
2639 away all the NOTHINGs from it. */
2640 if (OP(scan) != CURLYX) {
2641 const int max = (reg_off_by_arg[OP(scan)]
2642 ? I32_MAX
2643 /* I32 may be smaller than U16 on CRAYs! */
2644 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2645 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2646 int noff;
2647 regnode *n = scan;
2648
2649 /* Skip NOTHING and LONGJMP. */
2650 while ((n = regnext(n))
2651 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2652 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2653 && off + noff < max)
2654 off += noff;
2655 if (reg_off_by_arg[OP(scan)])
2656 ARG(scan) = off;
2657 else
2658 NEXT_OFF(scan) = off;
2659 }
a3621e74 2660
c277df42 2661
8aa23a47
YO
2662
2663 /* The principal pseudo-switch. Cannot be a switch, since we
2664 look into several different things. */
2665 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2666 || OP(scan) == IFTHEN) {
2667 next = regnext(scan);
2668 code = OP(scan);
2669 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2670
2671 if (OP(next) == code || code == IFTHEN) {
2672 /* NOTE - There is similar code to this block below for handling
2673 TRIE nodes on a re-study. If you change stuff here check there
2674 too. */
2675 I32 max1 = 0, min1 = I32_MAX, num = 0;
2676 struct regnode_charclass_class accum;
2677 regnode * const startbranch=scan;
2678
2679 if (flags & SCF_DO_SUBSTR)
304ee84b 2680 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2681 if (flags & SCF_DO_STCLASS)
2682 cl_init_zero(pRExC_state, &accum);
2683
2684 while (OP(scan) == code) {
2685 I32 deltanext, minnext, f = 0, fake;
2686 struct regnode_charclass_class this_class;
2687
2688 num++;
2689 data_fake.flags = 0;
2690 if (data) {
2691 data_fake.whilem_c = data->whilem_c;
2692 data_fake.last_closep = data->last_closep;
2693 }
2694 else
2695 data_fake.last_closep = &fake;
58e23c8d
YO
2696
2697 data_fake.pos_delta = delta;
8aa23a47
YO
2698 next = regnext(scan);
2699 scan = NEXTOPER(scan);
2700 if (code != BRANCH)
c277df42 2701 scan = NEXTOPER(scan);
8aa23a47
YO
2702 if (flags & SCF_DO_STCLASS) {
2703 cl_init(pRExC_state, &this_class);
2704 data_fake.start_class = &this_class;
2705 f = SCF_DO_STCLASS_AND;
58e23c8d 2706 }
8aa23a47
YO
2707 if (flags & SCF_WHILEM_VISITED_POS)
2708 f |= SCF_WHILEM_VISITED_POS;
2709
2710 /* we suppose the run is continuous, last=next...*/
2711 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2712 next, &data_fake,
2713 stopparen, recursed, NULL, f,depth+1);
2714 if (min1 > minnext)
2715 min1 = minnext;
2716 if (max1 < minnext + deltanext)
2717 max1 = minnext + deltanext;
2718 if (deltanext == I32_MAX)
2719 is_inf = is_inf_internal = 1;
2720 scan = next;
2721 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2722 pars++;
2723 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2724 if ( stopmin > minnext)
2725 stopmin = min + min1;
2726 flags &= ~SCF_DO_SUBSTR;
2727 if (data)
2728 data->flags |= SCF_SEEN_ACCEPT;
2729 }
2730 if (data) {
2731 if (data_fake.flags & SF_HAS_EVAL)
2732 data->flags |= SF_HAS_EVAL;
2733 data->whilem_c = data_fake.whilem_c;
3dab1dad 2734 }
8aa23a47
YO
2735 if (flags & SCF_DO_STCLASS)
2736 cl_or(pRExC_state, &accum, &this_class);
2737 }
2738 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2739 min1 = 0;
2740 if (flags & SCF_DO_SUBSTR) {
2741 data->pos_min += min1;
2742 data->pos_delta += max1 - min1;
2743 if (max1 != min1 || is_inf)
2744 data->longest = &(data->longest_float);
2745 }
2746 min += min1;
2747 delta += max1 - min1;
2748 if (flags & SCF_DO_STCLASS_OR) {
2749 cl_or(pRExC_state, data->start_class, &accum);
2750 if (min1) {
2751 cl_and(data->start_class, and_withp);
2752 flags &= ~SCF_DO_STCLASS;
653099ff 2753 }
8aa23a47
YO
2754 }
2755 else if (flags & SCF_DO_STCLASS_AND) {
2756 if (min1) {
2757 cl_and(data->start_class, &accum);
2758 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2759 }
8aa23a47
YO
2760 else {
2761 /* Switch to OR mode: cache the old value of
2762 * data->start_class */
2763 INIT_AND_WITHP;
2764 StructCopy(data->start_class, and_withp,
2765 struct regnode_charclass_class);
2766 flags &= ~SCF_DO_STCLASS_AND;
2767 StructCopy(&accum, data->start_class,
2768 struct regnode_charclass_class);
2769 flags |= SCF_DO_STCLASS_OR;
2770 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2771 }
8aa23a47 2772 }
a3621e74 2773
8aa23a47
YO
2774 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2775 /* demq.
a3621e74 2776
8aa23a47
YO
2777 Assuming this was/is a branch we are dealing with: 'scan' now
2778 points at the item that follows the branch sequence, whatever
2779 it is. We now start at the beginning of the sequence and look
2780 for subsequences of
a3621e74 2781
8aa23a47
YO
2782 BRANCH->EXACT=>x1
2783 BRANCH->EXACT=>x2
2784 tail
a3621e74 2785
8aa23a47 2786 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2787
486ec47a 2788 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2789 element into a trie and then add the subsequent branch exact
2790 strings to the trie.
a3621e74 2791
8aa23a47 2792 We have two cases
a3621e74 2793
3b753521 2794 1. patterns where the whole set of branches can be converted.
a3621e74 2795
8aa23a47 2796 2. patterns where only a subset can be converted.
a3621e74 2797
8aa23a47
YO
2798 In case 1 we can replace the whole set with a single regop
2799 for the trie. In case 2 we need to keep the start and end
3b753521 2800 branches so
a3621e74 2801
8aa23a47
YO
2802 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2803 becomes BRANCH TRIE; BRANCH X;
786e8c11 2804
8aa23a47
YO
2805 There is an additional case, that being where there is a
2806 common prefix, which gets split out into an EXACT like node
2807 preceding the TRIE node.
a3621e74 2808
8aa23a47
YO
2809 If x(1..n)==tail then we can do a simple trie, if not we make
2810 a "jump" trie, such that when we match the appropriate word
486ec47a 2811 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2812 a nested if into a case structure of sorts.
b515a41d 2813
8aa23a47
YO
2814 */
2815
2816 int made=0;
2817 if (!re_trie_maxbuff) {
2818 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2819 if (!SvIOK(re_trie_maxbuff))
2820 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2821 }
2822 if ( SvIV(re_trie_maxbuff)>=0 ) {
2823 regnode *cur;
2824 regnode *first = (regnode *)NULL;
2825 regnode *last = (regnode *)NULL;
2826 regnode *tail = scan;
2827 U8 optype = 0;
2828 U32 count=0;
a3621e74
YO
2829
2830#ifdef DEBUGGING
8aa23a47 2831 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2832#endif
8aa23a47
YO
2833 /* var tail is used because there may be a TAIL
2834 regop in the way. Ie, the exacts will point to the
2835 thing following the TAIL, but the last branch will
2836 point at the TAIL. So we advance tail. If we
2837 have nested (?:) we may have to move through several
2838 tails.
2839 */
2840
2841 while ( OP( tail ) == TAIL ) {
2842 /* this is the TAIL generated by (?:) */
2843 tail = regnext( tail );
2844 }
a3621e74 2845
8aa23a47
YO
2846
2847 DEBUG_OPTIMISE_r({
2848 regprop(RExC_rx, mysv, tail );
2849 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2850 (int)depth * 2 + 2, "",
2851 "Looking for TRIE'able sequences. Tail node is: ",
2852 SvPV_nolen_const( mysv )
2853 );
2854 });
2855
2856 /*
2857
2858 step through the branches, cur represents each
2859 branch, noper is the first thing to be matched
2860 as part of that branch and noper_next is the
2861 regnext() of that node. if noper is an EXACT
2862 and noper_next is the same as scan (our current
2863 position in the regex) then the EXACT branch is
2864 a possible optimization target. Once we have
486ec47a 2865 two or more consecutive such branches we can
8aa23a47
YO
2866 create a trie of the EXACT's contents and stich
2867 it in place. If the sequence represents all of
2868 the branches we eliminate the whole thing and
2869 replace it with a single TRIE. If it is a
2870 subsequence then we need to stitch it in. This
2871 means the first branch has to remain, and needs
2872 to be repointed at the item on the branch chain
2873 following the last branch optimized. This could
2874 be either a BRANCH, in which case the
2875 subsequence is internal, or it could be the
2876 item following the branch sequence in which
2877 case the subsequence is at the end.
2878
2879 */
2880
2881 /* dont use tail as the end marker for this traverse */
2882 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2883 regnode * const noper = NEXTOPER( cur );
b515a41d 2884#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2885 regnode * const noper_next = regnext( noper );
b515a41d
YO
2886#endif
2887
8aa23a47
YO
2888 DEBUG_OPTIMISE_r({
2889 regprop(RExC_rx, mysv, cur);
2890 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2891 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2892
2893 regprop(RExC_rx, mysv, noper);
2894 PerlIO_printf( Perl_debug_log, " -> %s",
2895 SvPV_nolen_const(mysv));
2896
2897 if ( noper_next ) {
2898 regprop(RExC_rx, mysv, noper_next );
2899 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2900 SvPV_nolen_const(mysv));
2901 }
2902 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2903 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2904 });
2905 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2906 : PL_regkind[ OP( noper ) ] == EXACT )
2907 || OP(noper) == NOTHING )
786e8c11 2908#ifdef NOJUMPTRIE
8aa23a47 2909 && noper_next == tail
786e8c11 2910#endif
8aa23a47
YO
2911 && count < U16_MAX)
2912 {
2913 count++;
2914 if ( !first || optype == NOTHING ) {
2915 if (!first) first = cur;
2916 optype = OP( noper );
2917 } else {
2918 last = cur;
2919 }
2920 } else {
a0a388a1 2921/*
0abd0d78
YO
2922 Currently we do not believe that the trie logic can
2923 handle case insensitive matching properly when the
2924 pattern is not unicode (thus forcing unicode semantics).
2925
2926 If/when this is fixed the following define can be swapped
2927 in below to fully enable trie logic.
2928
a0a388a1 2929#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2930
2931*/
2932#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2933
a0a388a1 2934 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2935 make_trie( pRExC_state,
2936 startbranch, first, cur, tail, count,
2937 optype, depth+1 );
2938 }
2939 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2940#ifdef NOJUMPTRIE
8aa23a47 2941 && noper_next == tail
786e8c11 2942#endif
8aa23a47
YO
2943 ){
2944 count = 1;
2945 first = cur;
2946 optype = OP( noper );
2947 } else {
2948 count = 0;
2949 first = NULL;
2950 optype = 0;
2951 }
2952 last = NULL;
2953 }
2954 }
2955 DEBUG_OPTIMISE_r({
2956 regprop(RExC_rx, mysv, cur);
2957 PerlIO_printf( Perl_debug_log,
2958 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2959 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2960
2961 });
a0a388a1
YO
2962
2963 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2964 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2965#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2966 if ( ((made == MADE_EXACT_TRIE &&
2967 startbranch == first)
2968 || ( first_non_open == first )) &&
2969 depth==0 ) {
2970 flags |= SCF_TRIE_RESTUDY;
2971 if ( startbranch == first
2972 && scan == tail )
2973 {
2974 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2975 }
2976 }
3dab1dad 2977#endif
8aa23a47
YO
2978 }
2979 }
2980
2981 } /* do trie */
2982
653099ff 2983 }
8aa23a47
YO
2984 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2985 scan = NEXTOPER(NEXTOPER(scan));
2986 } else /* single branch is optimized. */
2987 scan = NEXTOPER(scan);
2988 continue;
2989 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2990 scan_frame *newframe = NULL;
2991 I32 paren;
2992 regnode *start;
2993 regnode *end;
2994
2995 if (OP(scan) != SUSPEND) {
2996 /* set the pointer */
2997 if (OP(scan) == GOSUB) {
2998 paren = ARG(scan);
2999 RExC_recurse[ARG2L(scan)] = scan;
3000 start = RExC_open_parens[paren-1];
3001 end = RExC_close_parens[paren-1];
3002 } else {
3003 paren = 0;
f8fc2ecf 3004 start = RExC_rxi->program + 1;
8aa23a47
YO
3005 end = RExC_opend;
3006 }
3007 if (!recursed) {
3008 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3009 SAVEFREEPV(recursed);
3010 }
3011 if (!PAREN_TEST(recursed,paren+1)) {
3012 PAREN_SET(recursed,paren+1);
3013 Newx(newframe,1,scan_frame);
3014 } else {
3015 if (flags & SCF_DO_SUBSTR) {
304ee84b 3016 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3017 data->longest = &(data->longest_float);
3018 }
3019 is_inf = is_inf_internal = 1;
3020 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3021 cl_anything(pRExC_state, data->start_class);
3022 flags &= ~SCF_DO_STCLASS;
3023 }
3024 } else {
3025 Newx(newframe,1,scan_frame);
3026 paren = stopparen;
3027 start = scan+2;
3028 end = regnext(scan);
3029 }
3030 if (newframe) {
3031 assert(start);
3032 assert(end);
3033 SAVEFREEPV(newframe);
3034 newframe->next = regnext(scan);
3035 newframe->last = last;
3036 newframe->stop = stopparen;
3037 newframe->prev = frame;
3038
3039 frame = newframe;
3040 scan = start;
3041 stopparen = paren;
3042 last = end;
3043
3044 continue;
3045 }
3046 }
3047 else if (OP(scan) == EXACT) {
3048 I32 l = STR_LEN(scan);
3049 UV uc;
3050 if (UTF) {
3051 const U8 * const s = (U8*)STRING(scan);
3052 l = utf8_length(s, s + l);
3053 uc = utf8_to_uvchr(s, NULL);
3054 } else {
3055 uc = *((U8*)STRING(scan));
3056 }
3057 min += l;
3058 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3059 /* The code below prefers earlier match for fixed
3060 offset, later match for variable offset. */
3061 if (data->last_end == -1) { /* Update the start info. */
3062 data->last_start_min = data->pos_min;
3063 data->last_start_max = is_inf
3064 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3065 }
8aa23a47
YO
3066 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3067 if (UTF)
3068 SvUTF8_on(data->last_found);
3069 {
3070 SV * const sv = data->last_found;
3071 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3072 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3073 if (mg && mg->mg_len >= 0)
3074 mg->mg_len += utf8_length((U8*)STRING(scan),
3075 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3076 }
8aa23a47
YO
3077 data->last_end = data->pos_min + l;
3078 data->pos_min += l; /* As in the first entry. */
3079 data->flags &= ~SF_BEFORE_EOL;
3080 }
3081 if (flags & SCF_DO_STCLASS_AND) {
3082 /* Check whether it is compatible with what we know already! */
3083 int compat = 1;
3084
54251c2e 3085
486ec47a 3086 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3087 * in the bitmp and either 1) its bit or its fold is set, or 2)
3088 * it's for a locale. Even if there isn't unicode semantics
3089 * here, at runtime there may be because of matching against a
3090 * utf8 string, so accept a possible false positive for
3091 * latin1-range folds */
8aa23a47
YO
3092 if (uc >= 0x100 ||
3093 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3094 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3095 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3096 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3097 )
3098 compat = 0;
3099 ANYOF_CLASS_ZERO(data->start_class);
3100 ANYOF_BITMAP_ZERO(data->start_class);
3101 if (compat)
3102 ANYOF_BITMAP_SET(data->start_class, uc);
3103 data->start_class->flags &= ~ANYOF_EOS;
3104 if (uc < 0x100)
3105 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3106 }
3107 else if (flags & SCF_DO_STCLASS_OR) {
3108 /* false positive possible if the class is case-folded */
3109 if (uc < 0x100)
3110 ANYOF_BITMAP_SET(data->start_class, uc);
3111 else
3112 data->start_class->flags |= ANYOF_UNICODE_ALL;
3113 data->start_class->flags &= ~ANYOF_EOS;
3114 cl_and(data->start_class, and_withp);
3115 }
3116 flags &= ~SCF_DO_STCLASS;
3117 }
3118 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3119 I32 l = STR_LEN(scan);
3120 UV uc = *((U8*)STRING(scan));
3121
3122 /* Search for fixed substrings supports EXACT only. */
3123 if (flags & SCF_DO_SUBSTR) {
3124 assert(data);
304ee84b 3125 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3126 }
3127 if (UTF) {
3128 const U8 * const s = (U8 *)STRING(scan);
3129 l = utf8_length(s, s + l);
3130 uc = utf8_to_uvchr(s, NULL);
3131 }
3132 min += l;
3133 if (flags & SCF_DO_SUBSTR)
3134 data->pos_min += l;
3135 if (flags & SCF_DO_STCLASS_AND) {
3136 /* Check whether it is compatible with what we know already! */
3137 int compat = 1;
8aa23a47 3138 if (uc >= 0x100 ||
54251c2e
KW
3139 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3140 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3141 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3142 {
8aa23a47 3143 compat = 0;
54251c2e 3144 }
8aa23a47
YO
3145 ANYOF_CLASS_ZERO(data->start_class);
3146 ANYOF_BITMAP_ZERO(data->start_class);
3147 if (compat) {
3148 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3149 data->start_class->flags &= ~ANYOF_EOS;
39065660 3150 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3151 if (OP(scan) == EXACTFL) {
8aa23a47 3152 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3153 }
3154 else {
3155
54251c2e
KW
3156 /* Also set the other member of the fold pair. In case
3157 * that unicode semantics is called for at runtime, use
3158 * the full latin1 fold. (Can't do this for locale,
3159 * because not known until runtime */
3160 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3161 }
653099ff 3162 }
8aa23a47
YO
3163 }
3164 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3165 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3166 /* false positive possible if the class is case-folded.
3167 Assume that the locale settings are the same... */
970c8436 3168 if (uc < 0x100) {
1aa99e6b 3169 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3170 if (OP(scan) != EXACTFL) {
3171
3172 /* And set the other member of the fold pair, but
3173 * can't do that in locale because not known until
3174 * run-time */
3175 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3176 PL_fold_latin1[uc]);
970c8436
KW
3177 }
3178 }
653099ff
GS
3179 data->start_class->flags &= ~ANYOF_EOS;
3180 }
8aa23a47 3181 cl_and(data->start_class, and_withp);
653099ff 3182 }
8aa23a47
YO
3183 flags &= ~SCF_DO_STCLASS;
3184 }
e52fc539 3185 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3186 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3187 I32 f = flags, pos_before = 0;
3188 regnode * const oscan = scan;
3189 struct regnode_charclass_class this_class;
3190 struct regnode_charclass_class *oclass = NULL;
3191 I32 next_is_eval = 0;
3192
3193 switch (PL_regkind[OP(scan)]) {
3194 case WHILEM: /* End of (?:...)* . */
3195 scan = NEXTOPER(scan);
3196 goto finish;
3197 case PLUS:
3198 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3199 next = NEXTOPER(scan);
3200 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3201 mincount = 1;
3202 maxcount = REG_INFTY;
3203 next = regnext(scan);
3204 scan = NEXTOPER(scan);
3205 goto do_curly;
3206 }
3207 }
3208 if (flags & SCF_DO_SUBSTR)
3209 data->pos_min++;
3210 min++;
3211 /* Fall through. */
3212 case STAR:
3213 if (flags & SCF_DO_STCLASS) {
3214 mincount = 0;
3215 maxcount = REG_INFTY;
3216 next = regnext(scan);
3217 scan = NEXTOPER(scan);
3218 goto do_curly;
3219 }
3220 is_inf = is_inf_internal = 1;
3221 scan = regnext(scan);
c277df42 3222 if (flags & SCF_DO_SUBSTR) {
304ee84b 3223 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3224 data->longest = &(data->longest_float);
c277df42 3225 }
8aa23a47
YO
3226 goto optimize_curly_tail;
3227 case CURLY:
3228 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3229 && (scan->flags == stopparen))
3230 {
3231 mincount = 1;
3232 maxcount = 1;
3233 } else {
3234 mincount = ARG1(scan);
3235 maxcount = ARG2(scan);
653099ff 3236 }
8aa23a47
YO
3237 next = regnext(scan);
3238 if (OP(scan) == CURLYX) {
3239 I32 lp = (data ? *(data->last_closep) : 0);
3240 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3241 }
8aa23a47
YO
3242 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3243 next_is_eval = (OP(scan) == EVAL);
3244 do_curly:
3245 if (flags & SCF_DO_SUBSTR) {
304ee84b 3246 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3247 pos_before = data->pos_min;
b45f050a 3248 }
8aa23a47
YO
3249 if (data) {
3250 fl = data->flags;
3251 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3252 if (is_inf)
3253 data->flags |= SF_IS_INF;
3254 }
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 oclass = data->start_class;
3258 data->start_class = &this_class;
3259 f |= SCF_DO_STCLASS_AND;
3260 f &= ~SCF_DO_STCLASS_OR;
3261 }
779bcb7d
NC
3262 /* Exclude from super-linear cache processing any {n,m}
3263 regops for which the combination of input pos and regex
3264 pos is not enough information to determine if a match
3265 will be possible.
3266
3267 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3268 regex pos at the \s*, the prospects for a match depend not
3269 only on the input position but also on how many (bar\s*)
3270 repeats into the {4,8} we are. */
3271 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3272 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3273
8aa23a47
YO
3274 /* This will finish on WHILEM, setting scan, or on NULL: */
3275 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3276 last, data, stopparen, recursed, NULL,
3277 (mincount == 0
3278 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3279
8aa23a47
YO
3280 if (flags & SCF_DO_STCLASS)
3281 data->start_class = oclass;
3282 if (mincount == 0 || minnext == 0) {
3283 if (flags & SCF_DO_STCLASS_OR) {
3284 cl_or(pRExC_state, data->start_class, &this_class);
3285 }
3286 else if (flags & SCF_DO_STCLASS_AND) {
3287 /* Switch to OR mode: cache the old value of
3288 * data->start_class */
3289 INIT_AND_WITHP;
3290 StructCopy(data->start_class, and_withp,
3291 struct regnode_charclass_class);
3292 flags &= ~SCF_DO_STCLASS_AND;
3293 StructCopy(&this_class, data->start_class,
3294 struct regnode_charclass_class);
3295 flags |= SCF_DO_STCLASS_OR;
3296 data->start_class->flags |= ANYOF_EOS;
3297 }
3298 } else { /* Non-zero len */
3299 if (flags & SCF_DO_STCLASS_OR) {
3300 cl_or(pRExC_state, data->start_class, &this_class);
3301 cl_and(data->start_class, and_withp);
3302 }
3303 else if (flags & SCF_DO_STCLASS_AND)
3304 cl_and(data->start_class, &this_class);
3305 flags &= ~SCF_DO_STCLASS;
3306 }
3307 if (!scan) /* It was not CURLYX, but CURLY. */
3308 scan = next;
3309 if ( /* ? quantifier ok, except for (?{ ... }) */
3310 (next_is_eval || !(mincount == 0 && maxcount == 1))
3311 && (minnext == 0) && (deltanext == 0)
3312 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3313 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3314 {
668c081a
NC
3315 ckWARNreg(RExC_parse,
3316 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3317 }
3318
3319 min += minnext * mincount;
3320 is_inf_internal |= ((maxcount == REG_INFTY
3321 && (minnext + deltanext) > 0)
3322 || deltanext == I32_MAX);
3323 is_inf |= is_inf_internal;
3324 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3325
3326 /* Try powerful optimization CURLYX => CURLYN. */
3327 if ( OP(oscan) == CURLYX && data
3328 && data->flags & SF_IN_PAR
3329 && !(data->flags & SF_HAS_EVAL)
3330 && !deltanext && minnext == 1 ) {
3331 /* Try to optimize to CURLYN. */
3332 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3333 regnode * const nxt1 = nxt;
497b47a8 3334#ifdef DEBUGGING
8aa23a47 3335 regnode *nxt2;
497b47a8 3336#endif
c277df42 3337
8aa23a47
YO
3338 /* Skip open. */
3339 nxt = regnext(nxt);
e52fc539 3340 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3341 && !(PL_regkind[OP(nxt)] == EXACT
3342 && STR_LEN(nxt) == 1))
3343 goto nogo;
497b47a8 3344#ifdef DEBUGGING
8aa23a47 3345 nxt2 = nxt;
497b47a8 3346#endif
8aa23a47
YO
3347 nxt = regnext(nxt);
3348 if (OP(nxt) != CLOSE)
3349 goto nogo;
3350 if (RExC_open_parens) {
3351 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3352 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3353 }
3354 /* Now we know that nxt2 is the only contents: */
3355 oscan->flags = (U8)ARG(nxt);
3356 OP(oscan) = CURLYN;
3357 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3358
c277df42 3359#ifdef DEBUGGING
8aa23a47 3360 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3361 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3362 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3363 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3364 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3365 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3366#endif
8aa23a47
YO
3367 }
3368 nogo:
3369
3370 /* Try optimization CURLYX => CURLYM. */
3371 if ( OP(oscan) == CURLYX && data
3372 && !(data->flags & SF_HAS_PAR)
3373 && !(data->flags & SF_HAS_EVAL)
3374 && !deltanext /* atom is fixed width */
3375 && minnext != 0 /* CURLYM can't handle zero width */
3376 ) {
3377 /* XXXX How to optimize if data == 0? */
3378 /* Optimize to a simpler form. */
3379 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3380 regnode *nxt2;
3381
3382 OP(oscan) = CURLYM;
3383 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3384 && (OP(nxt2) != WHILEM))
3385 nxt = nxt2;
3386 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3387 /* Need to optimize away parenths. */
b3c0965f 3388 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3389 /* Set the parenth number. */
3390 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3391
8aa23a47
YO
3392 oscan->flags = (U8)ARG(nxt);
3393 if (RExC_open_parens) {
3394 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3395 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3396 }
8aa23a47
YO
3397 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3398 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3399
c277df42 3400#ifdef DEBUGGING
8aa23a47
YO
3401 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3402 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3403 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3404 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3405#endif
c277df42 3406#if 0
8aa23a47
YO
3407 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3408 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3409 if (nnxt == nxt) {
3410 if (reg_off_by_arg[OP(nxt1)])
3411 ARG_SET(nxt1, nxt2 - nxt1);
3412 else if (nxt2 - nxt1 < U16_MAX)
3413 NEXT_OFF(nxt1) = nxt2 - nxt1;
3414 else
3415 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3416 }
8aa23a47 3417 nxt1 = nnxt;
c277df42 3418 }
5d1c421c 3419#endif
8aa23a47
YO
3420 /* Optimize again: */
3421 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3422 NULL, stopparen, recursed, NULL, 0,depth+1);
3423 }
3424 else
3425 oscan->flags = 0;
3426 }
3427 else if ((OP(oscan) == CURLYX)
3428 && (flags & SCF_WHILEM_VISITED_POS)
3429 /* See the comment on a similar expression above.
3b753521 3430 However, this time it's not a subexpression
8aa23a47
YO
3431 we care about, but the expression itself. */
3432 && (maxcount == REG_INFTY)
3433 && data && ++data->whilem_c < 16) {
3434 /* This stays as CURLYX, we can put the count/of pair. */
3435 /* Find WHILEM (as in regexec.c) */
3436 regnode *nxt = oscan + NEXT_OFF(oscan);
3437
3438 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3439 nxt += ARG(nxt);
3440 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3441 | (RExC_whilem_seen << 4)); /* On WHILEM */
3442 }
3443 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3444 pars++;
3445 if (flags & SCF_DO_SUBSTR) {
3446 SV *last_str = NULL;
3447 int counted = mincount != 0;
a0ed51b3 3448
8aa23a47
YO
3449 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3450#if defined(SPARC64_GCC_WORKAROUND)
3451 I32 b = 0;
3452 STRLEN l = 0;
3453 const char *s = NULL;
3454 I32 old = 0;
b515a41d 3455
8aa23a47
YO
3456 if (pos_before >= data->last_start_min)
3457 b = pos_before;
3458 else
3459 b = data->last_start_min;
b515a41d 3460
8aa23a47
YO
3461 l = 0;
3462 s = SvPV_const(data->last_found, l);
3463 old = b - data->last_start_min;
3464
3465#else
3466 I32 b = pos_before >= data->last_start_min
3467 ? pos_before : data->last_start_min;
3468 STRLEN l;
3469 const char * const s = SvPV_const(data->last_found, l);
3470 I32 old = b - data->last_start_min;
3471#endif
3472
3473 if (UTF)
3474 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3475 l -= old;
3476 /* Get the added string: */
740cce10 3477 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3478 if (deltanext == 0 && pos_before == b) {
3479 /* What was added is a constant string */
3480 if (mincount > 1) {
3481 SvGROW(last_str, (mincount * l) + 1);
3482 repeatcpy(SvPVX(last_str) + l,
3483 SvPVX_const(last_str), l, mincount - 1);
3484 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3485 /* Add additional parts. */
3486 SvCUR_set(data->last_found,
3487 SvCUR(data->last_found) - l);
3488 sv_catsv(data->last_found, last_str);
3489 {
3490 SV * sv = data->last_found;
3491 MAGIC *mg =
3492 SvUTF8(sv) && SvMAGICAL(sv) ?
3493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3494 if (mg && mg->mg_len >= 0)
bd94e887 3495 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3496 }
8aa23a47 3497 data->last_end += l * (mincount - 1);
b515a41d 3498 }
8aa23a47
YO
3499 } else {
3500 /* start offset must point into the last copy */
3501 data->last_start_min += minnext * (mincount - 1);
3502 data->last_start_max += is_inf ? I32_MAX
3503 : (maxcount - 1) * (minnext + data->pos_delta);
3504 }
c277df42 3505 }
8aa23a47
YO
3506 /* It is counted once already... */
3507 data->pos_min += minnext * (mincount - counted);
3508 data->pos_delta += - counted * deltanext +
3509 (minnext + deltanext) * maxcount - minnext * mincount;
3510 if (mincount != maxcount) {
3511 /* Cannot extend fixed substrings found inside
3512 the group. */
304ee84b 3513 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3514 if (mincount && last_str) {
3515 SV * const sv = data->last_found;
3516 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3517 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3518
3519 if (mg)
3520 mg->mg_len = -1;
3521 sv_setsv(sv, last_str);
3522 data->last_end = data->pos_min;
3523 data->last_start_min =
3524 data->pos_min - CHR_SVLEN(last_str);
3525 data->last_start_max = is_inf
3526 ? I32_MAX
3527 : data->pos_min + data->pos_delta
3528 - CHR_SVLEN(last_str);
3529 }
3530 data->longest = &(data->longest_float);
3531 }
3532 SvREFCNT_dec(last_str);
c277df42 3533 }
8aa23a47
YO
3534 if (data && (fl & SF_HAS_EVAL))
3535 data->flags |= SF_HAS_EVAL;
3536 optimize_curly_tail:
3537 if (OP(oscan) != CURLYX) {
3538 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3539 && NEXT_OFF(next))
3540 NEXT_OFF(oscan) += NEXT_OFF(next);
3541 }
3542 continue;
f56b6394 3543 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3544 if (flags & SCF_DO_SUBSTR) {
304ee84b 3545 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3546 data->longest = &(data->longest_float);
3547 }
3548 is_inf = is_inf_internal = 1;
3549 if (flags & SCF_DO_STCLASS_OR)
3550 cl_anything(pRExC_state, data->start_class);
3551 flags &= ~SCF_DO_STCLASS;
3552 break;
c277df42 3553 }
8aa23a47 3554 }
e1d1eefb
YO
3555 else if (OP(scan) == LNBREAK) {
3556 if (flags & SCF_DO_STCLASS) {
3557 int value = 0;
3558 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3559 if (flags & SCF_DO_STCLASS_AND) {
3560 for (value = 0; value < 256; value++)
e64b1bd1 3561 if (!is_VERTWS_cp(value))
b9a59e08
KW
3562 ANYOF_BITMAP_CLEAR(data->start_class, value);
3563 }
3564 else {
e1d1eefb 3565 for (value = 0; value < 256; value++)
e64b1bd1 3566 if (is_VERTWS_cp(value))
b9a59e08
KW
3567 ANYOF_BITMAP_SET(data->start_class, value);
3568 }
e1d1eefb
YO
3569 if (flags & SCF_DO_STCLASS_OR)
3570 cl_and(data->start_class, and_withp);
3571 flags &= ~SCF_DO_STCLASS;
3572 }
3573 min += 1;
f9a79580 3574 delta += 1;
e1d1eefb
YO
3575 if (flags & SCF_DO_SUBSTR) {
3576 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3577 data->pos_min += 1;
f9a79580 3578 data->pos_delta += 1;
e1d1eefb
YO
3579 data->longest = &(data->longest_float);
3580 }
e1d1eefb 3581 }
f9a79580 3582 else if (OP(scan) == FOLDCHAR) {
ced7f090 3583 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3584 flags &= ~SCF_DO_STCLASS;
3585 min += 1;
3586 delta += d;
3587 if (flags & SCF_DO_SUBSTR) {
3588 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3589 data->pos_min += 1;
3590 data->pos_delta += d;
3591 data->longest = &(data->longest_float);
3592 }
3593 }
e52fc539 3594 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3595 int value = 0;
653099ff 3596
8aa23a47 3597 if (flags & SCF_DO_SUBSTR) {
304ee84b 3598 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3599 data->pos_min++;
3600 }
3601 min++;
3602 if (flags & SCF_DO_STCLASS) {
3603 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3604
8aa23a47
YO
3605 /* Some of the logic below assumes that switching
3606 locale on will only add false positives. */
3607 switch (PL_regkind[OP(scan)]) {
3608 case SANY:
3609 default:
3610 do_default:
3611 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3612 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3613 cl_anything(pRExC_state, data->start_class);
3614 break;
3615 case REG_ANY:
3616 if (OP(scan) == SANY)
3617 goto do_default;
3618 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3619 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3620 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3621 cl_anything(pRExC_state, data->start_class);
653099ff 3622 }
8aa23a47
YO
3623 if (flags & SCF_DO_STCLASS_AND || !value)
3624 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3625 break;
3626 case ANYOF:
3627 if (flags & SCF_DO_STCLASS_AND)
3628 cl_and(data->start_class,
3629 (struct regnode_charclass_class*)scan);
653099ff 3630 else
8aa23a47
YO
3631 cl_or(pRExC_state, data->start_class,
3632 (struct regnode_charclass_class*)scan);
3633 break;
3634 case ALNUM:
3635 if (flags & SCF_DO_STCLASS_AND) {
3636 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3637 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3638 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3639 for (value = 0; value < 256; value++) {
3640 if (!isWORDCHAR_L1(value)) {
3641 ANYOF_BITMAP_CLEAR(data->start_class, value);
3642 }
3643 }
3644 } else {
3645 for (value = 0; value < 256; value++) {
3646 if (!isALNUM(value)) {
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 }
3649 }
3650 }
8aa23a47 3651 }
653099ff 3652 }
8aa23a47
YO
3653 else {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
980866de 3656 else if (OP(scan) == ALNUMU) {
a12cf05f
KW
3657 for (value = 0; value < 256; value++) {
3658 if (isWORDCHAR_L1(value)) {
3659 ANYOF_BITMAP_SET(data->start_class, value);
3660 }
3661 }
3662 } else {
3663 for (value = 0; value < 256; value++) {
3664 if (isALNUM(value)) {
3665 ANYOF_BITMAP_SET(data->start_class, value);
3666 }
3667 }
3668 }
8aa23a47
YO
3669 }
3670 break;
8aa23a47
YO
3671 case NALNUM:
3672 if (flags & SCF_DO_STCLASS_AND) {
3673 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3674 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 3675 if (OP(scan) == NALNUMU) {
a12cf05f
KW
3676 for (value = 0; value < 256; value++) {
3677 if (isWORDCHAR_L1(value)) {
3678 ANYOF_BITMAP_CLEAR(data->start_class, value);
3679 }
3680 }
3681 } else {
3682 for (value = 0; value < 256; value++) {
3683 if (isALNUM(value)) {
3684 ANYOF_BITMAP_CLEAR(data->start_class, value);
3685 }
3686 }
3687 }
653099ff
GS
3688 }
3689 }
8aa23a47
YO
3690 else {
3691 if (data->start_class->flags & ANYOF_LOCALE)
3692 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3693 else {
980866de 3694 if (OP(scan) == NALNUMU) {
e9a9c1bc
KW
3695 for (value = 0; value < 256; value++) {
3696 if (! isWORDCHAR_L1(value)) {
3697 ANYOF_BITMAP_SET(data->start_class, value);
3698 }
3699 }
3700 } else {
3701 for (value = 0; value < 256; value++) {
3702 if (! isALNUM(value)) {
3703 ANYOF_BITMAP_SET(data->start_class, value);
3704 }
3705 }
3706 }
8aa23a47 3707 }
653099ff 3708 }
8aa23a47 3709 break;
8aa23a47
YO
3710 case SPACE:
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 3714 if (OP(scan) == SPACEU) {
a12cf05f
KW
3715 for (value = 0; value < 256; value++) {
3716 if (!isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3718 }
3719 }
3720 } else {
3721 for (value = 0; value < 256; value++) {
3722 if (!isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3724 }
3725 }
3726 }
653099ff
GS
3727 }
3728 }
8aa23a47 3729 else {
a12cf05f 3730 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3731 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 3732 }
980866de 3733 else if (OP(scan) == SPACEU) {
a12cf05f
KW
3734 for (value = 0; value < 256; value++) {
3735 if (isSPACE_L1(value)) {
3736 ANYOF_BITMAP_SET(data->start_class, value);
3737 }
3738 }
3739 } else {
3740 for (value = 0; value < 256; value++) {
3741 if (isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3743 }
3744 }
8aa23a47 3745 }
653099ff 3746 }
8aa23a47 3747 break;
8aa23a47
YO
3748 case NSPACE:
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 3752 if (OP(scan) == NSPACEU) {
a12cf05f
KW
3753 for (value = 0; value < 256; value++) {
3754 if (isSPACE_L1(value)) {
3755 ANYOF_BITMAP_CLEAR(data->start_class, value);
3756 }
3757 }
3758 } else {
3759 for (value = 0; value < 256; value++) {
3760 if (isSPACE(value)) {
3761 ANYOF_BITMAP_CLEAR(data->start_class, value);
3762 }
3763 }
3764 }
653099ff 3765 }
8aa23a47
YO
3766 }
3767 else {
3768 if (data->start_class->flags & ANYOF_LOCALE)
3769 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
980866de 3770 else if (OP(scan) == NSPACEU) {
a12cf05f
KW
3771 for (value = 0; value < 256; value++) {
3772 if (!isSPACE_L1(value)) {
3773 ANYOF_BITMAP_SET(data->start_class, value);
3774 }
3775 }
3776 }
3777 else {
3778 for (value = 0; value < 256; value++) {
3779 if (!isSPACE(value)) {
3780 ANYOF_BITMAP_SET(data->start_class, value);
3781 }
3782 }
3783 }
653099ff 3784 }
8aa23a47 3785 break;
8aa23a47
YO
3786 case DIGIT:
3787 if (flags & SCF_DO_STCLASS_AND) {
3788 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3789 for (value = 0; value < 256; value++)
3790 if (!isDIGIT(value))
3791 ANYOF_BITMAP_CLEAR(data->start_class, value);
3792 }
3793 else {
3794 if (data->start_class->flags & ANYOF_LOCALE)
3795 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3796 else {
3797 for (value = 0; value < 256; value++)
3798 if (isDIGIT(value))
b9a59e08 3799 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3800 }
3801 }
3802 break;
3803 case NDIGIT:
3804 if (flags & SCF_DO_STCLASS_AND) {
3805 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3806 for (value = 0; value < 256; value++)
3807 if (isDIGIT(value))
3808 ANYOF_BITMAP_CLEAR(data->start_class, value);
3809 }
3810 else {
3811 if (data->start_class->flags & ANYOF_LOCALE)
3812 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3813 else {
3814 for (value = 0; value < 256; value++)
3815 if (!isDIGIT(value))
b9a59e08 3816 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3817 }
3818 }
8aa23a47 3819 break;
e1d1eefb
YO
3820 CASE_SYNST_FNC(VERTWS);
3821 CASE_SYNST_FNC(HORIZWS);
3822
8aa23a47
YO
3823 }
3824 if (flags & SCF_DO_STCLASS_OR)
3825 cl_and(data->start_class, and_withp);
3826 flags &= ~SCF_DO_STCLASS;
3827 }
3828 }
3829 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3830 data->flags |= (OP(scan) == MEOL
3831 ? SF_BEFORE_MEOL
3832 : SF_BEFORE_SEOL);
3833 }
3834 else if ( PL_regkind[OP(scan)] == BRANCHJ
3835 /* Lookbehind, or need to calculate parens/evals/stclass: */
3836 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3837 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3838 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3839 || OP(scan) == UNLESSM )
3840 {
3841 /* Negative Lookahead/lookbehind
3842 In this case we can't do fixed string optimisation.
3843 */
1de06328 3844
8aa23a47
YO
3845 I32 deltanext, minnext, fake = 0;
3846 regnode *nscan;
3847 struct regnode_charclass_class intrnl;
3848 int f = 0;
1de06328 3849
8aa23a47
YO
3850 data_fake.flags = 0;
3851 if (data) {
3852 data_fake.whilem_c = data->whilem_c;
3853 data_fake.last_closep = data->last_closep;
c277df42 3854 }
8aa23a47
YO
3855 else
3856 data_fake.last_closep = &fake;
58e23c8d 3857 data_fake.pos_delta = delta;
8aa23a47
YO
3858 if ( flags & SCF_DO_STCLASS && !scan->flags
3859 && OP(scan) == IFMATCH ) { /* Lookahead */
3860 cl_init(pRExC_state, &intrnl);
3861 data_fake.start_class = &intrnl;
3862 f |= SCF_DO_STCLASS_AND;
3863 }
3864 if (flags & SCF_WHILEM_VISITED_POS)
3865 f |= SCF_WHILEM_VISITED_POS;
3866 next = regnext(scan);
3867 nscan = NEXTOPER(NEXTOPER(scan));
3868 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3869 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3870 if (scan->flags) {
3871 if (deltanext) {
58e23c8d 3872 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3873 }
3874 else if (minnext > (I32)U8_MAX) {
58e23c8d 3875 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3876 }
3877 scan->flags = (U8)minnext;
3878 }
3879 if (data) {
3880 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3881 pars++;
3882 if (data_fake.flags & SF_HAS_EVAL)
3883 data->flags |= SF_HAS_EVAL;
3884 data->whilem_c = data_fake.whilem_c;
3885 }
3886 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3887 if (flags & SCF_DO_STCLASS_OR) {
3888 /* OR before, AND after: ideally we would recurse with
3889 * data_fake to get the AND applied by study of the
3890 * remainder of the pattern, and then derecurse;
3891 * *** HACK *** for now just treat as "no information".
3892 * See [perl #56690].
3893 */
3894 cl_init(pRExC_state, data->start_class);
3895 } else {
3896 /* AND before and after: combine and continue */
3897 const int was = (data->start_class->flags & ANYOF_EOS);
3898
3899 cl_and(data->start_class, &intrnl);
3900 if (was)
3901 data->start_class->flags |= ANYOF_EOS;
3902 }
8aa23a47 3903 }
cb434fcc 3904 }
8aa23a47
YO
3905#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3906 else {
3907 /* Positive Lookahead/lookbehind
3908 In this case we can do fixed string optimisation,
3909 but we must be careful about it. Note in the case of
3910 lookbehind the positions will be offset by the minimum
3911 length of the pattern, something we won't know about
3912 until after the recurse.
3913 */
3914 I32 deltanext, fake = 0;
3915 regnode *nscan;
3916 struct regnode_charclass_class intrnl;
3917 int f = 0;
3918 /* We use SAVEFREEPV so that when the full compile
3919 is finished perl will clean up the allocated
3b753521 3920 minlens when it's all done. This way we don't
8aa23a47
YO
3921 have to worry about freeing them when we know
3922 they wont be used, which would be a pain.
3923 */
3924 I32 *minnextp;
3925 Newx( minnextp, 1, I32 );
3926 SAVEFREEPV(minnextp);
3927
3928 if (data) {
3929 StructCopy(data, &data_fake, scan_data_t);
3930 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3931 f |= SCF_DO_SUBSTR;
3932 if (scan->flags)
304ee84b 3933 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3934 data_fake.last_found=newSVsv(data->last_found);
3935 }
3936 }
3937 else
3938 data_fake.last_closep = &fake;
3939 data_fake.flags = 0;
58e23c8d 3940 data_fake.pos_delta = delta;
8aa23a47
YO
3941 if (is_inf)
3942 data_fake.flags |= SF_IS_INF;
3943 if ( flags & SCF_DO_STCLASS && !scan->flags
3944 && OP(scan) == IFMATCH ) { /* Lookahead */
3945 cl_init(pRExC_state, &intrnl);
3946 data_fake.start_class = &intrnl;
3947 f |= SCF_DO_STCLASS_AND;
3948 }
3949 if (flags & SCF_WHILEM_VISITED_POS)
3950 f |= SCF_WHILEM_VISITED_POS;
3951 next = regnext(scan);
3952 nscan = NEXTOPER(NEXTOPER(scan));
3953
3954 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3955 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3956 if (scan->flags) {
3957 if (deltanext) {
58e23c8d 3958 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3959 }
3960 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3961 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3962 }
3963 scan->flags = (U8)*minnextp;
3964 }
3965
3966 *minnextp += min;
3967
3968 if (f & SCF_DO_STCLASS_AND) {
3969 const int was = (data->start_class->flags & ANYOF_EOS);
3970
3971 cl_and(data->start_class, &intrnl);
3972 if (was)
3973 data->start_class->flags |= ANYOF_EOS;
3974 }
3975 if (data) {
3976 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3977 pars++;
3978 if (data_fake.flags & SF_HAS_EVAL)
3979 data->flags |= SF_HAS_EVAL;
3980 data->whilem_c = data_fake.whilem_c;
3981 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3982 if (RExC_rx->minlen<*minnextp)
3983 RExC_rx->minlen=*minnextp;
304ee84b 3984 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3985 SvREFCNT_dec(data_fake.last_found);
3986
3987 if ( data_fake.minlen_fixed != minlenp )
3988 {
3989 data->offset_fixed= data_fake.offset_fixed;
3990 data->minlen_fixed= data_fake.minlen_fixed;
3991 data->lookbehind_fixed+= scan->flags;
3992 }
3993 if ( data_fake.minlen_float != minlenp )
3994 {
3995 data->minlen_float= data_fake.minlen_float;
3996 data->offset_float_min=data_fake.offset_float_min;
3997 data->offset_float_max=data_fake.offset_float_max;
3998 data->lookbehind_float+= scan->flags;
3999 }
4000 }
4001 }
4002
4003
40d049e4 4004 }
8aa23a47
YO
4005#endif
4006 }
4007 else if (OP(scan) == OPEN) {
4008 if (stopparen != (I32)ARG(scan))
4009 pars++;
4010 }
4011 else if (OP(scan) == CLOSE) {
4012 if (stopparen == (I32)ARG(scan)) {
4013 break;
4014 }
4015 if ((I32)ARG(scan) == is_par) {
4016 next = regnext(scan);
b515a41d 4017
8aa23a47
YO
4018 if ( next && (OP(next) != WHILEM) && next < last)
4019 is_par = 0; /* Disable optimization */
40d049e4 4020 }
8aa23a47
YO
4021 if (data)
4022 *(data->last_closep) = ARG(scan);
4023 }
4024 else if (OP(scan) == EVAL) {
c277df42
IZ
4025 if (data)
4026 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4027 }
4028 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4029 if (flags & SCF_DO_SUBSTR) {
304ee84b 4030 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4031 flags &= ~SCF_DO_SUBSTR;
40d049e4 4032 }
8aa23a47
YO
4033 if (data && OP(scan)==ACCEPT) {
4034 data->flags |= SCF_SEEN_ACCEPT;
4035 if (stopmin > min)
4036 stopmin = min;
e2e6a0f1 4037 }
8aa23a47
YO
4038 }
4039 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4040 {
0f5d15d6 4041 if (flags & SCF_DO_SUBSTR) {
304ee84b 4042 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4043 data->longest = &(data->longest_float);
4044 }
4045 is_inf = is_inf_internal = 1;
653099ff 4046 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4047 cl_anything(pRExC_state, data->start_class);
96776eda 4048 flags &= ~SCF_DO_STCLASS;
8aa23a47 4049 }
58e23c8d 4050 else if (OP(scan) == GPOS) {
bbe252da 4051 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4052 !(delta || is_inf || (data && data->pos_delta)))
4053 {
bbe252da
YO
4054 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4055 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4056 if (RExC_rx->gofs < (U32)min)
4057 RExC_rx->gofs = min;
4058 } else {
bbe252da 4059 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4060 RExC_rx->gofs = 0;
4061 }
4062 }
786e8c11 4063#ifdef TRIE_STUDY_OPT
40d049e4 4064#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4065 else if (PL_regkind[OP(scan)] == TRIE) {
4066 /* NOTE - There is similar code to this block above for handling
4067 BRANCH nodes on the initial study. If you change stuff here
4068 check there too. */
4069 regnode *trie_node= scan;
4070 regnode *tail= regnext(scan);
f8fc2ecf 4071 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4072 I32 max1 = 0, min1 = I32_MAX;
4073 struct regnode_charclass_class accum;
4074
4075 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4076 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4077 if (flags & SCF_DO_STCLASS)
4078 cl_init_zero(pRExC_state, &accum);
4079
4080 if (!trie->jump) {
4081 min1= trie->minlen;
4082 max1= trie->maxlen;
4083 } else {
4084 const regnode *nextbranch= NULL;
4085 U32 word;
4086
4087 for ( word=1 ; word <= trie->wordcount ; word++)
4088 {
4089 I32 deltanext=0, minnext=0, f = 0, fake;
4090 struct regnode_charclass_class this_class;
4091
4092 data_fake.flags = 0;
4093 if (data) {
4094 data_fake.whilem_c = data->whilem_c;
4095 data_fake.last_closep = data->last_closep;
4096 }
4097 else
4098 data_fake.last_closep = &fake;
58e23c8d 4099 data_fake.pos_delta = delta;
8aa23a47
YO
4100 if (flags & SCF_DO_STCLASS) {
4101 cl_init(pRExC_state, &this_class);
4102 data_fake.start_class = &this_class;
4103 f = SCF_DO_STCLASS_AND;
4104 }
4105 if (flags & SCF_WHILEM_VISITED_POS)
4106 f |= SCF_WHILEM_VISITED_POS;
4107
4108 if (trie->jump[word]) {
4109 if (!nextbranch)
4110 nextbranch = trie_node + trie->jump[0];
4111 scan= trie_node + trie->jump[word];
4112 /* We go from the jump point to the branch that follows
4113 it. Note this means we need the vestigal unused branches
4114 even though they arent otherwise used.
4115 */
4116 minnext = study_chunk(pRExC_state, &scan, minlenp,
4117 &deltanext, (regnode *)nextbranch, &data_fake,
4118 stopparen, recursed, NULL, f,depth+1);
4119 }
4120 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4121 nextbranch= regnext((regnode*)nextbranch);
4122
4123 if (min1 > (I32)(minnext + trie->minlen))
4124 min1 = minnext + trie->minlen;
4125 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4126 max1 = minnext + deltanext + trie->maxlen;
4127 if (deltanext == I32_MAX)
4128 is_inf = is_inf_internal = 1;
4129
4130 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4131 pars++;
4132 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4133 if ( stopmin > min + min1)
4134 stopmin = min + min1;
4135 flags &= ~SCF_DO_SUBSTR;
4136 if (data)
4137 data->flags |= SCF_SEEN_ACCEPT;
4138 }
4139 if (data) {
4140 if (data_fake.flags & SF_HAS_EVAL)
4141 data->flags |= SF_HAS_EVAL;
4142 data->whilem_c = data_fake.whilem_c;
4143 }
4144 if (flags & SCF_DO_STCLASS)
4145 cl_or(pRExC_state, &accum, &this_class);
4146 }
4147 }
4148 if (flags & SCF_DO_SUBSTR) {
4149 data->pos_min += min1;
4150 data->pos_delta += max1 - min1;
4151 if (max1 != min1 || is_inf)
4152 data->longest = &(data->longest_float);
4153 }
4154 min += min1;
4155 delta += max1 - min1;
4156 if (flags & SCF_DO_STCLASS_OR) {
4157 cl_or(pRExC_state, data->start_class, &accum);
4158 if (min1) {
4159 cl_and(data->start_class, and_withp);
4160 flags &= ~SCF_DO_STCLASS;
4161 }
4162 }
4163 else if (flags & SCF_DO_STCLASS_AND) {
4164 if (min1) {
4165 cl_and(data->start_class, &accum);
4166 flags &= ~SCF_DO_STCLASS;
4167 }
4168 else {
4169 /* Switch to OR mode: cache the old value of
4170 * data->start_class */
4171 INIT_AND_WITHP;
4172 StructCopy(data->start_class, and_withp,
4173 struct regnode_charclass_class);
4174 flags &= ~SCF_DO_STCLASS_AND;
4175 StructCopy(&accum, data->start_class,
4176 struct regnode_charclass_class);
4177 flags |= SCF_DO_STCLASS_OR;
4178 data->start_class->flags |= ANYOF_EOS;
4179 }
4180 }
4181 scan= tail;
4182 continue;
4183 }
786e8c11 4184#else
8aa23a47 4185 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4186 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4187 U8*bang=NULL;
4188
4189 min += trie->minlen;
4190 delta += (trie->maxlen - trie->minlen);
4191 flags &= ~SCF_DO_STCLASS; /* xxx */
4192 if (flags & SCF_DO_SUBSTR) {
304ee84b 4193 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4194 data->pos_min += trie->minlen;
4195 data->pos_delta += (trie->maxlen - trie->minlen);
4196 if (trie->maxlen != trie->minlen)
4197 data->longest = &(data->longest_float);
4198 }
4199 if (trie->jump) /* no more substrings -- for now /grr*/
4200 flags &= ~SCF_DO_SUBSTR;
b515a41d 4201 }
8aa23a47
YO
4202#endif /* old or new */
4203#endif /* TRIE_STUDY_OPT */
e1d1eefb 4204
8aa23a47
YO
4205 /* Else: zero-length, ignore. */
4206 scan = regnext(scan);
4207 }
4208 if (frame) {
4209 last = frame->last;
4210 scan = frame->next;
4211 stopparen = frame->stop;
4212 frame = frame->prev;
4213 goto fake_study_recurse;
c277df42
IZ
4214 }
4215
4216 finish:
8aa23a47 4217 assert(!frame);
304ee84b 4218 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4219
c277df42 4220 *scanp = scan;
aca2d497 4221 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4222 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4223 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4224 if (is_par > (I32)U8_MAX)
c277df42
IZ
4225 is_par = 0;
4226 if (is_par && pars==1 && data) {
4227 data->flags |= SF_IN_PAR;
4228 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4229 }
4230 else if (pars && data) {
c277df42
IZ
4231 data->flags |= SF_HAS_PAR;
4232 data->flags &= ~SF_IN_PAR;
4233 }
653099ff 4234 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4235 cl_and(data->start_class, and_withp);
786e8c11
YO
4236 if (flags & SCF_TRIE_RESTUDY)
4237 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4238
304ee84b 4239 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4240
e2e6a0f1 4241 return min < stopmin ? min : stopmin;
c277df42
IZ
4242}
4243
2eccd3b2
NC
4244STATIC U32
4245S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4246{
4a4e7719
NC
4247 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4248
7918f24d
NC
4249 PERL_ARGS_ASSERT_ADD_DATA;
4250
4a4e7719
NC
4251 Renewc(RExC_rxi->data,
4252 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4253 char, struct reg_data);
4254 if(count)
f8fc2ecf 4255 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4256 else
f8fc2ecf 4257 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4258 RExC_rxi->data->count = count + n;
4259 Copy(s, RExC_rxi->data->what + count, n, U8);
4260 return count;
c277df42
IZ
4261}
4262
f8149455 4263/*XXX: todo make this not included in a non debugging perl */
76234dfb 4264#ifndef PERL_IN_XSUB_RE
d88dccdf 4265void
864dbfa3 4266Perl_reginitcolors(pTHX)
d88dccdf 4267{
97aff369 4268 dVAR;
1df70142 4269 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4270 if (s) {
1df70142
AL
4271 char *t = savepv(s);
4272 int i = 0;
4273 PL_colors[0] = t;
d88dccdf 4274 while (++i < 6) {
1df70142
AL
4275 t = strchr(t, '\t');
4276 if (t) {
4277 *t = '\0';
4278 PL_colors[i] = ++t;
d88dccdf
IZ
4279 }
4280 else
1df70142 4281 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4282 }
4283 } else {
1df70142 4284 int i = 0;
b81d288d 4285 while (i < 6)
06b5626a 4286 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4287 }
4288 PL_colorset = 1;
4289}
76234dfb 4290#endif
8615cb43 4291
07be1b83 4292
786e8c11
YO
4293#ifdef TRIE_STUDY_OPT
4294#define CHECK_RESTUDY_GOTO \
4295 if ( \
4296 (data.flags & SCF_TRIE_RESTUDY) \
4297 && ! restudied++ \
4298 ) goto reStudy
4299#else
4300#define CHECK_RESTUDY_GOTO
4301#endif
f9f4320a 4302
a687059c 4303/*
e50aee73 4304 - pregcomp - compile a regular expression into internal code
a687059c
LW
4305 *
4306 * We can't allocate space until we know how big the compiled form will be,
4307 * but we can't compile it (and thus know how big it is) until we've got a
4308 * place to put the code. So we cheat: we compile it twice, once with code
4309 * generation turned off and size counting turned on, and once "for real".
4310 * This also means that we don't allocate space until we are sure that the
4311 * thing really will compile successfully, and we never have to move the
4312 * code and thus invalidate pointers into it. (Note that it has to be in
4313 * one piece because free() must be able to free it all.) [NB: not true in perl]
4314 *
4315 * Beware that the optimization-preparation code in here knows about some
4316 * of the structure of the compiled regexp. [I'll say.]
4317 */
b9b4dddf
YO
4318
4319
4320
f9f4320a 4321#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4322#define RE_ENGINE_PTR &PL_core_reg_engine
4323#else
f9f4320a
YO
4324extern const struct regexp_engine my_reg_engine;
4325#define RE_ENGINE_PTR &my_reg_engine
4326#endif
6d5c990f
RGS
4327
4328#ifndef PERL_IN_XSUB_RE
3ab4a224 4329REGEXP *
1593ad57 4330Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4331{
97aff369 4332 dVAR;
6d5c990f 4333 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4334
4335 PERL_ARGS_ASSERT_PREGCOMP;
4336
f9f4320a
YO
4337 /* Dispatch a request to compile a regexp to correct
4338 regexp engine. */
f9f4320a
YO
4339 if (table) {
4340 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4341 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4342 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4343 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4344 DEBUG_COMPILE_r({
8d8756e7 4345 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4346 SvIV(*ptr));
4347 });
3ab4a224 4348 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4349 }
b9b4dddf 4350 }
3ab4a224 4351 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4352}
6d5c990f 4353#endif
2a5d9b1d 4354
3ab4a224 4355REGEXP *
29b09c41 4356Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4357{
4358 dVAR;
288b8c02
NC
4359 REGEXP *rx;
4360 struct regexp *r;
f8fc2ecf 4361 register regexp_internal *ri;
3ab4a224 4362 STRLEN plen;
5d51ce98
KW
4363 char *exp;
4364 char* xend;
c277df42 4365 regnode *scan;
a0d0e21e 4366 I32 flags;
a0d0e21e 4367 I32 minlen = 0;
29b09c41 4368 U32 pm_flags;
e7f38d0f
YO
4369
4370 /* these are all flags - maybe they should be turned
4371 * into a single int with different bit masks */
4372 I32 sawlookahead = 0;
a0d0e21e
LW
4373 I32 sawplus = 0;
4374 I32 sawopen = 0;
29b09c41 4375 bool used_setjump = FALSE;
e7f38d0f 4376
bbd61b5f
KW
4377 U8 jump_ret = 0;
4378 dJMPENV;
2c2d71f5 4379 scan_data_t data;
830247a4 4380 RExC_state_t RExC_state;
be8e71aa 4381 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4382#ifdef TRIE_STUDY_OPT
5d51ce98 4383 int restudied;
07be1b83
YO
4384 RExC_state_t copyRExC_state;
4385#endif
2a5d9b1d 4386 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4387
4388 PERL_ARGS_ASSERT_RE_COMPILE;
4389
6d5c990f 4390 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4391
29b09c41 4392 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
7b597bb8 4393
d6bd454d 4394 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4395 /* Longjmp back to here if have to switch in midstream to utf8 */
4396 if (! RExC_orig_utf8) {
4397 JMPENV_PUSH(jump_ret);
29b09c41 4398 used_setjump = TRUE;
bbd61b5f
KW
4399 }
4400
5d51ce98 4401 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4402 exp = SvPV(pattern, plen);
4403 xend = exp + plen;
4404 /* ignore the utf8ness if the pattern is 0 length */
4405 if (plen == 0) {
4406 RExC_utf8 = RExC_orig_utf8 = 0;
4407 }
4408
5d51ce98
KW
4409 DEBUG_COMPILE_r({
4410 SV *dsv= sv_newmortal();
4411 RE_PV_QUOTED_DECL(s, RExC_utf8,
4412 dsv, exp, plen, 60);
4413 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4414 PL_colors[4],PL_colors[5],s);
4415 });
4416 }
4417 else { /* longjumped back */
bbd61b5f
KW
4418 STRLEN len = plen;
4419
5d51ce98
KW
4420 /* If the cause for the longjmp was other than changing to utf8, pop
4421 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4422 if (jump_ret != UTF8_LONGJMP) {
4423 JMPENV_POP;
4424 JMPENV_JUMP(jump_ret);
4425 }
4426
595598ee
KW
4427 GET_RE_DEBUG_FLAGS;
4428
bbd61b5f
KW
4429 /* It's possible to write a regexp in ascii that represents Unicode
4430 codepoints outside of the byte range, such as via \x{100}. If we
4431 detect such a sequence we have to convert the entire pattern to utf8
4432 and then recompile, as our sizing calculation will have been based
4433 on 1 byte == 1 character, but we will need to use utf8 to encode
4434 at least some part of the pattern, and therefore must convert the whole
4435 thing.
4436 -- dmq */
4437 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4438 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4439 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4440 xend = exp + len;
4441 RExC_orig_utf8 = RExC_utf8 = 1;
4442 SAVEFREEPV(exp);
4443 }
4444
5d51ce98
KW
4445#ifdef TRIE_STUDY_OPT
4446 restudied = 0;
4447#endif
4448
29b09c41 4449 /* Set to use unicode semantics if the pattern is in utf8 and has the
50e91148 4450 * 'depends' charset specified, as it means unicode when utf8 */
29b09c41 4451 pm_flags = orig_pm_flags;
a62b1201
KW
4452
4453 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4454 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
4455 }
4456
02daf0ab 4457 RExC_precomp = exp;
c737faaf 4458 RExC_flags = pm_flags;
830247a4 4459 RExC_sawback = 0;
bbce6d69 4460
830247a4
IZ
4461 RExC_seen = 0;
4462 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4463 RExC_seen_evals = 0;
4464 RExC_extralen = 0;
c277df42 4465
bbce6d69 4466 /* First pass: determine size, legality. */
830247a4 4467 RExC_parse = exp;
fac92740 4468 RExC_start = exp;
830247a4
IZ
4469 RExC_end = xend;
4470 RExC_naughty = 0;
4471 RExC_npar = 1;
e2e6a0f1 4472 RExC_nestroot = 0;
830247a4
IZ
4473 RExC_size = 0L;
4474 RExC_emit = &PL_regdummy;
4475 RExC_whilem_seen = 0;
40d049e4
YO
4476 RExC_open_parens = NULL;
4477 RExC_close_parens = NULL;
4478 RExC_opend = NULL;
81714fb9 4479 RExC_paren_names = NULL;
1f1031fe
YO
4480#ifdef DEBUGGING
4481 RExC_paren_name_list = NULL;
4482#endif
40d049e4
YO
4483 RExC_recurse = NULL;
4484 RExC_recurse_count = 0;
81714fb9 4485
85ddcde9
JH
4486#if 0 /* REGC() is (currently) a NOP at the first pass.
4487 * Clever compilers notice this and complain. --jhi */
830247a4 4488 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4489#endif
3dab1dad
YO
4490 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4491 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4492 RExC_precomp = NULL;
a0d0e21e
LW
4493 return(NULL);
4494 }
bbd61b5f 4495
29b09c41
KW
4496 /* Here, finished first pass. Get rid of any added setjmp */
4497 if (used_setjump) {
bbd61b5f 4498 JMPENV_POP;
02daf0ab 4499 }
07be1b83 4500 DEBUG_PARSE_r({
81714fb9
YO
4501 PerlIO_printf(Perl_debug_log,
4502 "Required size %"IVdf" nodes\n"
4503 "Starting second pass (creation)\n",
4504 (IV)RExC_size);
07be1b83
YO
4505 RExC_lastnum=0;
4506 RExC_lastparse=NULL;
4507 });
c277df42
IZ
4508 /* Small enough for pointer-storage convention?
4509 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4510 if (RExC_size >= 0x10000L && RExC_extralen)
4511 RExC_size += RExC_extralen;
c277df42 4512 else
830247a4
IZ
4513 RExC_extralen = 0;
4514 if (RExC_whilem_seen > 15)
4515 RExC_whilem_seen = 15;
a0d0e21e 4516
f9f4320a
YO
4517 /* Allocate space and zero-initialize. Note, the two step process
4518 of zeroing when in debug mode, thus anything assigned has to
4519 happen after that */
d2f13c59 4520 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4521 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4522 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4523 char, regexp_internal);
4524 if ( r == NULL || ri == NULL )
b45f050a 4525 FAIL("Regexp out of space");
0f79a09d
GS
4526#ifdef DEBUGGING
4527 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4528 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4529#else
f8fc2ecf
YO
4530 /* bulk initialize base fields with 0. */
4531 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4532#endif
58e23c8d
YO
4533
4534 /* non-zero initialization begins here */
f8fc2ecf 4535 RXi_SET( r, ri );
f9f4320a 4536 r->engine= RE_ENGINE_PTR;
c737faaf 4537 r->extflags = pm_flags;
bcdf7404 4538 {
f7819f85 4539 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 4540 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
4541
4542 /* The caret is output if there are any defaults: if not all the STD
4543 * flags are set, or if no character set specifier is needed */
4544 bool has_default =
4545 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4546 || ! has_charset);
bcdf7404 4547 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4548 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4549 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4550 const char *fptr = STD_PAT_MODS; /*"msix"*/
4551 char *p;
fb85c044 4552 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4553 * on. If more precision is desired, we could do a population count of
4554 * the flags set. This could be done with a small lookup table, or by
4555 * shifting, masking and adding, or even, when available, assembly
4556 * language for a machine-language population count.
4557 * We never output a minus, as all those are defaults, so are
4558 * covered by the caret */
fb85c044 4559 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 4560 + has_default /* If needs a caret */
a62b1201
KW
4561
4562 /* If needs a character set specifier */
4563 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
4564 + (sizeof(STD_PAT_MODS) - 1)
4565 + (sizeof("(?:)") - 1);
4566
c5ea2ffa 4567 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4568 SvPOK_on(rx);
8f6ae13c 4569 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4570 *p++='('; *p++='?';
9de15fec
KW
4571
4572 /* If a default, cover it using the caret */
c5ea2ffa 4573 if (has_default) {
85508812 4574 *p++= DEFAULT_PAT_MOD;
fb85c044 4575 }
c5ea2ffa 4576 if (has_charset) {
a62b1201
KW
4577 STRLEN len;
4578 const char* const name = get_regex_charset_name(r->extflags, &len);
4579 Copy(name, p, len, char);
4580 p += len;
9de15fec 4581 }
f7819f85
A
4582 if (has_p)
4583 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4584 {
bcdf7404 4585 char ch;
bcdf7404
YO
4586 while((ch = *fptr++)) {
4587 if(reganch & 1)
4588 *p++ = ch;
bcdf7404
YO
4589 reganch >>= 1;
4590 }
bcdf7404
YO
4591 }
4592
28d8d7f4 4593 *p++ = ':';
bb661a58 4594 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4595 assert ((RX_WRAPPED(rx) - p) < 16);
4596 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4597 p += plen;
bcdf7404 4598 if (has_runon)
28d8d7f4
YO
4599 *p++ = '\n';
4600 *p++ = ')';
4601 *p = 0;
fb85c044 4602 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4603 }
4604
bbe252da 4605 r->intflags = 0;
830247a4 4606 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4607
6bda09f9 4608 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4609 Newxz(RExC_open_parens, RExC_npar,regnode *);
4610 SAVEFREEPV(RExC_open_parens);
4611 Newxz(RExC_close_parens,RExC_npar,regnode *);
4612 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4613 }
4614
4615 /* Useful during FAIL. */
7122b237
YO
4616#ifdef RE_TRACK_PATTERN_OFFSETS
4617 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4618 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4619 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4620 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4621 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4622#endif
4623 SetProgLen(ri,RExC_size);
288b8c02 4624 RExC_rx_sv = rx;
830247a4 4625 RExC_rx = r;
f8fc2ecf 4626 RExC_rxi = ri;
bbce6d69 4627
4628 /* Second pass: emit code. */
c737faaf 4629 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4630 RExC_parse = exp;
4631 RExC_end = xend;
4632 RExC_naughty = 0;
4633 RExC_npar = 1;
f8fc2ecf
YO
4634 RExC_emit_start = ri->program;
4635 RExC_emit = ri->program;
3b57cd43
YO
4636 RExC_emit_bound = ri->program + RExC_size + 1;
4637
2cd61cdb 4638 /* Store the count of eval-groups for security checks: */
f8149455 4639 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4640 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4641 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4642 ReREFCNT_dec(rx);
a0d0e21e 4643 return(NULL);
80757612 4644 }
07be1b83
YO
4645 /* XXXX To minimize changes to RE engine we always allocate
4646 3-units-long substrs field. */
4647 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4648 if (RExC_recurse_count) {
4649 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4650 SAVEFREEPV(RExC_recurse);
4651 }
a0d0e21e 4652
07be1b83 4653reStudy:
e7f38d0f 4654 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4655 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4656
07be1b83 4657#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4658 if (!restudied) {
4659 StructCopy(&zero_scan_data, &data, scan_data_t);
4660 copyRExC_state = RExC_state;
4661 } else {
5d458dd8 4662 U32 seen=RExC_seen;
07be1b83 4663 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4664
4665 RExC_state = copyRExC_state;
4666 if (seen & REG_TOP_LEVEL_BRANCHES)
4667 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4668 else
4669 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4670 if (data.last_found) {
07be1b83 4671 SvREFCNT_dec(data.longest_fixed);
07be1b83 4672 SvREFCNT_dec(data.longest_float);
07be1b83 4673 SvREFCNT_dec(data.last_found);
1de06328 4674 }
40d049e4 4675 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4676 }
40d049e4
YO
4677#else
4678 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4679#endif
fc8cd66c 4680
a0d0e21e 4681 /* Dig out information for optimizations. */
f7819f85 4682 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4683 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4684
a0ed51b3 4685 if (UTF)
8f6ae13c 4686 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4687 ri->regstclass = NULL;
830247a4 4688 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4689 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4690 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4691
1de06328
YO
4692 /* testing for BRANCH here tells us whether there is "must appear"
4693 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4694 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4695 I32 fake;
c5254dd6 4696 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4697 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4698 int stclass_flag;
07be1b83 4699 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4700 regnode *first= scan;
4701 regnode *first_next= regnext(first);
639081d6
YO
4702 /*
4703 * Skip introductions and multiplicators >= 1
4704 * so that we can extract the 'meat' of the pattern that must
4705 * match in the large if() sequence following.
4706 * NOTE that EXACT is NOT covered here, as it is normally
4707 * picked up by the optimiser separately.
4708 *
4709 * This is unfortunate as the optimiser isnt handling lookahead
4710 * properly currently.
4711 *
4712 */
a0d0e21e 4713 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4714 /* An OR of *one* alternative - should not happen now. */
5339e136 4715 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4716 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4717 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4718 (OP(first) == PLUS) ||
4719 (OP(first) == MINMOD) ||
653099ff 4720 /* An {n,m} with n>0 */
5339e136
YO
4721 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4722 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4723 {
639081d6
YO
4724 /*
4725 * the only op that could be a regnode is PLUS, all the rest
4726 * will be regnode_1 or regnode_2.
4727 *
4728 */
a0d0e21e
LW
4729 if (OP(first) == PLUS)
4730 sawplus = 1;
4731 else
3dab1dad 4732 first += regarglen[OP(first)];
639081d6
YO
4733
4734 first = NEXTOPER(first);
5339e136 4735 first_next= regnext(first);
a687059c
LW
4736 }
4737
a0d0e21e
LW
4738 /* Starting-point info. */
4739 again:
786e8c11 4740 DEBUG_PEEP("first:",first,0);
07be1b83 4741 /* Ignore EXACT as we deal with it later. */
3dab1dad 4742 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4743 if (OP(first) == EXACT)
6f207bd3 4744 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4745 else
f8fc2ecf 4746 ri->regstclass = first;
b3c9acc1 4747 }
07be1b83 4748#ifdef TRIE_STCLASS
786e8c11 4749 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4750 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4751 {
786e8c11 4752 regnode *trie_op;
07be1b83 4753 /* this can happen only on restudy */
786e8c11 4754 if ( OP(first) == TRIE ) {
c944940b 4755 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4756 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4757 StructCopy(first,trieop,struct regnode_1);
4758 trie_op=(regnode *)trieop;
4759 } else {
c944940b 4760 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4761 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4762 StructCopy(first,trieop,struct regnode_charclass);
4763 trie_op=(regnode *)trieop;
4764 }
1de06328 4765 OP(trie_op)+=2;
786e8c11 4766 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4767 ri->regstclass = trie_op;
07be1b83
YO
4768 }
4769#endif
e52fc539 4770 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4771 ri->regstclass = first;
3dab1dad
YO
4772 else if (PL_regkind[OP(first)] == BOUND ||
4773 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4774 ri->regstclass = first;
3dab1dad 4775 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4776 r->extflags |= (OP(first) == MBOL
4777 ? RXf_ANCH_MBOL
cad2e5aa 4778 : (OP(first) == SBOL
bbe252da
YO
4779 ? RXf_ANCH_SBOL
4780 : RXf_ANCH_BOL));
a0d0e21e 4781 first = NEXTOPER(first);
774d564b 4782 goto again;
4783 }
4784 else if (OP(first) == GPOS) {
bbe252da 4785 r->extflags |= RXf_ANCH_GPOS;
774d564b 4786 first = NEXTOPER(first);
4787 goto again;
a0d0e21e 4788 }
cf2a2b69
YO
4789 else if ((!sawopen || !RExC_sawback) &&
4790 (OP(first) == STAR &&
3dab1dad 4791 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4792 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4793 {
4794 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4795 const int type =
4796 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4797 ? RXf_ANCH_MBOL
4798 : RXf_ANCH_SBOL;
4799 r->extflags |= type;
4800 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4801 first = NEXTOPER(first);
774d564b 4802 goto again;
a0d0e21e 4803 }
e7f38d0f 4804 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4805 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4806 /* x+ must match at the 1st pos of run of x's */
bbe252da 4807 r->intflags |= PREGf_SKIP;
a0d0e21e 4808
c277df42 4809 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4810#ifdef TRIE_STUDY_OPT
81714fb9 4811 DEBUG_PARSE_r(
be8e71aa
YO
4812 if (!restudied)
4813 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4814 (IV)(first - scan + 1))
4815 );
4816#else
81714fb9 4817 DEBUG_PARSE_r(
be8e71aa
YO
4818 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4819 (IV)(first - scan + 1))
4820 );
4821#endif
4822
4823
a0d0e21e
LW
4824 /*
4825 * If there's something expensive in the r.e., find the
4826 * longest literal string that must appear and make it the
4827 * regmust. Resolve ties in favor of later strings, since
4828 * the regstart check works with the beginning of the r.e.
4829 * and avoiding duplication strengthens checking. Not a
4830 * strong reason, but sufficient in the absence of others.
4831 * [Now we resolve ties in favor of the earlier string if
c277df42 4832 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4833 * earlier string may buy us something the later one won't.]
4834 */
de8c5301 4835
396482e1
GA
4836 data.longest_fixed = newSVpvs("");
4837 data.longest_float = newSVpvs("");
4838 data.last_found = newSVpvs("");
c277df42
IZ
4839 data.longest = &(data.longest_fixed);
4840 first = scan;
f8fc2ecf 4841 if (!ri->regstclass) {
830247a4 4842 cl_init(pRExC_state, &ch_class);
653099ff
GS
4843 data.start_class = &ch_class;
4844 stclass_flag = SCF_DO_STCLASS_AND;
4845 } else /* XXXX Check for BOUND? */
4846 stclass_flag = 0;
cb434fcc 4847 data.last_closep = &last_close;
de8c5301 4848
1de06328 4849 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4850 &data, -1, NULL, NULL,
4851 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4852
07be1b83 4853
786e8c11
YO
4854 CHECK_RESTUDY_GOTO;
4855
4856
830247a4 4857 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4858 && data.last_start_min == 0 && data.last_end > 0
830247a4 4859 && !RExC_seen_zerolen
2bf803e2 4860 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4861 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4862 r->extflags |= RXf_CHECK_ALL;
304ee84b 4863 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4864 SvREFCNT_dec(data.last_found);
4865
1de06328
YO
4866 /* Note that code very similar to this but for anchored string
4867 follows immediately below, changes may need to be made to both.
4868 Be careful.
4869 */
a0ed51b3 4870 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4871 if (longest_float_length
c277df42
IZ
4872 || (data.flags & SF_FL_BEFORE_EOL
4873 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4874 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4875 {
1182767e 4876 I32 t,ml;
cf93c79d 4877
1de06328 4878 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4879 && data.offset_fixed == data.offset_float_min
4880 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4881 goto remove_float; /* As in (a)+. */
4882
1de06328
YO
4883 /* copy the information about the longest float from the reg_scan_data
4884 over to the program. */
33b8afdf
JH
4885 if (SvUTF8(data.longest_float)) {
4886 r->float_utf8 = data.longest_float;
c445ea15 4887 r->float_substr = NULL;
33b8afdf
JH
4888 } else {
4889 r->float_substr = data.longest_float;
c445ea15 4890 r->float_utf8 = NULL;
33b8afdf 4891 }
1de06328
YO
4892 /* float_end_shift is how many chars that must be matched that
4893 follow this item. We calculate it ahead of time as once the
4894 lookbehind offset is added in we lose the ability to correctly
4895 calculate it.*/
4896 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4897 : (I32)longest_float_length;
1de06328
YO
4898 r->float_end_shift = ml - data.offset_float_min
4899 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4900 + data.lookbehind_float;
4901 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4902 r->float_max_offset = data.offset_float_max;
1182767e 4903 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4904 r->float_max_offset -= data.lookbehind_float;
4905
cf93c79d
IZ
4906 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4907 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4908 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4909 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4910 }
4911 else {
aca2d497 4912 remove_float:
c445ea15 4913 r->float_substr = r->float_utf8 = NULL;
c277df42 4914 SvREFCNT_dec(data.longest_float);
c5254dd6 4915 longest_float_length = 0;
a0d0e21e 4916 }
c277df42 4917
1de06328
YO
4918 /* Note that code very similar to this but for floating string
4919 is immediately above, changes may need to be made to both.
4920 Be careful.
4921 */
a0ed51b3 4922 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4923 if (longest_fixed_length
c277df42
IZ
4924 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4925 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4926 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4927 {
1182767e 4928 I32 t,ml;
cf93c79d 4929
1de06328
YO
4930 /* copy the information about the longest fixed
4931 from the reg_scan_data over to the program. */
33b8afdf
JH
4932 if (SvUTF8(data.longest_fixed)) {
4933 r->anchored_utf8 = data.longest_fixed;
c445ea15 4934 r->anchored_substr = NULL;
33b8afdf
JH
4935 } else {
4936 r->anchored_substr = data.longest_fixed;
c445ea15 4937 r->anchored_utf8 = NULL;
33b8afdf 4938 }
1de06328
YO
4939 /* fixed_end_shift is how many chars that must be matched that
4940 follow this item. We calculate it ahead of time as once the
4941 lookbehind offset is added in we lose the ability to correctly
4942 calculate it.*/
4943 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4944 : (I32)longest_fixed_length;
1de06328
YO
4945 r->anchored_end_shift = ml - data.offset_fixed
4946 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4947 + data.lookbehind_fixed;
4948 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4949
cf93c79d
IZ
4950 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4951 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4952 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4953 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4954 }
4955 else {
c445ea15 4956 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4957 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4958 longest_fixed_length = 0;
a0d0e21e 4959 }
f8fc2ecf
YO
4960 if (ri->regstclass
4961 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4962 ri->regstclass = NULL;
f4244008
KW
4963
4964 /* If the synthetic start class were to ever be used when EOS is set,
4965 * that bit would have to be cleared, as it is shared with another */
33b8afdf
JH
4966 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4967 && stclass_flag
653099ff 4968 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4969 && !cl_is_anything(data.start_class))
4970 {
2eccd3b2 4971 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4972
f8fc2ecf 4973 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4974 struct regnode_charclass_class);
4975 StructCopy(data.start_class,
f8fc2ecf 4976 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4977 struct regnode_charclass_class);
f8fc2ecf 4978 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4979 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4980 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4981 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4982 PerlIO_printf(Perl_debug_log,
a0288114 4983 "synthetic stclass \"%s\".\n",
3f7c398e 4984 SvPVX_const(sv));});
653099ff 4985 }
c277df42
IZ
4986
4987 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4988 if (longest_fixed_length > longest_float_length) {
1de06328 4989 r->check_end_shift = r->anchored_end_shift;
c277df42 4990 r->check_substr = r->anchored_substr;
33b8afdf 4991 r->check_utf8 = r->anchored_utf8;
c277df42 4992 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4993 if (r->extflags & RXf_ANCH_SINGLE)
4994 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4995 }
4996 else {
1de06328 4997 r->check_end_shift = r->float_end_shift;
c277df42 4998 r->check_substr = r->float_substr;
33b8afdf 4999 r->check_utf8 = r->float_utf8;
1de06328
YO
5000 r->check_offset_min = r->float_min_offset;
5001 r->check_offset_max = r->float_max_offset;
a0d0e21e 5002 }
30382c73
IZ
5003 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5004 This should be changed ASAP! */
bbe252da
YO
5005 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5006 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5007 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5008 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5009 }
1de06328
YO
5010 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5011 if ( (STRLEN)minlen < longest_float_length )
5012 minlen= longest_float_length;
5013 if ( (STRLEN)minlen < longest_fixed_length )
5014 minlen= longest_fixed_length;
5015 */
a0ed51b3
LW
5016 }
5017 else {
c277df42
IZ
5018 /* Several toplevels. Best we can is to set minlen. */
5019 I32 fake;
653099ff 5020 struct regnode_charclass_class ch_class;
cb434fcc 5021 I32 last_close = 0;
c277df42 5022
5d458dd8 5023 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5024
f8fc2ecf 5025 scan = ri->program + 1;
830247a4 5026 cl_init(pRExC_state, &ch_class);
653099ff 5027 data.start_class = &ch_class;
cb434fcc 5028 data.last_closep = &last_close;
07be1b83 5029
de8c5301 5030
1de06328 5031 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5032 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5033
786e8c11 5034 CHECK_RESTUDY_GOTO;
07be1b83 5035
33b8afdf 5036 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5037 = r->float_substr = r->float_utf8 = NULL;
f4244008
KW
5038
5039 /* If the synthetic start class were to ever be used when EOS is set,
5040 * that bit would have to be cleared, as it is shared with another */
653099ff 5041 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5042 && !cl_is_anything(data.start_class))
5043 {
2eccd3b2 5044 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5045
f8fc2ecf 5046 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5047 struct regnode_charclass_class);
5048 StructCopy(data.start_class,
f8fc2ecf 5049 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5050 struct regnode_charclass_class);
f8fc2ecf 5051 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5052 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5053 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5054 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5055 PerlIO_printf(Perl_debug_log,
a0288114 5056 "synthetic stclass \"%s\".\n",
3f7c398e 5057 SvPVX_const(sv));});
653099ff 5058 }
a0d0e21e
LW
5059 }
5060
1de06328
YO
5061 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5062 the "real" pattern. */
cf9788e3
RGS
5063 DEBUG_OPTIMISE_r({
5064 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5065 (IV)minlen, (IV)r->minlen);
cf9788e3 5066 });
de8c5301 5067 r->minlenret = minlen;
1de06328
YO
5068 if (r->minlen < minlen)
5069 r->minlen = minlen;
5070
b81d288d 5071 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5072 r->extflags |= RXf_GPOS_SEEN;
830247a4 5073 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5074 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5075 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5076 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5077 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5078 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5079 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5080 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5081 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5082 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5083 if (RExC_paren_names)
85fbaab2 5084 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5085 else
5daac39c 5086 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5087
7bd1e614 5088#ifdef STUPID_PATTERN_CHECKS
5509d87a 5089 if (RX_PRELEN(rx) == 0)
640f820d 5090 r->extflags |= RXf_NULL;
5509d87a 5091 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5092 /* XXX: this should happen BEFORE we compile */
5093 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5094 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5095 r->extflags |= RXf_WHITE;
5509d87a 5096 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5097 r->extflags |= RXf_START_ONLY;
f1b875a0 5098#else
5509d87a 5099 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5100 /* XXX: this should happen BEFORE we compile */
5101 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5102 else {
5103 regnode *first = ri->program + 1;
39aa8307
JH
5104 U8 fop = OP(first);
5105 U8 nop = OP(NEXTOPER(first));
7bd1e614 5106
640f820d
AB
5107 if (PL_regkind[fop] == NOTHING && nop == END)
5108 r->extflags |= RXf_NULL;
5109 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5110 r->extflags |= RXf_START_ONLY;
5111 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5112 r->extflags |= RXf_WHITE;
5113 }
f1b875a0 5114#endif
1f1031fe
YO
5115#ifdef DEBUGGING
5116 if (RExC_paren_names) {
af534a04 5117 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5118 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5119 } else
1f1031fe 5120#endif
cde0cee5 5121 ri->name_list_idx = 0;
1f1031fe 5122
40d049e4
YO
5123 if (RExC_recurse_count) {
5124 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5125 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5126 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5127 }
5128 }
f0ab9afb 5129 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5130 /* assume we don't need to swap parens around before we match */
5131
be8e71aa
YO
5132 DEBUG_DUMP_r({
5133 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5134 regdump(r);
5135 });
7122b237
YO
5136#ifdef RE_TRACK_PATTERN_OFFSETS
5137 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5138 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5139 U32 i;
5140 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5141 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5142 for (i = 1; i <= len; i++) {
7122b237 5143 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5144 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5145 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5146 }
5147 PerlIO_printf(Perl_debug_log, "\n");
5148 });
7122b237 5149#endif
288b8c02 5150 return rx;
a687059c
LW
5151}
5152
f9f4320a 5153#undef RE_ENGINE_PTR
3dab1dad 5154
93b32b6d 5155
81714fb9 5156SV*
192b9cd1
AB
5157Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5158 const U32 flags)
5159{
7918f24d
NC
5160 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5161
192b9cd1
AB
5162 PERL_UNUSED_ARG(value);
5163
f1b875a0 5164 if (flags & RXapif_FETCH) {
192b9cd1 5165 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5166 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5167 Perl_croak_no_modify(aTHX);
192b9cd1 5168 return NULL;
f1b875a0 5169 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5170 return reg_named_buff_exists(rx, key, flags)
5171 ? &PL_sv_yes
5172 : &PL_sv_no;
f1b875a0 5173 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5174 return reg_named_buff_all(rx, flags);
f1b875a0 5175 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5176 return reg_named_buff_scalar(rx, flags);
5177 } else {
5178 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5179 return NULL;
5180 }
5181}
5182
5183SV*
5184Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5185 const U32 flags)
5186{
7918f24d 5187 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5188 PERL_UNUSED_ARG(lastkey);
5189
f1b875a0 5190 if (flags & RXapif_FIRSTKEY)
192b9cd1 5191 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5192 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5193 return reg_named_buff_nextkey(rx, flags);
5194 else {
5195 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5196 return NULL;
5197 }
5198}
5199
5200SV*
288b8c02
NC
5201Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5202 const U32 flags)
81714fb9 5203{
44a2ac75
YO
5204 AV *retarray = NULL;
5205 SV *ret;
288b8c02 5206 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5207
5208 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5209
f1b875a0 5210 if (flags & RXapif_ALL)
44a2ac75 5211 retarray=newAV();
93b32b6d 5212
5daac39c
NC
5213 if (rx && RXp_PAREN_NAMES(rx)) {
5214 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5215 if (he_str) {
5216 IV i;
5217 SV* sv_dat=HeVAL(he_str);
5218 I32 *nums=(I32*)SvPVX(sv_dat);
5219 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5220 if ((I32)(rx->nparens) >= nums[i]
5221 && rx->offs[nums[i]].start != -1
5222 && rx->offs[nums[i]].end != -1)
93b32b6d 5223 {
49d7dfbc 5224 ret = newSVpvs("");
288b8c02 5225 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5226 if (!retarray)
5227 return ret;
5228 } else {
5229 ret = newSVsv(&PL_sv_undef);
5230 }
ec83ea38 5231 if (retarray)
93b32b6d 5232 av_push(retarray, ret);
81714fb9 5233 }
93b32b6d 5234 if (retarray)
ad64d0ec 5235 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5236 }
5237 }
5238 return NULL;
5239}
5240
5241bool
288b8c02 5242Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5243 const U32 flags)
5244{
288b8c02 5245 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5246
5247 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5248
5daac39c 5249 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5250 if (flags & RXapif_ALL) {
5daac39c 5251 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5252 } else {
288b8c02 5253 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5254 if (sv) {
5255 SvREFCNT_dec(sv);
192b9cd1
AB
5256 return TRUE;
5257 } else {
5258 return FALSE;
5259 }
5260 }
5261 } else {
5262 return FALSE;
5263 }
5264}
5265
5266SV*
288b8c02 5267Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5268{
288b8c02 5269 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5270
5271 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5272
5daac39c
NC
5273 if ( rx && RXp_PAREN_NAMES(rx) ) {
5274 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5275
288b8c02 5276 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5277 } else {
5278 return FALSE;
5279 }
192b9cd1
AB
5280}
5281
5282SV*
288b8c02 5283Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5284{
288b8c02 5285 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5286 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5287
5288 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5289
5daac39c
NC
5290 if (rx && RXp_PAREN_NAMES(rx)) {
5291 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5292 HE *temphe;
5293 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5294 IV i;
5295 IV parno = 0;
5296 SV* sv_dat = HeVAL(temphe);
5297 I32 *nums = (I32*)SvPVX(sv_dat);
5298 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5299 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5300 rx->offs[nums[i]].start != -1 &&
5301 rx->offs[nums[i]].end != -1)
5302 {
5303 parno = nums[i];
5304 break;
5305 }
5306 }
f1b875a0 5307 if (parno || flags & RXapif_ALL) {
a663657d 5308 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5309 }
81714fb9
YO
5310 }
5311 }
44a2ac75
YO
5312 return NULL;
5313}
5314
192b9cd1 5315SV*
288b8c02 5316Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5317{
5318 SV *ret;
5319 AV *av;
5320 I32 length;
288b8c02 5321 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5322
7918f24d
NC
5323 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5324
5daac39c 5325 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5326 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5327 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5328 } else if (flags & RXapif_ONE) {
288b8c02 5329 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5330 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5331 length = av_len(av);
ec83ea38 5332 SvREFCNT_dec(ret);
192b9cd1
AB
5333 return newSViv(length + 1);
5334 } else {
5335 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5336 return NULL;
5337 }
5338 }
5339 return &PL_sv_undef;
5340}
5341
5342SV*
288b8c02 5343Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5344{
288b8c02 5345 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5346 AV *av = newAV();
5347
7918f24d
NC
5348 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5349
5daac39c
NC
5350 if (rx && RXp_PAREN_NAMES(rx)) {
5351 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5352 HE *temphe;
5353 (void)hv_iterinit(hv);
5354 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5355 IV i;
5356 IV parno = 0;
5357 SV* sv_dat = HeVAL(temphe);
5358 I32 *nums = (I32*)SvPVX(sv_dat);
5359 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5360 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5361 rx->offs[nums[i]].start != -1 &&
5362 rx->offs[nums[i]].end != -1)
5363 {
5364 parno = nums[i];
5365 break;
5366 }
5367 }
f1b875a0 5368 if (parno || flags & RXapif_ALL) {
a663657d 5369 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5370 }
5371 }
5372 }
5373
ad64d0ec 5374 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5375}
5376
49d7dfbc 5377void
288b8c02
NC
5378Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5379 SV * const sv)
44a2ac75 5380{
288b8c02 5381 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5382 char *s = NULL;
a9d504c3 5383 I32 i = 0;
44a2ac75 5384 I32 s1, t1;
7918f24d
NC
5385
5386 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5387
cde0cee5
YO
5388 if (!rx->subbeg) {
5389 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5390 return;
cde0cee5
YO
5391 }
5392 else
f1b875a0 5393 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5394 /* $` */
f0ab9afb 5395 i = rx->offs[0].start;
cde0cee5 5396 s = rx->subbeg;
44a2ac75
YO
5397 }
5398 else
f1b875a0 5399 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5400 /* $' */
f0ab9afb
NC
5401 s = rx->subbeg + rx->offs[0].end;
5402 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5403 }
5404 else
5405 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5406 (s1 = rx->offs[paren].start) != -1 &&
5407 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5408 {
5409 /* $& $1 ... */
5410 i = t1 - s1;
5411 s = rx->subbeg + s1;
cde0cee5
YO
5412 } else {
5413 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5414 return;
cde0cee5
YO
5415 }
5416 assert(rx->sublen >= (s - rx->subbeg) + i );
5417 if (i >= 0) {
5418 const int oldtainted = PL_tainted;
5419 TAINT_NOT;
5420 sv_setpvn(sv, s, i);
5421 PL_tainted = oldtainted;
5422 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5423 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5424 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5425 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5426 {
5427 SvUTF8_on(sv);
5428 }
5429 else
5430 SvUTF8_off(sv);
5431 if (PL_tainting) {
07bc277f 5432 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5433 if (SvTYPE(sv) >= SVt_PVMG) {
5434 MAGIC* const mg = SvMAGIC(sv);
5435 MAGIC* mgt;
5436 PL_tainted = 1;
5437 SvMAGIC_set(sv, mg->mg_moremagic);
5438 SvTAINT(sv);
5439 if ((mgt = SvMAGIC(sv))) {
5440 mg->mg_moremagic = mgt;
5441 SvMAGIC_set(sv, mg);
44a2ac75 5442 }
cde0cee5
YO
5443 } else {
5444 PL_tainted = 1;
5445 SvTAINT(sv);
5446 }
5447 } else
5448 SvTAINTED_off(sv);
44a2ac75 5449 }
81714fb9 5450 } else {
44a2ac75 5451 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5452 return;
81714fb9
YO
5453 }
5454}
93b32b6d 5455
2fdbfb4d
AB
5456void
5457Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5458 SV const * const value)
5459{
7918f24d
NC
5460 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5461
2fdbfb4d
AB
5462 PERL_UNUSED_ARG(rx);
5463 PERL_UNUSED_ARG(paren);
5464 PERL_UNUSED_ARG(value);
5465
5466 if (!PL_localizing)
6ad8f254 5467 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5468}
5469
5470I32
288b8c02 5471Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5472 const I32 paren)
5473{
288b8c02 5474 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5475 I32 i;
5476 I32 s1, t1;
5477
7918f24d
NC
5478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5479
2fdbfb4d
AB
5480 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5481 switch (paren) {
192b9cd1 5482 /* $` / ${^PREMATCH} */
f1b875a0 5483 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5484 if (rx->offs[0].start != -1) {
5485 i = rx->offs[0].start;
5486 if (i > 0) {
5487 s1 = 0;
5488 t1 = i;
5489 goto getlen;
5490 }
5491 }
5492 return 0;
192b9cd1 5493 /* $' / ${^POSTMATCH} */
f1b875a0 5494 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5495 if (rx->offs[0].end != -1) {
5496 i = rx->sublen - rx->offs[0].end;
5497 if (i > 0) {
5498 s1 = rx->offs[0].end;
5499 t1 = rx->sublen;
5500 goto getlen;
5501 }
5502 }
5503 return 0;
192b9cd1
AB
5504 /* $& / ${^MATCH}, $1, $2, ... */
5505 default:
2fdbfb4d
AB
5506 if (paren <= (I32)rx->nparens &&
5507 (s1 = rx->offs[paren].start) != -1 &&
5508 (t1 = rx->offs[paren].end) != -1)
5509 {
5510 i = t1 - s1;
5511 goto getlen;
5512 } else {
5513 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5514 report_uninit((const SV *)sv);
2fdbfb4d
AB
5515 return 0;
5516 }
5517 }
5518 getlen:
07bc277f 5519 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5520 const char * const s = rx->subbeg + s1;
5521 const U8 *ep;
5522 STRLEN el;
5523
5524 i = t1 - s1;
5525 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5526 i = el;
5527 }
5528 return i;
5529}
5530
fe578d7f 5531SV*
49d7dfbc 5532Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5533{
7918f24d 5534 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5535 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5536 if (0)
5537 return NULL;
5538 else
5539 return newSVpvs("Regexp");
fe578d7f 5540}
0a4db386 5541
894be9b7 5542/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5543 * If flags is REG_RSN_RETURN_NULL returns null.
5544 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5545 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5546 * to the parsed name as looked up in the RExC_paren_names hash.
5547 * If there is an error throws a vFAIL().. type exception.
894be9b7 5548 */
0a4db386
YO
5549
5550#define REG_RSN_RETURN_NULL 0
5551#define REG_RSN_RETURN_NAME 1
5552#define REG_RSN_RETURN_DATA 2
5553
894be9b7 5554STATIC SV*
7918f24d
NC
5555S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5556{
894be9b7 5557 char *name_start = RExC_parse;
1f1031fe 5558
7918f24d
NC
5559 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5560
1f1031fe
YO
5561 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5562 /* skip IDFIRST by using do...while */
5563 if (UTF)
5564 do {
5565 RExC_parse += UTF8SKIP(RExC_parse);
5566 } while (isALNUM_utf8((U8*)RExC_parse));
5567 else
5568 do {
5569 RExC_parse++;
5570 } while (isALNUM(*RExC_parse));
894be9b7 5571 }
1f1031fe 5572
0a4db386 5573 if ( flags ) {
59cd0e26
NC
5574 SV* sv_name
5575 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5576 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5577 if ( flags == REG_RSN_RETURN_NAME)
5578 return sv_name;
5579 else if (flags==REG_RSN_RETURN_DATA) {
5580 HE *he_str = NULL;
5581 SV *sv_dat = NULL;
5582 if ( ! sv_name ) /* should not happen*/
5583 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5584 if (RExC_paren_names)
5585 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5586 if ( he_str )
5587 sv_dat = HeVAL(he_str);
5588 if ( ! sv_dat )
5589 vFAIL("Reference to nonexistent named group");
5590 return sv_dat;
5591 }
5592 else {
5593 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5594 }
5595 /* NOT REACHED */
894be9b7 5596 }
0a4db386 5597 return NULL;
894be9b7
YO
5598}
5599
3dab1dad
YO
5600#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5601 int rem=(int)(RExC_end - RExC_parse); \
5602 int cut; \
5603 int num; \
5604 int iscut=0; \
5605 if (rem>10) { \
5606 rem=10; \
5607 iscut=1; \
5608 } \
5609 cut=10-rem; \
5610 if (RExC_lastparse!=RExC_parse) \
5611 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5612 rem, RExC_parse, \
5613 cut + 4, \
5614 iscut ? "..." : "<" \
5615 ); \
5616 else \
5617 PerlIO_printf(Perl_debug_log,"%16s",""); \
5618 \
5619 if (SIZE_ONLY) \
3b57cd43 5620 num = RExC_size + 1; \
3dab1dad
YO
5621 else \
5622 num=REG_NODE_NUM(RExC_emit); \
5623 if (RExC_lastnum!=num) \
0a4db386 5624 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5625 else \
0a4db386 5626 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5627 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5628 (int)((depth*2)), "", \
3dab1dad
YO
5629 (funcname) \
5630 ); \
5631 RExC_lastnum=num; \
5632 RExC_lastparse=RExC_parse; \
5633})
5634
07be1b83
YO
5635
5636
3dab1dad
YO
5637#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5638 DEBUG_PARSE_MSG((funcname)); \
5639 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5640})
6bda09f9
YO
5641#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5642 DEBUG_PARSE_MSG((funcname)); \
5643 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5644})
a687059c
LW
5645/*
5646 - reg - regular expression, i.e. main body or parenthesized thing
5647 *
5648 * Caller must absorb opening parenthesis.
5649 *
5650 * Combining parenthesis handling with the base level of regular expression
5651 * is a trifle forced, but the need to tie the tails of the branches to what
5652 * follows makes it hard to avoid.
5653 */
07be1b83
YO
5654#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5655#ifdef DEBUGGING
5656#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5657#else
5658#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5659#endif
3dab1dad 5660
76e3520e 5661STATIC regnode *
3dab1dad 5662S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5663 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5664{
27da23d5 5665 dVAR;
c277df42
IZ
5666 register regnode *ret; /* Will be the head of the group. */
5667 register regnode *br;
5668 register regnode *lastbr;
cbbf8932 5669 register regnode *ender = NULL;
a0d0e21e 5670 register I32 parno = 0;
cbbf8932 5671 I32 flags;
f7819f85 5672 U32 oregflags = RExC_flags;
6136c704
AL
5673 bool have_branch = 0;
5674 bool is_open = 0;
594d7033
YO
5675 I32 freeze_paren = 0;
5676 I32 after_freeze = 0;
9d1d55b5
JP
5677
5678 /* for (?g), (?gc), and (?o) warnings; warning
5679 about (?c) will warn about (?g) -- japhy */
5680
6136c704
AL
5681#define WASTED_O 0x01
5682#define WASTED_G 0x02
5683#define WASTED_C 0x04
5684#define WASTED_GC (0x02|0x04)
cbbf8932 5685 I32 wastedflags = 0x00;
9d1d55b5 5686
fac92740 5687 char * parse_start = RExC_parse; /* MJD */
a28509cc 5688 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5689
3dab1dad 5690 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5691
5692 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5693 DEBUG_PARSE("reg ");
5694
821b33a5 5695 *flagp = 0; /* Tentatively. */
a0d0e21e 5696
9d1d55b5 5697
a0d0e21e
LW
5698 /* Make an OPEN node, if parenthesized. */
5699 if (paren) {
e2e6a0f1
YO
5700 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5701 char *start_verb = RExC_parse;
5702 STRLEN verb_len = 0;
5703 char *start_arg = NULL;
5704 unsigned char op = 0;
5705 int argok = 1;
5706 int internal_argval = 0; /* internal_argval is only useful if !argok */
5707 while ( *RExC_parse && *RExC_parse != ')' ) {
5708 if ( *RExC_parse == ':' ) {
5709 start_arg = RExC_parse + 1;
5710 break;
5711 }
5712 RExC_parse++;
5713 }
5714 ++start_verb;
5715 verb_len = RExC_parse - start_verb;
5716 if ( start_arg ) {
5717 RExC_parse++;
5718 while ( *RExC_parse && *RExC_parse != ')' )
5719 RExC_parse++;
5720 if ( *RExC_parse != ')' )
5721 vFAIL("Unterminated verb pattern argument");
5722 if ( RExC_parse == start_arg )
5723 start_arg = NULL;
5724 } else {
5725 if ( *RExC_parse != ')' )
5726 vFAIL("Unterminated verb pattern");
5727 }
5d458dd8 5728
e2e6a0f1
YO
5729 switch ( *start_verb ) {
5730 case 'A': /* (*ACCEPT) */
568a785a 5731 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5732 op = ACCEPT;
5733 internal_argval = RExC_nestroot;
5734 }
5735 break;
5736 case 'C': /* (*COMMIT) */
568a785a 5737 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5738 op = COMMIT;
e2e6a0f1
YO
5739 break;
5740 case 'F': /* (*FAIL) */
568a785a 5741 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5742 op = OPFAIL;
5743 argok = 0;
5744 }
5745 break;
5d458dd8
YO
5746 case ':': /* (*:NAME) */
5747 case 'M': /* (*MARK:NAME) */
568a785a 5748 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5749 op = MARKPOINT;
5d458dd8
YO
5750 argok = -1;
5751 }
5752 break;
5753 case 'P': /* (*PRUNE) */
568a785a 5754 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5755 op = PRUNE;
e2e6a0f1 5756 break;
5d458dd8 5757 case 'S': /* (*SKIP) */
568a785a 5758 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5759 op = SKIP;
5760 break;
5761 case 'T': /* (*THEN) */
5762 /* [19:06] <TimToady> :: is then */
568a785a 5763 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5764 op = CUTGROUP;
5765 RExC_seen |= REG_SEEN_CUTGROUP;
5766 }
e2e6a0f1
YO
5767 break;
5768 }
5769 if ( ! op ) {
5770 RExC_parse++;
5771 vFAIL3("Unknown verb pattern '%.*s'",
5772 verb_len, start_verb);
5773 }
5774 if ( argok ) {
5775 if ( start_arg && internal_argval ) {
5776 vFAIL3("Verb pattern '%.*s' may not have an argument",
5777 verb_len, start_verb);
5778 } else if ( argok < 0 && !start_arg ) {
5779 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5780 verb_len, start_verb);
5781 } else {
5782 ret = reganode(pRExC_state, op, internal_argval);
5783 if ( ! internal_argval && ! SIZE_ONLY ) {
5784 if (start_arg) {
5785 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5786 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5787 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5788 ret->flags = 0;
5789 } else {
5790 ret->flags = 1;
5791 }
5792 }
5793 }
5794 if (!internal_argval)
5795 RExC_seen |= REG_SEEN_VERBARG;
5796 } else if ( start_arg ) {
5797 vFAIL3("Verb pattern '%.*s' may not have an argument",
5798 verb_len, start_verb);
5799 } else {
5800 ret = reg_node(pRExC_state, op);
5801 }
5802 nextchar(pRExC_state);
5803 return ret;
5804 } else
fac92740 5805 if (*RExC_parse == '?') { /* (?...) */
6136c704 5806 bool is_logical = 0;
a28509cc 5807 const char * const seqstart = RExC_parse;
fb85c044 5808 bool has_use_defaults = FALSE;
ca9dfc88 5809
830247a4
IZ
5810 RExC_parse++;
5811 paren = *RExC_parse++;
c277df42 5812 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5813 switch (paren) {
894be9b7 5814
1f1031fe
YO
5815 case 'P': /* (?P...) variants for those used to PCRE/Python */
5816 paren = *RExC_parse++;
5817 if ( paren == '<') /* (?P<...>) named capture */
5818 goto named_capture;
5819 else if (paren == '>') { /* (?P>name) named recursion */
5820 goto named_recursion;
5821 }
5822 else if (paren == '=') { /* (?P=...) named backref */
5823 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5824 you change this make sure you change that */
5825 char* name_start = RExC_parse;
5826 U32 num = 0;
5827 SV *sv_dat = reg_scan_name(pRExC_state,
5828 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5829 if (RExC_parse == name_start || *RExC_parse != ')')
5830 vFAIL2("Sequence %.3s... not terminated",parse_start);
5831
5832 if (!SIZE_ONLY) {
5833 num = add_data( pRExC_state, 1, "S" );
5834 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5835 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5836 }
5837 RExC_sawback = 1;
4444fd9f
KW
5838 ret = reganode(pRExC_state,
5839 ((! FOLD)
5840 ? NREF
5841 : (UNI_SEMANTICS)
5842 ? NREFFU
5843 : (LOC)
5844 ? NREFFL
5845 : NREFF),
5846 num);
1f1031fe
YO
5847 *flagp |= HASWIDTH;
5848
5849 Set_Node_Offset(ret, parse_start+1);
5850 Set_Node_Cur_Length(ret); /* MJD */
5851
5852 nextchar(pRExC_state);
5853 return ret;
5854 }
57b84237
YO
5855 RExC_parse++;
5856 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5857 /*NOTREACHED*/
5858 case '<': /* (?<...) */
b81d288d 5859 if (*RExC_parse == '!')
c277df42 5860 paren = ',';
0a4db386 5861 else if (*RExC_parse != '=')
1f1031fe 5862 named_capture:
0a4db386 5863 { /* (?<...>) */
81714fb9 5864 char *name_start;
894be9b7 5865 SV *svname;
81714fb9
YO
5866 paren= '>';
5867 case '\'': /* (?'...') */
5868 name_start= RExC_parse;
0a4db386
YO
5869 svname = reg_scan_name(pRExC_state,
5870 SIZE_ONLY ? /* reverse test from the others */
5871 REG_RSN_RETURN_NAME :
5872 REG_RSN_RETURN_NULL);
57b84237
YO
5873 if (RExC_parse == name_start) {
5874 RExC_parse++;
5875 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876 /*NOTREACHED*/
5877 }
81714fb9
YO
5878 if (*RExC_parse != paren)
5879 vFAIL2("Sequence (?%c... not terminated",
5880 paren=='>' ? '<' : paren);
5881 if (SIZE_ONLY) {
e62cc96a
YO
5882 HE *he_str;
5883 SV *sv_dat = NULL;
486ec47a 5884 if (!svname) /* shouldn't happen */
894be9b7
YO
5885 Perl_croak(aTHX_
5886 "panic: reg_scan_name returned NULL");
81714fb9
YO
5887 if (!RExC_paren_names) {
5888 RExC_paren_names= newHV();
ad64d0ec 5889 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5890#ifdef DEBUGGING
5891 RExC_paren_name_list= newAV();
ad64d0ec 5892 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5893#endif
81714fb9
YO
5894 }
5895 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5896 if ( he_str )
81714fb9 5897 sv_dat = HeVAL(he_str);
e62cc96a 5898 if ( ! sv_dat ) {
81714fb9 5899 /* croak baby croak */
e62cc96a
YO
5900 Perl_croak(aTHX_
5901 "panic: paren_name hash element allocation failed");
5902 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5903 /* (?|...) can mean we have dupes so scan to check
5904 its already been stored. Maybe a flag indicating
5905 we are inside such a construct would be useful,
5906 but the arrays are likely to be quite small, so
5907 for now we punt -- dmq */
5908 IV count = SvIV(sv_dat);
5909 I32 *pv = (I32*)SvPVX(sv_dat);
5910 IV i;
5911 for ( i = 0 ; i < count ; i++ ) {
5912 if ( pv[i] == RExC_npar ) {
5913 count = 0;
5914 break;
5915 }
5916 }
5917 if ( count ) {
5918 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5919 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5920 pv[count] = RExC_npar;
3a92e6ae 5921 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5922 }
81714fb9
YO
5923 } else {
5924 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5925 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5926 SvIOK_on(sv_dat);
3ec35e0f 5927 SvIV_set(sv_dat, 1);
e62cc96a 5928 }
1f1031fe
YO
5929#ifdef DEBUGGING
5930 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5931 SvREFCNT_dec(svname);
5932#endif
e62cc96a 5933
81714fb9
YO
5934 /*sv_dump(sv_dat);*/
5935 }
5936 nextchar(pRExC_state);
5937 paren = 1;
5938 goto capturing_parens;
5939 }
5940 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5941 RExC_parse++;
fac92740 5942 case '=': /* (?=...) */
89c6a13e 5943 RExC_seen_zerolen++;
5c3fa2e7 5944 break;
fac92740 5945 case '!': /* (?!...) */
830247a4 5946 RExC_seen_zerolen++;
e2e6a0f1
YO
5947 if (*RExC_parse == ')') {
5948 ret=reg_node(pRExC_state, OPFAIL);
5949 nextchar(pRExC_state);
5950 return ret;
5951 }
594d7033
YO
5952 break;
5953 case '|': /* (?|...) */
5954 /* branch reset, behave like a (?:...) except that
5955 buffers in alternations share the same numbers */
5956 paren = ':';
5957 after_freeze = freeze_paren = RExC_npar;
5958 break;
fac92740
MJD
5959 case ':': /* (?:...) */
5960 case '>': /* (?>...) */
a0d0e21e 5961 break;
fac92740
MJD
5962 case '$': /* (?$...) */
5963 case '@': /* (?@...) */
8615cb43 5964 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5965 break;
fac92740 5966 case '#': /* (?#...) */
830247a4
IZ
5967 while (*RExC_parse && *RExC_parse != ')')
5968 RExC_parse++;
5969 if (*RExC_parse != ')')
c277df42 5970 FAIL("Sequence (?#... not terminated");
830247a4 5971 nextchar(pRExC_state);
a0d0e21e
LW
5972 *flagp = TRYAGAIN;
5973 return NULL;
894be9b7
YO
5974 case '0' : /* (?0) */
5975 case 'R' : /* (?R) */
5976 if (*RExC_parse != ')')
6bda09f9 5977 FAIL("Sequence (?R) not terminated");
1a147d38 5978 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5979 *flagp |= POSTPONED;
7f69552c
YO
5980 nextchar(pRExC_state);
5981 return ret;
5982 /*notreached*/
894be9b7
YO
5983 { /* named and numeric backreferences */
5984 I32 num;
894be9b7
YO
5985 case '&': /* (?&NAME) */
5986 parse_start = RExC_parse - 1;
1f1031fe 5987 named_recursion:
894be9b7 5988 {
0a4db386
YO
5989 SV *sv_dat = reg_scan_name(pRExC_state,
5990 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5991 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5992 }
5993 goto gen_recurse_regop;
5994 /* NOT REACHED */
542fa716
YO
5995 case '+':
5996 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5997 RExC_parse++;
5998 vFAIL("Illegal pattern");
5999 }
6000 goto parse_recursion;
6001 /* NOT REACHED*/
6002 case '-': /* (?-1) */
6003 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6004 RExC_parse--; /* rewind to let it be handled later */
6005 goto parse_flags;
6006 }
6007 /*FALLTHROUGH */
6bda09f9
YO
6008 case '1': case '2': case '3': case '4': /* (?1) */
6009 case '5': case '6': case '7': case '8': case '9':
6010 RExC_parse--;
542fa716 6011 parse_recursion:
894be9b7
YO
6012 num = atoi(RExC_parse);
6013 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6014 if (*RExC_parse == '-')
6015 RExC_parse++;
6bda09f9
YO
6016 while (isDIGIT(*RExC_parse))
6017 RExC_parse++;
6018 if (*RExC_parse!=')')
6019 vFAIL("Expecting close bracket");
894be9b7
YO
6020
6021 gen_recurse_regop:
542fa716
YO
6022 if ( paren == '-' ) {
6023 /*
6024 Diagram of capture buffer numbering.
6025 Top line is the normal capture buffer numbers
3b753521 6026 Bottom line is the negative indexing as from
542fa716
YO
6027 the X (the (?-2))
6028
6029 + 1 2 3 4 5 X 6 7
6030 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6031 - 5 4 3 2 1 X x x
6032
6033 */
6034 num = RExC_npar + num;
6035 if (num < 1) {
6036 RExC_parse++;
6037 vFAIL("Reference to nonexistent group");
6038 }
6039 } else if ( paren == '+' ) {
6040 num = RExC_npar + num - 1;
6041 }
6042
1a147d38 6043 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6044 if (!SIZE_ONLY) {
6045 if (num > (I32)RExC_rx->nparens) {
6046 RExC_parse++;
6047 vFAIL("Reference to nonexistent group");
6048 }
40d049e4 6049 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6050 RExC_emit++;
226de585 6051 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6052 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6053 } else {
6bda09f9 6054 RExC_size++;
6bda09f9 6055 }
0a4db386 6056 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6057 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6058 Set_Node_Offset(ret, parse_start); /* MJD */
6059
a3b492c3 6060 *flagp |= POSTPONED;
6bda09f9
YO
6061 nextchar(pRExC_state);
6062 return ret;
894be9b7
YO
6063 } /* named and numeric backreferences */
6064 /* NOT REACHED */
6065
fac92740 6066 case '?': /* (??...) */
6136c704 6067 is_logical = 1;
57b84237
YO
6068 if (*RExC_parse != '{') {
6069 RExC_parse++;
6070 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6071 /*NOTREACHED*/
6072 }
a3b492c3 6073 *flagp |= POSTPONED;
830247a4 6074 paren = *RExC_parse++;
0f5d15d6 6075 /* FALL THROUGH */
fac92740 6076 case '{': /* (?{...}) */
c277df42 6077 {
2eccd3b2
NC
6078 I32 count = 1;
6079 U32 n = 0;
c277df42 6080 char c;
830247a4 6081 char *s = RExC_parse;
c277df42 6082
830247a4
IZ
6083 RExC_seen_zerolen++;
6084 RExC_seen |= REG_SEEN_EVAL;
6085 while (count && (c = *RExC_parse)) {
6136c704
AL
6086 if (c == '\\') {
6087 if (RExC_parse[1])
6088 RExC_parse++;
6089 }
b81d288d 6090 else if (c == '{')
c277df42 6091 count++;
b81d288d 6092 else if (c == '}')
c277df42 6093 count--;
830247a4 6094 RExC_parse++;
c277df42 6095 }
6136c704 6096 if (*RExC_parse != ')') {
b81d288d 6097 RExC_parse = s;
b45f050a
JF
6098 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6099 }
c277df42 6100 if (!SIZE_ONLY) {
f3548bdc 6101 PAD *pad;
6136c704
AL
6102 OP_4tree *sop, *rop;
6103 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6104
569233ed
SB
6105 ENTER;
6106 Perl_save_re_context(aTHX);
d59a8b3e 6107 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6108 sop->op_private |= OPpREFCOUNTED;
6109 /* re_dup will OpREFCNT_inc */
6110 OpREFCNT_set(sop, 1);
569233ed 6111 LEAVE;
c277df42 6112
830247a4 6113 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6114 RExC_rxi->data->data[n] = (void*)rop;
6115 RExC_rxi->data->data[n+1] = (void*)sop;
6116 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6117 SvREFCNT_dec(sv);
a0ed51b3 6118 }
e24b16f9 6119 else { /* First pass */
830247a4 6120 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6121 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6122 /* No compiled RE interpolated, has runtime
6123 components ===> unsafe. */
6124 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6125 if (PL_tainting && PL_tainted)
cc6b7395 6126 FAIL("Eval-group in insecure regular expression");
54df2634 6127#if PERL_VERSION > 8
923e4eb5 6128 if (IN_PERL_COMPILETIME)
b5c19bd7 6129 PL_cv_has_eval = 1;
54df2634 6130#endif
c277df42 6131 }
b5c19bd7 6132
830247a4 6133 nextchar(pRExC_state);
6136c704 6134 if (is_logical) {
830247a4 6135 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6136 if (!SIZE_ONLY)
6137 ret->flags = 2;
3dab1dad 6138 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6139 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6140 return ret;
6141 }
ccb2c380
MP
6142 ret = reganode(pRExC_state, EVAL, n);
6143 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6144 Set_Node_Offset(ret, parse_start);
6145 return ret;
c277df42 6146 }
fac92740 6147 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6148 {
0a4db386 6149 int is_define= 0;
fac92740 6150 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6151 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6152 || RExC_parse[1] == '<'
830247a4 6153 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6154 I32 flag;
6155
830247a4 6156 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6157 if (!SIZE_ONLY)
6158 ret->flags = 1;
3dab1dad 6159 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6160 goto insert_if;
b81d288d 6161 }
a0ed51b3 6162 }
0a4db386
YO
6163 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6164 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6165 {
6166 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6167 char *name_start= RExC_parse++;
2eccd3b2 6168 U32 num = 0;
0a4db386
YO
6169 SV *sv_dat=reg_scan_name(pRExC_state,
6170 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6171 if (RExC_parse == name_start || *RExC_parse != ch)
6172 vFAIL2("Sequence (?(%c... not terminated",
6173 (ch == '>' ? '<' : ch));
6174 RExC_parse++;
6175 if (!SIZE_ONLY) {
6176 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6177 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6178 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6179 }
6180 ret = reganode(pRExC_state,NGROUPP,num);
6181 goto insert_if_check_paren;
6182 }
6183 else if (RExC_parse[0] == 'D' &&
6184 RExC_parse[1] == 'E' &&
6185 RExC_parse[2] == 'F' &&
6186 RExC_parse[3] == 'I' &&
6187 RExC_parse[4] == 'N' &&
6188 RExC_parse[5] == 'E')
6189 {
6190 ret = reganode(pRExC_state,DEFINEP,0);
6191 RExC_parse +=6 ;
6192 is_define = 1;
6193 goto insert_if_check_paren;
6194 }
6195 else if (RExC_parse[0] == 'R') {
6196 RExC_parse++;
6197 parno = 0;
6198 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6199 parno = atoi(RExC_parse++);
6200 while (isDIGIT(*RExC_parse))
6201 RExC_parse++;
6202 } else if (RExC_parse[0] == '&') {
6203 SV *sv_dat;
6204 RExC_parse++;
6205 sv_dat = reg_scan_name(pRExC_state,
6206 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6207 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6208 }
1a147d38 6209 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6210 goto insert_if_check_paren;
6211 }
830247a4 6212 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6213 /* (?(1)...) */
6136c704 6214 char c;
830247a4 6215 parno = atoi(RExC_parse++);
c277df42 6216
830247a4
IZ
6217 while (isDIGIT(*RExC_parse))
6218 RExC_parse++;
fac92740 6219 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6220
0a4db386 6221 insert_if_check_paren:
830247a4 6222 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6223 vFAIL("Switch condition not recognized");
c277df42 6224 insert_if:
3dab1dad
YO
6225 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6226 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6227 if (br == NULL)
830247a4 6228 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6229 else
3dab1dad 6230 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6231 c = *nextchar(pRExC_state);
d1b80229
IZ
6232 if (flags&HASWIDTH)
6233 *flagp |= HASWIDTH;
c277df42 6234 if (c == '|') {
0a4db386
YO
6235 if (is_define)
6236 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6237 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6238 regbranch(pRExC_state, &flags, 1,depth+1);
6239 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6240 if (flags&HASWIDTH)
6241 *flagp |= HASWIDTH;
830247a4 6242 c = *nextchar(pRExC_state);
a0ed51b3
LW
6243 }
6244 else
c277df42
IZ
6245 lastbr = NULL;
6246 if (c != ')')
8615cb43 6247 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6248 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6249 REGTAIL(pRExC_state, br, ender);
c277df42 6250 if (lastbr) {
3dab1dad
YO
6251 REGTAIL(pRExC_state, lastbr, ender);
6252 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6253 }
6254 else
3dab1dad 6255 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6256 RExC_size++; /* XXX WHY do we need this?!!
6257 For large programs it seems to be required
6258 but I can't figure out why. -- dmq*/
c277df42 6259 return ret;
a0ed51b3
LW
6260 }
6261 else {
830247a4 6262 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6263 }
6264 }
1b1626e4 6265 case 0:
830247a4 6266 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6267 vFAIL("Sequence (? incomplete");
1b1626e4 6268 break;
85508812
KW
6269 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6270 that follow */
fb85c044
KW
6271 has_use_defaults = TRUE;
6272 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7b98bc43
KW
6273 if (RExC_utf8) { /* But the default for a utf8 pattern is
6274 unicode semantics */
a62b1201 6275 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
7b98bc43 6276 }
fb85c044 6277 goto parse_flags;
a0d0e21e 6278 default:
cde0cee5
YO
6279 --RExC_parse;
6280 parse_flags: /* (?i) */
6281 {
6282 U32 posflags = 0, negflags = 0;
6283 U32 *flagsp = &posflags;
9de15fec 6284 bool has_charset_modifier = 0;
a62b1201 6285 regex_charset cs = REGEX_DEPENDS_CHARSET;
cde0cee5
YO
6286
6287 while (*RExC_parse) {
6288 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6289 /* (?g), (?gc) and (?o) are useless here
6290 and must be globally applied -- japhy */
cde0cee5
YO
6291 switch (*RExC_parse) {
6292 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6293 case LOCALE_PAT_MOD:
6294 if (has_charset_modifier || flagsp == &negflags) {
6295 goto fail_modifiers;
6296 }
a62b1201 6297 cs = REGEX_LOCALE_CHARSET;
9de15fec
KW
6298 has_charset_modifier = 1;
6299 break;
6300 case UNICODE_PAT_MOD:
6301 if (has_charset_modifier || flagsp == &negflags) {
6302 goto fail_modifiers;
6303 }
a62b1201 6304 cs = REGEX_UNICODE_CHARSET;
9de15fec
KW
6305 has_charset_modifier = 1;
6306 break;
cfaf538b
KW
6307 case ASCII_RESTRICT_PAT_MOD:
6308 if (has_charset_modifier || flagsp == &negflags) {
6309 goto fail_modifiers;
6310 }
6311 cs = REGEX_ASCII_RESTRICTED_CHARSET;
6312 has_charset_modifier = 1;
6313 break;
50e91148 6314 case DEPENDS_PAT_MOD:
9de15fec
KW
6315 if (has_use_defaults
6316 || has_charset_modifier
6317 || flagsp == &negflags)
6318 {
6319 goto fail_modifiers;
6320 }
7b98bc43
KW
6321
6322 /* The dual charset means unicode semantics if the
6323 * pattern (or target, not known until runtime) are
6324 * utf8 */
a62b1201
KW
6325 cs = (RExC_utf8)
6326 ? REGEX_UNICODE_CHARSET
6327 : REGEX_DEPENDS_CHARSET;
9de15fec
KW
6328 has_charset_modifier = 1;
6329 break;
f7819f85
A
6330 case ONCE_PAT_MOD: /* 'o' */
6331 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6333 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6334 if (! (wastedflags & wflagbit) ) {
6335 wastedflags |= wflagbit;
6336 vWARN5(
6337 RExC_parse + 1,
6338 "Useless (%s%c) - %suse /%c modifier",
6339 flagsp == &negflags ? "?-" : "?",
6340 *RExC_parse,
6341 flagsp == &negflags ? "don't " : "",
6342 *RExC_parse
6343 );
6344 }
6345 }
cde0cee5
YO
6346 break;
6347
f7819f85 6348 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6349 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6350 if (! (wastedflags & WASTED_C) ) {
6351 wastedflags |= WASTED_GC;
9d1d55b5
JP
6352 vWARN3(
6353 RExC_parse + 1,
6354 "Useless (%sc) - %suse /gc modifier",
6355 flagsp == &negflags ? "?-" : "?",
6356 flagsp == &negflags ? "don't " : ""
6357 );
6358 }
6359 }
cde0cee5 6360 break;
f7819f85 6361 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6362 if (flagsp == &negflags) {
668c081a
NC
6363 if (SIZE_ONLY)
6364 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6365 } else {
6366 *flagsp |= RXf_PMf_KEEPCOPY;
6367 }
6368 break;
6369 case '-':
3b753521 6370 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6371 * if there is a minus, it means will be trying to
6372 * re-specify a default which is an error */
6373 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6374 fail_modifiers:
57b84237
YO
6375 RExC_parse++;
6376 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6377 /*NOTREACHED*/
6378 }
cde0cee5
YO
6379 flagsp = &negflags;
6380 wastedflags = 0; /* reset so (?g-c) warns twice */
6381 break;
6382 case ':':
6383 paren = ':';
6384 /*FALLTHROUGH*/
6385 case ')':
6386 RExC_flags |= posflags;
6387 RExC_flags &= ~negflags;
a62b1201 6388 set_regex_charset(&RExC_flags, cs);
f7819f85
A
6389 if (paren != ':') {
6390 oregflags |= posflags;
6391 oregflags &= ~negflags;
a62b1201 6392 set_regex_charset(&oregflags, cs);
f7819f85 6393 }
cde0cee5
YO
6394 nextchar(pRExC_state);
6395 if (paren != ':') {
6396 *flagp = TRYAGAIN;
6397 return NULL;
6398 } else {
6399 ret = NULL;
6400 goto parse_rest;
6401 }
6402 /*NOTREACHED*/
6403 default:
cde0cee5
YO
6404 RExC_parse++;
6405 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6406 /*NOTREACHED*/
6407 }
830247a4 6408 ++RExC_parse;
48c036b1 6409 }
cde0cee5 6410 }} /* one for the default block, one for the switch */
a0d0e21e 6411 }
fac92740 6412 else { /* (...) */
81714fb9 6413 capturing_parens:
830247a4
IZ
6414 parno = RExC_npar;
6415 RExC_npar++;
e2e6a0f1 6416
830247a4 6417 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6418 if (!SIZE_ONLY ){
6419 if (!RExC_nestroot)
6420 RExC_nestroot = parno;
c009da3d
YO
6421 if (RExC_seen & REG_SEEN_RECURSE
6422 && !RExC_open_parens[parno-1])
6423 {
e2e6a0f1 6424 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6425 "Setting open paren #%"IVdf" to %d\n",
6426 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6427 RExC_open_parens[parno-1]= ret;
6428 }
6bda09f9 6429 }
fac92740
MJD
6430 Set_Node_Length(ret, 1); /* MJD */
6431 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6432 is_open = 1;
a0d0e21e 6433 }
a0ed51b3 6434 }
fac92740 6435 else /* ! paren */
a0d0e21e 6436 ret = NULL;
cde0cee5
YO
6437
6438 parse_rest:
a0d0e21e 6439 /* Pick up the branches, linking them together. */
fac92740 6440 parse_start = RExC_parse; /* MJD */
3dab1dad 6441 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6442
6443 if (freeze_paren) {
6444 if (RExC_npar > after_freeze)
6445 after_freeze = RExC_npar;
6446 RExC_npar = freeze_paren;
6447 }
6448
fac92740 6449 /* branch_len = (paren != 0); */
2af232bd 6450
a0d0e21e
LW
6451 if (br == NULL)
6452 return(NULL);
830247a4
IZ
6453 if (*RExC_parse == '|') {
6454 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6455 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6456 }
fac92740 6457 else { /* MJD */
6bda09f9 6458 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6459 Set_Node_Length(br, paren != 0);
6460 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6461 }
c277df42
IZ
6462 have_branch = 1;
6463 if (SIZE_ONLY)
830247a4 6464 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6465 }
6466 else if (paren == ':') {
c277df42
IZ
6467 *flagp |= flags&SIMPLE;
6468 }
6136c704 6469 if (is_open) { /* Starts with OPEN. */
3dab1dad 6470 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6471 }
6472 else if (paren != '?') /* Not Conditional */
a0d0e21e 6473 ret = br;
8ae10a67 6474 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6475 lastbr = br;
830247a4
IZ
6476 while (*RExC_parse == '|') {
6477 if (!SIZE_ONLY && RExC_extralen) {
6478 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6479 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6480 }
6481 if (SIZE_ONLY)
830247a4
IZ
6482 RExC_extralen += 2; /* Account for LONGJMP. */
6483 nextchar(pRExC_state);
594d7033
YO
6484 if (freeze_paren) {
6485 if (RExC_npar > after_freeze)
6486 after_freeze = RExC_npar;
6487 RExC_npar = freeze_paren;
6488 }
3dab1dad 6489 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6490
a687059c 6491 if (br == NULL)
a0d0e21e 6492 return(NULL);
3dab1dad 6493 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6494 lastbr = br;
8ae10a67 6495 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6496 }
6497
c277df42
IZ
6498 if (have_branch || paren != ':') {
6499 /* Make a closing node, and hook it on the end. */
6500 switch (paren) {
6501 case ':':
830247a4 6502 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6503 break;
6504 case 1:
830247a4 6505 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6506 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6507 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6508 "Setting close paren #%"IVdf" to %d\n",
6509 (IV)parno, REG_NODE_NUM(ender)));
6510 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6511 if (RExC_nestroot == parno)
6512 RExC_nestroot = 0;
40d049e4 6513 }
fac92740
MJD
6514 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6515 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6516 break;
6517 case '<':
c277df42
IZ
6518 case ',':
6519 case '=':
6520 case '!':
c277df42 6521 *flagp &= ~HASWIDTH;
821b33a5
IZ
6522 /* FALL THROUGH */
6523 case '>':
830247a4 6524 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6525 break;
6526 case 0:
830247a4 6527 ender = reg_node(pRExC_state, END);
40d049e4
YO
6528 if (!SIZE_ONLY) {
6529 assert(!RExC_opend); /* there can only be one! */
6530 RExC_opend = ender;
6531 }
c277df42
IZ
6532 break;
6533 }
eaf3ca90 6534 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6535
9674d46a 6536 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6537 if (depth==1)
6538 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6539
c277df42 6540 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6541 for (br = ret; br; br = regnext(br)) {
6542 const U8 op = PL_regkind[OP(br)];
6543 if (op == BRANCH) {
07be1b83 6544 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6545 }
6546 else if (op == BRANCHJ) {
07be1b83 6547 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6548 }
c277df42
IZ
6549 }
6550 }
a0d0e21e 6551 }
c277df42
IZ
6552
6553 {
e1ec3a88
AL
6554 const char *p;
6555 static const char parens[] = "=!<,>";
c277df42
IZ
6556
6557 if (paren && (p = strchr(parens, paren))) {
eb160463 6558 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6559 int flag = (p - parens) > 1;
6560
6561 if (paren == '>')
6562 node = SUSPEND, flag = 0;
6bda09f9 6563 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6564 Set_Node_Cur_Length(ret);
6565 Set_Node_Offset(ret, parse_start + 1);
c277df42 6566 ret->flags = flag;
07be1b83 6567 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6568 }
a0d0e21e
LW
6569 }
6570
6571 /* Check for proper termination. */
ce3e6498 6572 if (paren) {
e2509266 6573 RExC_flags = oregflags;
830247a4
IZ
6574 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6575 RExC_parse = oregcomp_parse;
380a0633 6576 vFAIL("Unmatched (");
ce3e6498 6577 }
a0ed51b3 6578 }
830247a4
IZ
6579 else if (!paren && RExC_parse < RExC_end) {
6580 if (*RExC_parse == ')') {
6581 RExC_parse++;
380a0633 6582 vFAIL("Unmatched )");
a0ed51b3
LW
6583 }
6584 else
b45f050a 6585 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6586 /* NOTREACHED */
6587 }
594d7033
YO
6588 if (after_freeze)
6589 RExC_npar = after_freeze;
a0d0e21e 6590 return(ret);
a687059c
LW
6591}
6592
6593/*
6594 - regbranch - one alternative of an | operator
6595 *
6596 * Implements the concatenation operator.
6597 */
76e3520e 6598STATIC regnode *
3dab1dad 6599S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6600{
97aff369 6601 dVAR;
c277df42
IZ
6602 register regnode *ret;
6603 register regnode *chain = NULL;
6604 register regnode *latest;
6605 I32 flags = 0, c = 0;
3dab1dad 6606 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6607
6608 PERL_ARGS_ASSERT_REGBRANCH;
6609
3dab1dad 6610 DEBUG_PARSE("brnc");
02daf0ab 6611
b81d288d 6612 if (first)
c277df42
IZ
6613 ret = NULL;
6614 else {
b81d288d 6615 if (!SIZE_ONLY && RExC_extralen)
830247a4 6616 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6617 else {
830247a4 6618 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6619 Set_Node_Length(ret, 1);
6620 }
c277df42
IZ
6621 }
6622
b81d288d 6623 if (!first && SIZE_ONLY)
830247a4 6624 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6625
c277df42 6626 *flagp = WORST; /* Tentatively. */
a0d0e21e 6627
830247a4
IZ
6628 RExC_parse--;
6629 nextchar(pRExC_state);
6630 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6631 flags &= ~TRYAGAIN;
3dab1dad 6632 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6633 if (latest == NULL) {
6634 if (flags & TRYAGAIN)
6635 continue;
6636 return(NULL);
a0ed51b3
LW
6637 }
6638 else if (ret == NULL)
c277df42 6639 ret = latest;
8ae10a67 6640 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6641 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6642 *flagp |= flags&SPSTART;
6643 else {
830247a4 6644 RExC_naughty++;
3dab1dad 6645 REGTAIL(pRExC_state, chain, latest);
a687059c 6646 }
a0d0e21e 6647 chain = latest;
c277df42
IZ
6648 c++;
6649 }
6650 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6651 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6652 if (ret == NULL)
6653 ret = chain;
6654 }
6655 if (c == 1) {
6656 *flagp |= flags&SIMPLE;
a0d0e21e 6657 }
a687059c 6658
d4c19fe8 6659 return ret;
a687059c
LW
6660}
6661
6662/*
6663 - regpiece - something followed by possible [*+?]
6664 *
6665 * Note that the branching code sequences used for ? and the general cases
6666 * of * and + are somewhat optimized: they use the same NOTHING node as
6667 * both the endmarker for their branch list and the body of the last branch.
6668 * It might seem that this node could be dispensed with entirely, but the
6669 * endmarker role is not redundant.
6670 */
76e3520e 6671STATIC regnode *
3dab1dad 6672S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6673{
97aff369 6674 dVAR;
c277df42 6675 register regnode *ret;
a0d0e21e
LW
6676 register char op;
6677 register char *next;
6678 I32 flags;
1df70142 6679 const char * const origparse = RExC_parse;
a0d0e21e 6680 I32 min;
c277df42 6681 I32 max = REG_INFTY;
fac92740 6682 char *parse_start;
10edeb5d 6683 const char *maxpos = NULL;
3dab1dad 6684 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6685
6686 PERL_ARGS_ASSERT_REGPIECE;
6687
3dab1dad 6688 DEBUG_PARSE("piec");
a0d0e21e 6689
3dab1dad 6690 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6691 if (ret == NULL) {
6692 if (flags & TRYAGAIN)
6693 *flagp |= TRYAGAIN;
6694 return(NULL);
6695 }
6696
830247a4 6697 op = *RExC_parse;
a0d0e21e 6698
830247a4 6699 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6700 maxpos = NULL;
fac92740 6701 parse_start = RExC_parse; /* MJD */
830247a4 6702 next = RExC_parse + 1;
a0d0e21e
LW
6703 while (isDIGIT(*next) || *next == ',') {
6704 if (*next == ',') {
6705 if (maxpos)
6706 break;
6707 else
6708 maxpos = next;
a687059c 6709 }
a0d0e21e
LW
6710 next++;
6711 }
6712 if (*next == '}') { /* got one */
6713 if (!maxpos)
6714 maxpos = next;
830247a4
IZ
6715 RExC_parse++;
6716 min = atoi(RExC_parse);
a0d0e21e
LW
6717 if (*maxpos == ',')
6718 maxpos++;
6719 else
830247a4 6720 maxpos = RExC_parse;
a0d0e21e
LW
6721 max = atoi(maxpos);
6722 if (!max && *maxpos != '0')
c277df42
IZ
6723 max = REG_INFTY; /* meaning "infinity" */
6724 else if (max >= REG_INFTY)
8615cb43 6725 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6726 RExC_parse = next;
6727 nextchar(pRExC_state);
a0d0e21e
LW
6728
6729 do_curly:
6730 if ((flags&SIMPLE)) {
830247a4 6731 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6732 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6733 Set_Node_Offset(ret, parse_start+1); /* MJD */
6734 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6735 }
6736 else {
3dab1dad 6737 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6738
6739 w->flags = 0;
3dab1dad 6740 REGTAIL(pRExC_state, ret, w);
830247a4 6741 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6742 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6743 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6744 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6745 }
6bda09f9 6746 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6747 /* MJD hk */
6748 Set_Node_Offset(ret, parse_start+1);
2af232bd 6749 Set_Node_Length(ret,
fac92740 6750 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6751
830247a4 6752 if (!SIZE_ONLY && RExC_extralen)
c277df42 6753 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6754 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6755 if (SIZE_ONLY)
830247a4
IZ
6756 RExC_whilem_seen++, RExC_extralen += 3;
6757 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6758 }
c277df42 6759 ret->flags = 0;
a0d0e21e
LW
6760
6761 if (min > 0)
821b33a5
IZ
6762 *flagp = WORST;
6763 if (max > 0)
6764 *flagp |= HASWIDTH;
8fa23287 6765 if (max < min)
8615cb43 6766 vFAIL("Can't do {n,m} with n > m");
c277df42 6767 if (!SIZE_ONLY) {
eb160463
GS
6768 ARG1_SET(ret, (U16)min);
6769 ARG2_SET(ret, (U16)max);
a687059c 6770 }
a687059c 6771
a0d0e21e 6772 goto nest_check;
a687059c 6773 }
a0d0e21e 6774 }
a687059c 6775
a0d0e21e
LW
6776 if (!ISMULT1(op)) {
6777 *flagp = flags;
a687059c 6778 return(ret);
a0d0e21e 6779 }
bb20fd44 6780
c277df42 6781#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6782
6783 /* if this is reinstated, don't forget to put this back into perldiag:
6784
6785 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6786
6787 (F) The part of the regexp subject to either the * or + quantifier
6788 could match an empty string. The {#} shows in the regular
6789 expression about where the problem was discovered.
6790
6791 */
6792
bb20fd44 6793 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6794 vFAIL("Regexp *+ operand could be empty");
b81d288d 6795#endif
bb20fd44 6796
fac92740 6797 parse_start = RExC_parse;
830247a4 6798 nextchar(pRExC_state);
a0d0e21e 6799
821b33a5 6800 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6801
6802 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6803 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6804 ret->flags = 0;
830247a4 6805 RExC_naughty += 4;
a0d0e21e
LW
6806 }
6807 else if (op == '*') {
6808 min = 0;
6809 goto do_curly;
a0ed51b3
LW
6810 }
6811 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6812 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6813 ret->flags = 0;
830247a4 6814 RExC_naughty += 3;
a0d0e21e
LW
6815 }
6816 else if (op == '+') {
6817 min = 1;
6818 goto do_curly;
a0ed51b3
LW
6819 }
6820 else if (op == '?') {
a0d0e21e
LW
6821 min = 0; max = 1;
6822 goto do_curly;
6823 }
6824 nest_check:
668c081a
NC
6825 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6826 ckWARN3reg(RExC_parse,
6827 "%.*s matches null string many times",
6828 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6829 origparse);
a0d0e21e
LW
6830 }
6831
b9b4dddf 6832 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6833 nextchar(pRExC_state);
6bda09f9 6834 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6835 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6836 }
b9b4dddf
YO
6837#ifndef REG_ALLOW_MINMOD_SUSPEND
6838 else
6839#endif
6840 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6841 regnode *ender;
6842 nextchar(pRExC_state);
6843 ender = reg_node(pRExC_state, SUCCEED);
6844 REGTAIL(pRExC_state, ret, ender);
6845 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6846 ret->flags = 0;
6847 ender = reg_node(pRExC_state, TAIL);
6848 REGTAIL(pRExC_state, ret, ender);
6849 /*ret= ender;*/
6850 }
6851
6852 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6853 RExC_parse++;
b45f050a
JF
6854 vFAIL("Nested quantifiers");
6855 }
a0d0e21e
LW
6856
6857 return(ret);
a687059c
LW
6858}
6859
fc8cd66c
YO
6860
6861/* reg_namedseq(pRExC_state,UVp)
6862
6863 This is expected to be called by a parser routine that has
afefe6bf 6864 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6865 expected to point at the first char following the N at the time
6866 of the call.
ff3f963a
KW
6867
6868 The \N may be inside (indicated by valuep not being NULL) or outside a
6869 character class.
6870
6871 \N may begin either a named sequence, or if outside a character class, mean
6872 to match a non-newline. For non single-quoted regexes, the tokenizer has
6873 attempted to decide which, and in the case of a named sequence converted it
6874 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6875 where c1... are the characters in the sequence. For single-quoted regexes,
6876 the tokenizer passes the \N sequence through unchanged; this code will not
6877 attempt to determine this nor expand those. The net effect is that if the
6878 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6879 signals that this \N occurrence means to match a non-newline.
6880
6881 Only the \N{U+...} form should occur in a character class, for the same
6882 reason that '.' inside a character class means to just match a period: it
6883 just doesn't make sense.
fc8cd66c
YO
6884
6885 If valuep is non-null then it is assumed that we are parsing inside
6886 of a charclass definition and the first codepoint in the resolved
6887 string is returned via *valuep and the routine will return NULL.
6888 In this mode if a multichar string is returned from the charnames
ff3f963a 6889 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6890 sequence will be examined. If the string returned is zero length
6891 then the value of *valuep is undefined and NON-NULL will
6892 be returned to indicate failure. (This will NOT be a valid pointer
6893 to a regnode.)
6894
ff3f963a
KW
6895 If valuep is null then it is assumed that we are parsing normal text and a
6896 new EXACT node is inserted into the program containing the resolved string,
6897 and a pointer to the new node is returned. But if the string is zero length
6898 a NOTHING node is emitted instead.
afefe6bf 6899
fc8cd66c 6900 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6901 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6902 */
6903STATIC regnode *
afefe6bf 6904S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6905{
c3c41406 6906 char * endbrace; /* '}' following the name */
fc8cd66c 6907 regnode *ret = NULL;
ff3f963a
KW
6908#ifdef DEBUGGING
6909 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6910#endif
c3c41406 6911 char* p;
ff3f963a
KW
6912
6913 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6914
6915 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6916
6917 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6918
6919 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6920 * modifier. The other meaning does not */
6921 p = (RExC_flags & RXf_PMf_EXTENDED)
6922 ? regwhite( pRExC_state, RExC_parse )
6923 : RExC_parse;
7918f24d 6924
ff3f963a 6925 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6926 * [^\n]. The former is assumed when it can't be the latter. */
6927 if (*p != '{' || regcurly(p)) {
6928 RExC_parse = p;
ff3f963a 6929 if (valuep) {
afefe6bf 6930 /* no bare \N in a charclass */
ff3f963a
KW
6931 vFAIL("\\N in a character class must be a named character: \\N{...}");
6932 }
afefe6bf
RGS
6933 nextchar(pRExC_state);
6934 ret = reg_node(pRExC_state, REG_ANY);
6935 *flagp |= HASWIDTH|SIMPLE;
6936 RExC_naughty++;
6937 RExC_parse--;
6938 Set_Node_Length(ret, 1); /* MJD */
6939 return ret;
fc8cd66c 6940 }
a4893424 6941
c3c41406
KW
6942 /* Here, we have decided it should be a named sequence */
6943
6944 /* The test above made sure that the next real character is a '{', but
6945 * under the /x modifier, it could be separated by space (or a comment and
6946 * \n) and this is not allowed (for consistency with \x{...} and the
6947 * tokenizer handling of \N{NAME}). */
6948 if (*RExC_parse != '{') {
6949 vFAIL("Missing braces on \\N{}");
6950 }
6951
ff3f963a 6952 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6953
6954 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6955 || ! (endbrace == RExC_parse /* nothing between the {} */
6956 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6957 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6958 {
6959 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6960 vFAIL("\\N{NAME} must be resolved by the lexer");
6961 }
6962
ff3f963a
KW
6963 if (endbrace == RExC_parse) { /* empty: \N{} */
6964 if (! valuep) {
6965 RExC_parse = endbrace + 1;
6966 return reg_node(pRExC_state,NOTHING);
a4893424 6967 }
fc8cd66c 6968
ff3f963a
KW
6969 if (SIZE_ONLY) {
6970 ckWARNreg(RExC_parse,
6971 "Ignoring zero length \\N{} in character class"
6972 );
6973 RExC_parse = endbrace + 1;
6974 }
6975 *valuep = 0;
6976 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6977 }
ff3f963a 6978
62fed28b 6979 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6980 RExC_parse += 2; /* Skip past the 'U+' */
6981
6982 if (valuep) { /* In a bracketed char class */
6983 /* We only pay attention to the first char of
6984 multichar strings being returned. I kinda wonder
6985 if this makes sense as it does change the behaviour
6986 from earlier versions, OTOH that behaviour was broken
6987 as well. XXX Solution is to recharacterize as
6988 [rest-of-class]|multi1|multi2... */
6989
6990 STRLEN length_of_hex;
6991 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6992 | PERL_SCAN_DISALLOW_PREFIX
6993 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6994
37820adc
KW
6995 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6996 if (endchar < endbrace) {
ff3f963a
KW
6997 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6998 }
ff3f963a
KW
6999
7000 length_of_hex = (STRLEN)(endchar - RExC_parse);
7001 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7002
7003 /* The tokenizer should have guaranteed validity, but it's possible to
7004 * bypass it by using single quoting, so check */
c3c41406
KW
7005 if (length_of_hex == 0
7006 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7007 {
7008 RExC_parse += length_of_hex; /* Includes all the valid */
7009 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7010 ? UTF8SKIP(RExC_parse)
7011 : 1;
7012 /* Guard against malformed utf8 */
7013 if (RExC_parse >= endchar) RExC_parse = endchar;
7014 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7015 }
7016
7017 RExC_parse = endbrace + 1;
7018 if (endchar == endbrace) return NULL;
7019
7020 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7021 }
ff3f963a
KW
7022 else { /* Not a char class */
7023 char *s; /* String to put in generated EXACT node */
fda99bee 7024 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7025 char *endchar; /* Points to '.' or '}' ending cur char in the input
7026 stream */
7027
2c2b7f86
KW
7028 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7029 : (LOC)
7030 ? EXACTFL
7031 : UNI_SEMANTICS
7032 ? EXACTFU
7033 : EXACTF));
ff3f963a
KW
7034 s= STRING(ret);
7035
7036 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7037 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7038 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7039 * limit are dropped. The limit could be relaxed should it become
7040 * desirable by reparsing this as (?:\N{NAME}), so could generate
7041 * multiple EXACT nodes, as is done for just regular input. But this
7042 * is primarily a named character, and not intended to be a huge long
7043 * string, so 255 bytes should be good enough */
7044 while (1) {
c3c41406 7045 STRLEN length_of_hex;
ff3f963a
KW
7046 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7047 | PERL_SCAN_DISALLOW_PREFIX
7048 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7049 UV cp; /* Ord of current character */
7050
7051 /* Code points are separated by dots. If none, there is only one
7052 * code point, and is terminated by the brace */
37820adc 7053 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7054
7055 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7056 length_of_hex = (STRLEN)(endchar - RExC_parse);
7057 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7058 if ( length_of_hex == 0
7059 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7060 {
c3c41406
KW
7061 RExC_parse += length_of_hex; /* Includes all the valid */
7062 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7063 ? UTF8SKIP(RExC_parse)
7064 : 1;
7065 /* Guard against malformed utf8 */
7066 if (RExC_parse >= endchar) RExC_parse = endchar;
7067 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7068 }
7069
7070 if (! FOLD) { /* Not folding, just append to the string */
7071 STRLEN unilen;
7072
7073 /* Quit before adding this character if would exceed limit */
7074 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7075
ff3f963a
KW
7076 unilen = reguni(pRExC_state, cp, s);
7077 if (unilen > 0) {
7078 s += unilen;
7079 len += unilen;
7080 }
7081 } else { /* Folding, output the folded equivalent */
7082 STRLEN foldlen,numlen;
7083 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7084 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7085
7086 /* Quit before exceeding size limit */
7087 if (len + foldlen > U8_MAX) break;
7088
7089 for (foldbuf = tmpbuf;
7090 foldlen;
7091 foldlen -= numlen)
7092 {
7093 cp = utf8_to_uvchr(foldbuf, &numlen);
7094 if (numlen > 0) {
7095 const STRLEN unilen = reguni(pRExC_state, cp, s);
7096 s += unilen;
7097 len += unilen;
7098 /* In EBCDIC the numlen and unilen can differ. */
7099 foldbuf += numlen;
7100 if (numlen >= foldlen)
7101 break;
7102 }
7103 else
7104 break; /* "Can't happen." */
7105 }
7106 }
7107
7108 /* Point to the beginning of the next character in the sequence. */
7109 RExC_parse = endchar + 1;
7110
7111 /* Quit if no more characters */
7112 if (RExC_parse >= endbrace) break;
7113 }
7114
7115
7116 if (SIZE_ONLY) {
7117 if (RExC_parse < endbrace) {
7118 ckWARNreg(RExC_parse - 1,
7119 "Using just the first characters returned by \\N{}");
7120 }
7121
7122 RExC_size += STR_SZ(len);
7123 } else {
7124 STR_LEN(ret) = len;
7125 RExC_emit += STR_SZ(len);
7126 }
7127
7128 RExC_parse = endbrace + 1;
7129
7130 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7131 with malformed in t/re/pat_advanced.t */
7132 RExC_parse --;
7133 Set_Node_Cur_Length(ret); /* MJD */
7134 nextchar(pRExC_state);
7135 }
7136
7137 return ret;
fc8cd66c
YO
7138}
7139
7140
9e08bc66
TS
7141/*
7142 * reg_recode
7143 *
7144 * It returns the code point in utf8 for the value in *encp.
7145 * value: a code value in the source encoding
7146 * encp: a pointer to an Encode object
7147 *
7148 * If the result from Encode is not a single character,
7149 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7150 */
7151STATIC UV
7152S_reg_recode(pTHX_ const char value, SV **encp)
7153{
7154 STRLEN numlen = 1;
59cd0e26 7155 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7156 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7157 const STRLEN newlen = SvCUR(sv);
7158 UV uv = UNICODE_REPLACEMENT;
7159
7918f24d
NC
7160 PERL_ARGS_ASSERT_REG_RECODE;
7161
9e08bc66
TS
7162 if (newlen)
7163 uv = SvUTF8(sv)
7164 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7165 : *(U8*)s;
7166
7167 if (!newlen || numlen != newlen) {
7168 uv = UNICODE_REPLACEMENT;
c86f7df5 7169 *encp = NULL;
9e08bc66
TS
7170 }
7171 return uv;
7172}
7173
fc8cd66c 7174
a687059c
LW
7175/*
7176 - regatom - the lowest level
ee9b8eae
YO
7177
7178 Try to identify anything special at the start of the pattern. If there
7179 is, then handle it as required. This may involve generating a single regop,
7180 such as for an assertion; or it may involve recursing, such as to
7181 handle a () structure.
7182
7183 If the string doesn't start with something special then we gobble up
7184 as much literal text as we can.
7185
7186 Once we have been able to handle whatever type of thing started the
7187 sequence, we return.
7188
7189 Note: we have to be careful with escapes, as they can be both literal
7190 and special, and in the case of \10 and friends can either, depending
486ec47a 7191 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
7192 escape sequences, with the one for handling literal escapes requiring
7193 a dummy entry for all of the special escapes that are actually handled
7194 by the other.
7195*/
7196
76e3520e 7197STATIC regnode *
3dab1dad 7198S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7199{
97aff369 7200 dVAR;
cbbf8932 7201 register regnode *ret = NULL;
a0d0e21e 7202 I32 flags;
45948336 7203 char *parse_start = RExC_parse;
980866de 7204 U8 op;
3dab1dad
YO
7205 GET_RE_DEBUG_FLAGS_DECL;
7206 DEBUG_PARSE("atom");
a0d0e21e
LW
7207 *flagp = WORST; /* Tentatively. */
7208
7918f24d 7209 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7210
a0d0e21e 7211tryagain:
f9a79580 7212 switch ((U8)*RExC_parse) {
a0d0e21e 7213 case '^':
830247a4
IZ
7214 RExC_seen_zerolen++;
7215 nextchar(pRExC_state);
bbe252da 7216 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7217 ret = reg_node(pRExC_state, MBOL);
bbe252da 7218 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7219 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7220 else
830247a4 7221 ret = reg_node(pRExC_state, BOL);
fac92740 7222 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7223 break;
7224 case '$':
830247a4 7225 nextchar(pRExC_state);
b81d288d 7226 if (*RExC_parse)
830247a4 7227 RExC_seen_zerolen++;
bbe252da 7228 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7229 ret = reg_node(pRExC_state, MEOL);
bbe252da 7230 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7231 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7232 else
830247a4 7233 ret = reg_node(pRExC_state, EOL);
fac92740 7234 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7235 break;
7236 case '.':
830247a4 7237 nextchar(pRExC_state);
bbe252da 7238 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7239 ret = reg_node(pRExC_state, SANY);
7240 else
7241 ret = reg_node(pRExC_state, REG_ANY);
7242 *flagp |= HASWIDTH|SIMPLE;
830247a4 7243 RExC_naughty++;
fac92740 7244 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7245 break;
7246 case '[':
b45f050a 7247 {
3dab1dad
YO
7248 char * const oregcomp_parse = ++RExC_parse;
7249 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7250 if (*RExC_parse != ']') {
7251 RExC_parse = oregcomp_parse;
b45f050a
JF
7252 vFAIL("Unmatched [");
7253 }
830247a4 7254 nextchar(pRExC_state);
a0d0e21e 7255 *flagp |= HASWIDTH|SIMPLE;
fac92740 7256 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7257 break;
b45f050a 7258 }
a0d0e21e 7259 case '(':
830247a4 7260 nextchar(pRExC_state);
3dab1dad 7261 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7262 if (ret == NULL) {
bf93d4cc 7263 if (flags & TRYAGAIN) {
830247a4 7264 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7265 /* Make parent create an empty node if needed. */
7266 *flagp |= TRYAGAIN;
7267 return(NULL);
7268 }
a0d0e21e 7269 goto tryagain;
bf93d4cc 7270 }
a0d0e21e
LW
7271 return(NULL);
7272 }
a3b492c3 7273 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7274 break;
7275 case '|':
7276 case ')':
7277 if (flags & TRYAGAIN) {
7278 *flagp |= TRYAGAIN;
7279 return NULL;
7280 }
b45f050a 7281 vFAIL("Internal urp");
a0d0e21e
LW
7282 /* Supposed to be caught earlier. */
7283 break;
85afd4ae 7284 case '{':
830247a4
IZ
7285 if (!regcurly(RExC_parse)) {
7286 RExC_parse++;
85afd4ae
CS
7287 goto defchar;
7288 }
7289 /* FALL THROUGH */
a0d0e21e
LW
7290 case '?':
7291 case '+':
7292 case '*':
830247a4 7293 RExC_parse++;
b45f050a 7294 vFAIL("Quantifier follows nothing");
a0d0e21e 7295 break;
ced7f090
KW
7296 case LATIN_SMALL_LETTER_SHARP_S:
7297 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7298 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7299#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7300#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7301 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7302#endif
a0a388a1 7303 do_foldchar:
56d400ed 7304 if (!LOC && FOLD) {
e64b1bd1 7305 U32 len,cp;
7cf3a6a3 7306 len=0; /* silence a spurious compiler warning */
56d400ed 7307 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7308 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7309 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7310 ret = reganode(pRExC_state, FOLDCHAR, cp);
7311 Set_Node_Length(ret, 1); /* MJD */
7312 nextchar(pRExC_state); /* kill whitespace under /x */
7313 return ret;
7314 }
7315 }
7316 goto outer_default;
a0d0e21e 7317 case '\\':
ee9b8eae
YO
7318 /* Special Escapes
7319
7320 This switch handles escape sequences that resolve to some kind
7321 of special regop and not to literal text. Escape sequnces that
7322 resolve to literal text are handled below in the switch marked
7323 "Literal Escapes".
7324
7325 Every entry in this switch *must* have a corresponding entry
7326 in the literal escape switch. However, the opposite is not
7327 required, as the default for this switch is to jump to the
7328 literal text handling code.
7329 */
a0a388a1 7330 switch ((U8)*++RExC_parse) {
ced7f090
KW
7331 case LATIN_SMALL_LETTER_SHARP_S:
7332 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7333 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 7334 goto do_foldchar;
ee9b8eae 7335 /* Special Escapes */
a0d0e21e 7336 case 'A':
830247a4
IZ
7337 RExC_seen_zerolen++;
7338 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7339 *flagp |= SIMPLE;
ee9b8eae 7340 goto finish_meta_pat;
a0d0e21e 7341 case 'G':
830247a4
IZ
7342 ret = reg_node(pRExC_state, GPOS);
7343 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7344 *flagp |= SIMPLE;
ee9b8eae
YO
7345 goto finish_meta_pat;
7346 case 'K':
7347 RExC_seen_zerolen++;
7348 ret = reg_node(pRExC_state, KEEPS);
7349 *flagp |= SIMPLE;
37923168
RGS
7350 /* XXX:dmq : disabling in-place substitution seems to
7351 * be necessary here to avoid cases of memory corruption, as
7352 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7353 */
7354 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7355 goto finish_meta_pat;
a0d0e21e 7356 case 'Z':
830247a4 7357 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7358 *flagp |= SIMPLE;
a1917ab9 7359 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7360 goto finish_meta_pat;
b85d18e9 7361 case 'z':
830247a4 7362 ret = reg_node(pRExC_state, EOS);
b85d18e9 7363 *flagp |= SIMPLE;
830247a4 7364 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7365 goto finish_meta_pat;
4a2d328f 7366 case 'C':
f33976b4
DB
7367 ret = reg_node(pRExC_state, CANY);
7368 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7369 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7370 goto finish_meta_pat;
a0ed51b3 7371 case 'X':
830247a4 7372 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7373 *flagp |= HASWIDTH;
ee9b8eae 7374 goto finish_meta_pat;
a0d0e21e 7375 case 'w':
980866de
KW
7376 switch (get_regex_charset(RExC_flags)) {
7377 case REGEX_LOCALE_CHARSET:
7378 op = ALNUML;
7379 break;
7380 case REGEX_UNICODE_CHARSET:
7381 op = ALNUMU;
7382 break;
cfaf538b
KW
7383 case REGEX_ASCII_RESTRICTED_CHARSET:
7384 op = ALNUMA;
7385 break;
980866de
KW
7386 case REGEX_DEPENDS_CHARSET:
7387 op = ALNUM;
7388 break;
7389 default:
7390 goto bad_charset;
a12cf05f 7391 }
980866de 7392 ret = reg_node(pRExC_state, op);
a0d0e21e 7393 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7394 goto finish_meta_pat;
a0d0e21e 7395 case 'W':
980866de
KW
7396 switch (get_regex_charset(RExC_flags)) {
7397 case REGEX_LOCALE_CHARSET:
7398 op = NALNUML;
7399 break;
7400 case REGEX_UNICODE_CHARSET:
7401 op = NALNUMU;
7402 break;
cfaf538b
KW
7403 case REGEX_ASCII_RESTRICTED_CHARSET:
7404 op = NALNUMA;
7405 break;
980866de
KW
7406 case REGEX_DEPENDS_CHARSET:
7407 op = NALNUM;
7408 break;
7409 default:
7410 goto bad_charset;
a12cf05f 7411 }
980866de 7412 ret = reg_node(pRExC_state, op);
a0d0e21e 7413 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7414 goto finish_meta_pat;
a0d0e21e 7415 case 'b':
830247a4
IZ
7416 RExC_seen_zerolen++;
7417 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
7418 switch (get_regex_charset(RExC_flags)) {
7419 case REGEX_LOCALE_CHARSET:
7420 op = BOUNDL;
7421 break;
7422 case REGEX_UNICODE_CHARSET:
7423 op = BOUNDU;
7424 break;
cfaf538b
KW
7425 case REGEX_ASCII_RESTRICTED_CHARSET:
7426 op = BOUNDA;
7427 break;
63ac0dad
KW
7428 case REGEX_DEPENDS_CHARSET:
7429 op = BOUND;
7430 break;
7431 default:
7432 goto bad_charset;
a12cf05f 7433 }
63ac0dad 7434 ret = reg_node(pRExC_state, op);
b988e673 7435 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 7436 *flagp |= SIMPLE;
ee9b8eae 7437 goto finish_meta_pat;
a0d0e21e 7438 case 'B':
830247a4
IZ
7439 RExC_seen_zerolen++;
7440 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
7441 switch (get_regex_charset(RExC_flags)) {
7442 case REGEX_LOCALE_CHARSET:
7443 op = NBOUNDL;
7444 break;
7445 case REGEX_UNICODE_CHARSET:
7446 op = NBOUNDU;
7447 break;
cfaf538b
KW
7448 case REGEX_ASCII_RESTRICTED_CHARSET:
7449 op = NBOUNDA;
7450 break;
63ac0dad
KW
7451 case REGEX_DEPENDS_CHARSET:
7452 op = NBOUND;
7453 break;
7454 default:
7455 goto bad_charset;
a12cf05f 7456 }
63ac0dad 7457 ret = reg_node(pRExC_state, op);
b988e673 7458 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 7459 *flagp |= SIMPLE;
ee9b8eae 7460 goto finish_meta_pat;
a0d0e21e 7461 case 's':
980866de
KW
7462 switch (get_regex_charset(RExC_flags)) {
7463 case REGEX_LOCALE_CHARSET:
7464 op = SPACEL;
7465 break;
7466 case REGEX_UNICODE_CHARSET:
7467 op = SPACEU;
7468 break;
cfaf538b
KW
7469 case REGEX_ASCII_RESTRICTED_CHARSET:
7470 op = SPACEA;
7471 break;
980866de
KW
7472 case REGEX_DEPENDS_CHARSET:
7473 op = SPACE;
7474 break;
7475 default:
7476 goto bad_charset;
a12cf05f 7477 }
980866de 7478 ret = reg_node(pRExC_state, op);
a0d0e21e 7479 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7480 goto finish_meta_pat;
a0d0e21e 7481 case 'S':
980866de
KW
7482 switch (get_regex_charset(RExC_flags)) {
7483 case REGEX_LOCALE_CHARSET:
7484 op = NSPACEL;
7485 break;
7486 case REGEX_UNICODE_CHARSET:
7487 op = NSPACEU;
7488 break;
cfaf538b
KW
7489 case REGEX_ASCII_RESTRICTED_CHARSET:
7490 op = NSPACEA;
7491 break;
980866de
KW
7492 case REGEX_DEPENDS_CHARSET:
7493 op = NSPACE;
7494 break;
7495 default:
7496 goto bad_charset;
a12cf05f 7497 }
980866de 7498 ret = reg_node(pRExC_state, op);
a0d0e21e 7499 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7500 goto finish_meta_pat;
a0d0e21e 7501 case 'd':
56ae17b4
KW
7502 switch (get_regex_charset(RExC_flags)) {
7503 case REGEX_LOCALE_CHARSET:
7504 op = DIGITL;
7505 break;
cfaf538b
KW
7506 case REGEX_ASCII_RESTRICTED_CHARSET:
7507 op = DIGITA;
7508 break;
56ae17b4
KW
7509 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7510 case REGEX_UNICODE_CHARSET:
7511 op = DIGIT;
7512 break;
7513 default:
7514 goto bad_charset;
6ab9ea91 7515 }
56ae17b4 7516 ret = reg_node(pRExC_state, op);
a0d0e21e 7517 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7518 goto finish_meta_pat;
a0d0e21e 7519 case 'D':
56ae17b4
KW
7520 switch (get_regex_charset(RExC_flags)) {
7521 case REGEX_LOCALE_CHARSET:
7522 op = NDIGITL;
7523 break;
cfaf538b
KW
7524 case REGEX_ASCII_RESTRICTED_CHARSET:
7525 op = NDIGITA;
7526 break;
56ae17b4
KW
7527 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7528 case REGEX_UNICODE_CHARSET:
7529 op = NDIGIT;
7530 break;
7531 default:
7532 goto bad_charset;
6ab9ea91 7533 }
56ae17b4 7534 ret = reg_node(pRExC_state, op);
a0d0e21e 7535 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7536 goto finish_meta_pat;
e1d1eefb
YO
7537 case 'R':
7538 ret = reg_node(pRExC_state, LNBREAK);
7539 *flagp |= HASWIDTH|SIMPLE;
7540 goto finish_meta_pat;
7541 case 'h':
7542 ret = reg_node(pRExC_state, HORIZWS);
7543 *flagp |= HASWIDTH|SIMPLE;
7544 goto finish_meta_pat;
7545 case 'H':
7546 ret = reg_node(pRExC_state, NHORIZWS);
7547 *flagp |= HASWIDTH|SIMPLE;
7548 goto finish_meta_pat;
ee9b8eae 7549 case 'v':
e1d1eefb
YO
7550 ret = reg_node(pRExC_state, VERTWS);
7551 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7552 goto finish_meta_pat;
7553 case 'V':
e1d1eefb
YO
7554 ret = reg_node(pRExC_state, NVERTWS);
7555 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7556 finish_meta_pat:
830247a4 7557 nextchar(pRExC_state);
fac92740 7558 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7559 break;
a14b48bc
LW
7560 case 'p':
7561 case 'P':
3568d838 7562 {
3dab1dad 7563 char* const oldregxend = RExC_end;
d008bc60 7564#ifdef DEBUGGING
ccb2c380 7565 char* parse_start = RExC_parse - 2;
d008bc60 7566#endif
a14b48bc 7567
830247a4 7568 if (RExC_parse[1] == '{') {
3568d838 7569 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7570 RExC_end = strchr(RExC_parse, '}');
7571 if (!RExC_end) {
3dab1dad 7572 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7573 RExC_parse += 2;
7574 RExC_end = oldregxend;
0da60cf5 7575 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7576 }
830247a4 7577 RExC_end++;
a14b48bc 7578 }
af6f566e 7579 else {
830247a4 7580 RExC_end = RExC_parse + 2;
af6f566e
HS
7581 if (RExC_end > oldregxend)
7582 RExC_end = oldregxend;
7583 }
830247a4 7584 RExC_parse--;
a14b48bc 7585
3dab1dad 7586 ret = regclass(pRExC_state,depth+1);
a14b48bc 7587
830247a4
IZ
7588 RExC_end = oldregxend;
7589 RExC_parse--;
ccb2c380
MP
7590
7591 Set_Node_Offset(ret, parse_start + 2);
7592 Set_Node_Cur_Length(ret);
830247a4 7593 nextchar(pRExC_state);
a14b48bc
LW
7594 *flagp |= HASWIDTH|SIMPLE;
7595 }
7596 break;
fc8cd66c 7597 case 'N':
afefe6bf 7598 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7599 multicharacter. join_exact() will join them up later on.
7600 Also this makes sure that things like /\N{BLAH}+/ and
7601 \N{BLAH} being multi char Just Happen. dmq*/
7602 ++RExC_parse;
afefe6bf 7603 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7604 break;
0a4db386 7605 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7606 parse_named_seq:
81714fb9
YO
7607 {
7608 char ch= RExC_parse[1];
1f1031fe
YO
7609 if (ch != '<' && ch != '\'' && ch != '{') {
7610 RExC_parse++;
7611 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7612 } else {
1f1031fe
YO
7613 /* this pretty much dupes the code for (?P=...) in reg(), if
7614 you change this make sure you change that */
81714fb9 7615 char* name_start = (RExC_parse += 2);
2eccd3b2 7616 U32 num = 0;
0a4db386
YO
7617 SV *sv_dat = reg_scan_name(pRExC_state,
7618 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7619 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7620 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7621 vFAIL2("Sequence %.3s... not terminated",parse_start);
7622
7623 if (!SIZE_ONLY) {
7624 num = add_data( pRExC_state, 1, "S" );
7625 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7626 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7627 }
7628
81714fb9
YO
7629 RExC_sawback = 1;
7630 ret = reganode(pRExC_state,
4444fd9f
KW
7631 ((! FOLD)
7632 ? NREF
cfaf538b 7633 : (AT_LEAST_UNI_SEMANTICS)
4444fd9f
KW
7634 ? NREFFU
7635 : (LOC)
7636 ? NREFFL
7637 : NREFF),
7638 num);
81714fb9 7639 *flagp |= HASWIDTH;
1f1031fe 7640
81714fb9
YO
7641 /* override incorrect value set in reganode MJD */
7642 Set_Node_Offset(ret, parse_start+1);
7643 Set_Node_Cur_Length(ret); /* MJD */
7644 nextchar(pRExC_state);
1f1031fe 7645
81714fb9
YO
7646 }
7647 break;
1f1031fe 7648 }
2bf803e2 7649 case 'g':
a0d0e21e
LW
7650 case '1': case '2': case '3': case '4':
7651 case '5': case '6': case '7': case '8': case '9':
7652 {
c74340f9 7653 I32 num;
2bf803e2
YO
7654 bool isg = *RExC_parse == 'g';
7655 bool isrel = 0;
7656 bool hasbrace = 0;
7657 if (isg) {
c74340f9 7658 RExC_parse++;
2bf803e2
YO
7659 if (*RExC_parse == '{') {
7660 RExC_parse++;
7661 hasbrace = 1;
7662 }
7663 if (*RExC_parse == '-') {
7664 RExC_parse++;
7665 isrel = 1;
7666 }
1f1031fe
YO
7667 if (hasbrace && !isDIGIT(*RExC_parse)) {
7668 if (isrel) RExC_parse--;
7669 RExC_parse -= 2;
7670 goto parse_named_seq;
7671 } }
c74340f9 7672 num = atoi(RExC_parse);
b72d83b2
RGS
7673 if (isg && num == 0)
7674 vFAIL("Reference to invalid group 0");
c74340f9 7675 if (isrel) {
5624f11d 7676 num = RExC_npar - num;
c74340f9
YO
7677 if (num < 1)
7678 vFAIL("Reference to nonexistent or unclosed group");
7679 }
2bf803e2 7680 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7681 goto defchar;
7682 else {
3dab1dad 7683 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7684 while (isDIGIT(*RExC_parse))
7685 RExC_parse++;
1f1031fe
YO
7686 if (parse_start == RExC_parse - 1)
7687 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7688 if (hasbrace) {
7689 if (*RExC_parse != '}')
7690 vFAIL("Unterminated \\g{...} pattern");
7691 RExC_parse++;
7692 }
c74340f9
YO
7693 if (!SIZE_ONLY) {
7694 if (num > (I32)RExC_rx->nparens)
7695 vFAIL("Reference to nonexistent group");
c74340f9 7696 }
830247a4 7697 RExC_sawback = 1;
eb160463 7698 ret = reganode(pRExC_state,
4444fd9f
KW
7699 ((! FOLD)
7700 ? REF
cfaf538b 7701 : (AT_LEAST_UNI_SEMANTICS)
4444fd9f
KW
7702 ? REFFU
7703 : (LOC)
7704 ? REFFL
7705 : REFF),
7706 num);
a0d0e21e 7707 *flagp |= HASWIDTH;
2af232bd 7708
fac92740 7709 /* override incorrect value set in reganode MJD */
2af232bd 7710 Set_Node_Offset(ret, parse_start+1);
fac92740 7711 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7712 RExC_parse--;
7713 nextchar(pRExC_state);
a0d0e21e
LW
7714 }
7715 }
7716 break;
7717 case '\0':
830247a4 7718 if (RExC_parse >= RExC_end)
b45f050a 7719 FAIL("Trailing \\");
a0d0e21e
LW
7720 /* FALL THROUGH */
7721 default:
a0288114 7722 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7723 back into the quick-grab loop below */
45948336 7724 parse_start--;
a0d0e21e
LW
7725 goto defchar;
7726 }
7727 break;
4633a7c4
LW
7728
7729 case '#':
bbe252da 7730 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7731 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7732 goto tryagain;
7733 }
7734 /* FALL THROUGH */
7735
f9a79580
RGS
7736 default:
7737 outer_default:{
ba210ebe 7738 register STRLEN len;
58ae7d3f 7739 register UV ender;
a0d0e21e 7740 register char *p;
3dab1dad 7741 char *s;
80aecb99 7742 STRLEN foldlen;
89ebb4a3 7743 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7744
7745 parse_start = RExC_parse - 1;
a0d0e21e 7746
830247a4 7747 RExC_parse++;
a0d0e21e
LW
7748
7749 defchar:
58ae7d3f 7750 ender = 0;
eb160463 7751 ret = reg_node(pRExC_state,
2c2b7f86
KW
7752 (U8) ((! FOLD) ? EXACT
7753 : (LOC)
7754 ? EXACTFL
cfaf538b 7755 : (AT_LEAST_UNI_SEMANTICS)
2c2b7f86
KW
7756 ? EXACTFU
7757 : EXACTF)
7758 );
cd439c50 7759 s = STRING(ret);
830247a4
IZ
7760 for (len = 0, p = RExC_parse - 1;
7761 len < 127 && p < RExC_end;
a0d0e21e
LW
7762 len++)
7763 {
3dab1dad 7764 char * const oldp = p;
5b5a24f7 7765
bbe252da 7766 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7767 p = regwhite( pRExC_state, p );
f9a79580 7768 switch ((U8)*p) {
ced7f090
KW
7769 case LATIN_SMALL_LETTER_SHARP_S:
7770 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7771 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 7772 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7773 goto normal_default;
a0d0e21e
LW
7774 case '^':
7775 case '$':
7776 case '.':
7777 case '[':
7778 case '(':
7779 case ')':
7780 case '|':
7781 goto loopdone;
7782 case '\\':
ee9b8eae
YO
7783 /* Literal Escapes Switch
7784
7785 This switch is meant to handle escape sequences that
7786 resolve to a literal character.
7787
7788 Every escape sequence that represents something
7789 else, like an assertion or a char class, is handled
7790 in the switch marked 'Special Escapes' above in this
7791 routine, but also has an entry here as anything that
7792 isn't explicitly mentioned here will be treated as
7793 an unescaped equivalent literal.
7794 */
7795
a0a388a1 7796 switch ((U8)*++p) {
ee9b8eae 7797 /* These are all the special escapes. */
ced7f090
KW
7798 case LATIN_SMALL_LETTER_SHARP_S:
7799 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7800 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
7801 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7802 goto normal_default;
ee9b8eae
YO
7803 case 'A': /* Start assertion */
7804 case 'b': case 'B': /* Word-boundary assertion*/
7805 case 'C': /* Single char !DANGEROUS! */
7806 case 'd': case 'D': /* digit class */
7807 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7808 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7809 case 'k': case 'K': /* named backref, keep marker */
7810 case 'N': /* named char sequence */
38a44b82 7811 case 'p': case 'P': /* Unicode property */
e1d1eefb 7812 case 'R': /* LNBREAK */
ee9b8eae 7813 case 's': case 'S': /* space class */
e1d1eefb 7814 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7815 case 'w': case 'W': /* word class */
7816 case 'X': /* eXtended Unicode "combining character sequence" */
7817 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7818 --p;
7819 goto loopdone;
ee9b8eae
YO
7820
7821 /* Anything after here is an escape that resolves to a
7822 literal. (Except digits, which may or may not)
7823 */
a0d0e21e
LW
7824 case 'n':
7825 ender = '\n';
7826 p++;
a687059c 7827 break;
a0d0e21e
LW
7828 case 'r':
7829 ender = '\r';
7830 p++;
a687059c 7831 break;
a0d0e21e
LW
7832 case 't':
7833 ender = '\t';
7834 p++;
a687059c 7835 break;
a0d0e21e
LW
7836 case 'f':
7837 ender = '\f';
7838 p++;
a687059c 7839 break;
a0d0e21e 7840 case 'e':
c7f1f016 7841 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7842 p++;
a687059c 7843 break;
a0d0e21e 7844 case 'a':
c7f1f016 7845 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7846 p++;
a687059c 7847 break;
f0a2b745
KW
7848 case 'o':
7849 {
7850 STRLEN brace_len = len;
00c0cb6d 7851 UV result;
454155d9
KW
7852 const char* error_msg;
7853
7854 bool valid = grok_bslash_o(p,
7855 &result,
7856 &brace_len,
7857 &error_msg,
7858 1);
7859 p += brace_len;
7860 if (! valid) {
7861 RExC_parse = p; /* going to die anyway; point
7862 to exact spot of failure */
f0a2b745
KW
7863 vFAIL(error_msg);
7864 }
00c0cb6d
DG
7865 else
7866 {
7867 ender = result;
7868 }
f0a2b745
KW
7869 if (PL_encoding && ender < 0x100) {
7870 goto recode_encoding;
7871 }
7872 if (ender > 0xff) {
62fed28b 7873 REQUIRE_UTF8;
f0a2b745
KW
7874 }
7875 break;
7876 }
a0d0e21e 7877 case 'x':
a0ed51b3 7878 if (*++p == '{') {
1df70142 7879 char* const e = strchr(p, '}');
b81d288d 7880
b45f050a 7881 if (!e) {
830247a4 7882 RExC_parse = p + 1;
b45f050a
JF
7883 vFAIL("Missing right brace on \\x{}");
7884 }
de5f0749 7885 else {
a4c04bdc
NC
7886 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7887 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7888 STRLEN numlen = e - p - 1;
53305cf1 7889 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7890 if (ender > 0xff)
62fed28b 7891 REQUIRE_UTF8;
a0ed51b3
LW
7892 p = e + 1;
7893 }
a0ed51b3
LW
7894 }
7895 else {
a4c04bdc 7896 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7897 STRLEN numlen = 2;
53305cf1 7898 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7899 p += numlen;
7900 }
9e08bc66
TS
7901 if (PL_encoding && ender < 0x100)
7902 goto recode_encoding;
a687059c 7903 break;
a0d0e21e
LW
7904 case 'c':
7905 p++;
f9d13529 7906 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7907 break;
a0d0e21e
LW
7908 case '0': case '1': case '2': case '3':case '4':
7909 case '5': case '6': case '7': case '8':case '9':
7910 if (*p == '0' ||
ca67da41 7911 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7912 {
7913 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7914 STRLEN numlen = 3;
53305cf1 7915 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7916 if (ender > 0xff) {
62fed28b 7917 REQUIRE_UTF8;
609122bd 7918 }
a0d0e21e
LW
7919 p += numlen;
7920 }
7921 else {
7922 --p;
7923 goto loopdone;
a687059c 7924 }
9e08bc66
TS
7925 if (PL_encoding && ender < 0x100)
7926 goto recode_encoding;
7927 break;
7928 recode_encoding:
7929 {
7930 SV* enc = PL_encoding;
7931 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7932 if (!enc && SIZE_ONLY)
7933 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7934 REQUIRE_UTF8;
9e08bc66 7935 }
a687059c 7936 break;
a0d0e21e 7937 case '\0':
830247a4 7938 if (p >= RExC_end)
b45f050a 7939 FAIL("Trailing \\");
a687059c 7940 /* FALL THROUGH */
a0d0e21e 7941 default:
668c081a
NC
7942 if (!SIZE_ONLY&& isALPHA(*p))
7943 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7944 goto normal_default;
a0d0e21e
LW
7945 }
7946 break;
a687059c 7947 default:
a0ed51b3 7948 normal_default:
fd400ab9 7949 if (UTF8_IS_START(*p) && UTF) {
1df70142 7950 STRLEN numlen;
5e12f4fb 7951 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7952 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7953 p += numlen;
7954 }
7955 else
7956 ender = *p++;
a0d0e21e 7957 break;
a687059c 7958 }
bcdf7404
YO
7959 if ( RExC_flags & RXf_PMf_EXTENDED)
7960 p = regwhite( pRExC_state, p );
60a8b682
JH
7961 if (UTF && FOLD) {
7962 /* Prime the casefolded buffer. */
ac7e0132 7963 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7964 }
bcdf7404 7965 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7966 if (len)
7967 p = oldp;
16ea2a2e 7968 else if (UTF) {
80aecb99 7969 if (FOLD) {
60a8b682 7970 /* Emit all the Unicode characters. */
1df70142 7971 STRLEN numlen;
80aecb99
JH
7972 for (foldbuf = tmpbuf;
7973 foldlen;
7974 foldlen -= numlen) {
7975 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7976 if (numlen > 0) {
71207a34 7977 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7978 s += unilen;
7979 len += unilen;
7980 /* In EBCDIC the numlen
7981 * and unilen can differ. */
9dc45d57 7982 foldbuf += numlen;
47654450
JH
7983 if (numlen >= foldlen)
7984 break;
9dc45d57
JH
7985 }
7986 else
7987 break; /* "Can't happen." */
80aecb99
JH
7988 }
7989 }
7990 else {
71207a34 7991 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7992 if (unilen > 0) {
0ebc6274
JH
7993 s += unilen;
7994 len += unilen;
9dc45d57 7995 }
80aecb99 7996 }
a0ed51b3 7997 }
a0d0e21e
LW
7998 else {
7999 len++;
eb160463 8000 REGC((char)ender, s++);
a0d0e21e
LW
8001 }
8002 break;
a687059c 8003 }
16ea2a2e 8004 if (UTF) {
80aecb99 8005 if (FOLD) {
60a8b682 8006 /* Emit all the Unicode characters. */
1df70142 8007 STRLEN numlen;
80aecb99
JH
8008 for (foldbuf = tmpbuf;
8009 foldlen;
8010 foldlen -= numlen) {
8011 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8012 if (numlen > 0) {
71207a34 8013 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8014 len += unilen;
8015 s += unilen;
8016 /* In EBCDIC the numlen
8017 * and unilen can differ. */
9dc45d57 8018 foldbuf += numlen;
47654450
JH
8019 if (numlen >= foldlen)
8020 break;
9dc45d57
JH
8021 }
8022 else
8023 break;
80aecb99
JH
8024 }
8025 }
8026 else {
71207a34 8027 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8028 if (unilen > 0) {
0ebc6274
JH
8029 s += unilen;
8030 len += unilen;
9dc45d57 8031 }
80aecb99
JH
8032 }
8033 len--;
a0ed51b3
LW
8034 }
8035 else
eb160463 8036 REGC((char)ender, s++);
a0d0e21e
LW
8037 }
8038 loopdone:
830247a4 8039 RExC_parse = p - 1;
fac92740 8040 Set_Node_Cur_Length(ret); /* MJD */
830247a4 8041 nextchar(pRExC_state);
793db0cb
JH
8042 {
8043 /* len is STRLEN which is unsigned, need to copy to signed */
8044 IV iv = len;
8045 if (iv < 0)
8046 vFAIL("Internal disaster");
8047 }
a0d0e21e
LW
8048 if (len > 0)
8049 *flagp |= HASWIDTH;
090f7165 8050 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 8051 *flagp |= SIMPLE;
3dab1dad 8052
cd439c50 8053 if (SIZE_ONLY)
830247a4 8054 RExC_size += STR_SZ(len);
3dab1dad
YO
8055 else {
8056 STR_LEN(ret) = len;
830247a4 8057 RExC_emit += STR_SZ(len);
07be1b83 8058 }
3dab1dad 8059 }
a0d0e21e
LW
8060 break;
8061 }
a687059c 8062
a0d0e21e 8063 return(ret);
980866de
KW
8064
8065/* Jumped to when an unrecognized character set is encountered */
8066bad_charset:
8067 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8068 return(NULL);
a687059c
LW
8069}
8070
873ef191 8071STATIC char *
bcdf7404 8072S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 8073{
bcdf7404 8074 const char *e = RExC_end;
7918f24d
NC
8075
8076 PERL_ARGS_ASSERT_REGWHITE;
8077
5b5a24f7
CS
8078 while (p < e) {
8079 if (isSPACE(*p))
8080 ++p;
8081 else if (*p == '#') {
bcdf7404 8082 bool ended = 0;
5b5a24f7 8083 do {
bcdf7404
YO
8084 if (*p++ == '\n') {
8085 ended = 1;
8086 break;
8087 }
8088 } while (p < e);
8089 if (!ended)
8090 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8091 }
8092 else
8093 break;
8094 }
8095 return p;
8096}
8097
b8c5462f
JH
8098/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8099 Character classes ([:foo:]) can also be negated ([:^foo:]).
8100 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8101 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8102 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8103
8104#define POSIXCC_DONE(c) ((c) == ':')
8105#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8106#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8107
b8c5462f 8108STATIC I32
830247a4 8109S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8110{
97aff369 8111 dVAR;
936ed897 8112 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8113
7918f24d
NC
8114 PERL_ARGS_ASSERT_REGPPOSIXCC;
8115
830247a4 8116 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8117 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8118 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8119 const char c = UCHARAT(RExC_parse);
097eb12c 8120 char* const s = RExC_parse++;
b81d288d 8121
9a86a77b 8122 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8123 RExC_parse++;
8124 if (RExC_parse == RExC_end)
620e46c5 8125 /* Grandfather lone [:, [=, [. */
830247a4 8126 RExC_parse = s;
620e46c5 8127 else {
3dab1dad 8128 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
8129 assert(*t == c);
8130
9a86a77b 8131 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 8132 const char *posixcc = s + 1;
830247a4 8133 RExC_parse++; /* skip over the ending ] */
3dab1dad 8134
b8c5462f 8135 if (*s == ':') {
1df70142
AL
8136 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8137 const I32 skip = t - posixcc;
80916619
NC
8138
8139 /* Initially switch on the length of the name. */
8140 switch (skip) {
8141 case 4:
3dab1dad
YO
8142 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8143 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 8144 break;
80916619
NC
8145 case 5:
8146 /* Names all of length 5. */
8147 /* alnum alpha ascii blank cntrl digit graph lower
8148 print punct space upper */
8149 /* Offset 4 gives the best switch position. */
8150 switch (posixcc[4]) {
8151 case 'a':
3dab1dad
YO
8152 if (memEQ(posixcc, "alph", 4)) /* alpha */
8153 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
8154 break;
8155 case 'e':
3dab1dad
YO
8156 if (memEQ(posixcc, "spac", 4)) /* space */
8157 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
8158 break;
8159 case 'h':
3dab1dad
YO
8160 if (memEQ(posixcc, "grap", 4)) /* graph */
8161 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
8162 break;
8163 case 'i':
3dab1dad
YO
8164 if (memEQ(posixcc, "asci", 4)) /* ascii */
8165 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
8166 break;
8167 case 'k':
3dab1dad
YO
8168 if (memEQ(posixcc, "blan", 4)) /* blank */
8169 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
8170 break;
8171 case 'l':
3dab1dad
YO
8172 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8173 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
8174 break;
8175 case 'm':
3dab1dad
YO
8176 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8177 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8178 break;
8179 case 'r':
3dab1dad
YO
8180 if (memEQ(posixcc, "lowe", 4)) /* lower */
8181 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8182 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8183 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8184 break;
8185 case 't':
3dab1dad
YO
8186 if (memEQ(posixcc, "digi", 4)) /* digit */
8187 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8188 else if (memEQ(posixcc, "prin", 4)) /* print */
8189 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8190 else if (memEQ(posixcc, "punc", 4)) /* punct */
8191 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8192 break;
b8c5462f
JH
8193 }
8194 break;
80916619 8195 case 6:
3dab1dad
YO
8196 if (memEQ(posixcc, "xdigit", 6))
8197 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8198 break;
8199 }
80916619
NC
8200
8201 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8202 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8203 t - s - 1, s + 1);
80916619
NC
8204 assert (posixcc[skip] == ':');
8205 assert (posixcc[skip+1] == ']');
b45f050a 8206 } else if (!SIZE_ONLY) {
b8c5462f 8207 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8208
830247a4 8209 /* adjust RExC_parse so the warning shows after
b45f050a 8210 the class closes */
9a86a77b 8211 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8212 RExC_parse++;
b45f050a
JF
8213 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8214 }
b8c5462f
JH
8215 } else {
8216 /* Maternal grandfather:
8217 * "[:" ending in ":" but not in ":]" */
830247a4 8218 RExC_parse = s;
767d463e 8219 }
620e46c5
JH
8220 }
8221 }
8222
b8c5462f
JH
8223 return namedclass;
8224}
8225
8226STATIC void
830247a4 8227S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8228{
97aff369 8229 dVAR;
7918f24d
NC
8230
8231 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8232
3dab1dad 8233 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8234 const char *s = RExC_parse;
8235 const char c = *s++;
b8c5462f 8236
3dab1dad 8237 while (isALNUM(*s))
b8c5462f
JH
8238 s++;
8239 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8240 ckWARN3reg(s+2,
8241 "POSIX syntax [%c %c] belongs inside character classes",
8242 c, c);
b45f050a
JF
8243
8244 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8245 if (POSIXCC_NOTYET(c)) {
830247a4 8246 /* adjust RExC_parse so the error shows after
b45f050a 8247 the class closes */
9a86a77b 8248 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8249 NOOP;
b45f050a
JF
8250 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8251 }
b8c5462f
JH
8252 }
8253 }
620e46c5
JH
8254}
8255
003331de
KW
8256/* No locale test, and always Unicode semantics */
8257#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8258ANYOF_##NAME: \
8259 for (value = 0; value < 256; value++) \
8260 if (TEST) \
8261 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8262 yesno = '+'; \
8263 what = WORD; \
8264 break; \
8265case ANYOF_N##NAME: \
8266 for (value = 0; value < 256; value++) \
8267 if (!TEST) \
8268 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8269 yesno = '!'; \
8270 what = WORD; \
e1d1eefb 8271 break
89836f1f 8272
a12cf05f
KW
8273/* Like the above, but there are differences if we are in uni-8-bit or not, so
8274 * there are two tests passed in, to use depending on that. There aren't any
8275 * cases where the label is different from the name, so no need for that
8276 * parameter */
003331de
KW
8277#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
8278ANYOF_##NAME: \
8279 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8280 else if (UNI_SEMANTICS) { \
8281 for (value = 0; value < 256; value++) { \
8282 if (TEST_8) stored += \
8283 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8284 } \
8285 } \
8286 else { \
8287 for (value = 0; value < 128; value++) { \
8288 if (TEST_7) stored += \
8289 S_set_regclass_bit(aTHX_ pRExC_state, ret, \
8290 (U8) UNI_TO_NATIVE(value)); \
8291 } \
8292 } \
8293 yesno = '+'; \
8294 what = WORD; \
8295 break; \
8296case ANYOF_N##NAME: \
8297 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8298 else if (UNI_SEMANTICS) { \
8299 for (value = 0; value < 256; value++) { \
8300 if (! TEST_8) stored += \
8301 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8302 } \
8303 } \
8304 else { \
8305 for (value = 0; value < 128; value++) { \
8306 if (! TEST_7) stored += \
8307 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8308 } \
cfaf538b
KW
8309 if (ASCII_RESTRICTED) { \
8310 for (value = 128; value < 256; value++) { \
8311 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8312 } \
8313 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
8314 } \
8315 else { \
8316 /* For a non-ut8 target string with DEPENDS semantics, all above \
8317 * ASCII Latin1 code points match the complement of any of the \
8318 * classes. But in utf8, they have their Unicode semantics, so \
8319 * can't just set them in the bitmap, or else regexec.c will think \
8320 * they matched when they shouldn't. */ \
8321 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
8322 } \
003331de
KW
8323 } \
8324 yesno = '!'; \
8325 what = WORD; \
a12cf05f
KW
8326 break
8327
da7fcca4
YO
8328/*
8329 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8330 so that it is possible to override the option here without having to
8331 rebuild the entire core. as we are required to do if we change regcomp.h
8332 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8333*/
8334#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8335#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8336#endif
8337
8338#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8339#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8340#else
8341#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8342#endif
8343
2283d326
KW
8344STATIC U8
8345S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8346{
8347
8348 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8349 * Locale folding is done at run-time, so this function should not be
8350 * called for nodes that are for locales.
8351 *
8352 * This function simply sets the bit corresponding to the fold of the input
8353 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8354 * 'F' is 'f'.
8355 *
8356 * It also sets any necessary flags, and returns the number of bits that
8357 * actually changed from 0 to 1 */
8358
8359 U8 stored = 0;
2283d326
KW
8360 U8 fold;
8361
cfaf538b 8362 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2283d326
KW
8363 : PL_fold[value];
8364
8365 /* It assumes the bit for 'value' has already been set */
8366 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8367 ANYOF_BITMAP_SET(node, fold);
8368 stored++;
8369 }
f56b6394
KW
8370 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8371 || (! UNI_SEMANTICS
8372 && ! isASCII(value)
8373 && PL_fold_latin1[value] != value))
2283d326
KW
8374 { /* A character that has a fold outside of Latin1 matches outside the
8375 bitmap, but only when the target string is utf8. Similarly when we
8376 don't have unicode semantics for the above ASCII Latin-1 characters,
8377 and they have a fold, they should match if the target is utf8, and
8378 not otherwise */
8379 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8380 }
8381
8382 return stored;
8383}
8384
8385
8386PERL_STATIC_INLINE U8
41b0f1c1 8387S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
2283d326
KW
8388{
8389 /* This inline function sets a bit in the bitmap if not already set, and if
8390 * appropriate, its fold, returning the number of bits that actually
8391 * changed from 0 to 1 */
8392
8393 U8 stored;
8394
8395 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8396 return 0;
8397 }
8398
8399 ANYOF_BITMAP_SET(node, value);
8400 stored = 1;
8401
8402 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8403 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8404 }
8405
8406 return stored;
8407}
8408
7f6f358c
YO
8409/*
8410 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8411 matches the pattern or if the pattern matches a single char only and
8412 that char is < 256 and we are case insensitive then we produce an
8413 EXACT node instead.
7f6f358c 8414*/
89836f1f 8415
76e3520e 8416STATIC regnode *
3dab1dad 8417S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8418{
97aff369 8419 dVAR;
9a86a77b 8420 register UV nextvalue;
3568d838 8421 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8422 register IV range = 0;
e1d1eefb 8423 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8424 register regnode *ret;
ba210ebe 8425 STRLEN numlen;
ffc61ed2 8426 IV namedclass;
cbbf8932 8427 char *rangebegin = NULL;
936ed897 8428 bool need_class = 0;
c445ea15 8429 SV *listsv = NULL;
ffc61ed2 8430 UV n;
cbbf8932 8431 AV* unicode_alternate = NULL;
1b2d223b
JH
8432#ifdef EBCDIC
8433 UV literal_endpoint = 0;
8434#endif
ffc130aa 8435 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 8436
3dab1dad 8437 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8438 case we need to change the emitted regop to an EXACT. */
07be1b83 8439 const char * orig_parse = RExC_parse;
72f13be8 8440 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8441
8442 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8443#ifndef DEBUGGING
8444 PERL_UNUSED_ARG(depth);
8445#endif
72f13be8 8446
3dab1dad 8447 DEBUG_PARSE("clas");
7f6f358c
YO
8448
8449 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8450 ret = reganode(pRExC_state, ANYOF, 0);
8451
8452 if (!SIZE_ONLY)
8453 ANYOF_FLAGS(ret) = 0;
8454
9a86a77b 8455 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8456 RExC_naughty++;
8457 RExC_parse++;
8458 if (!SIZE_ONLY)
8459 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8460 }
a0d0e21e 8461
73060fc4 8462 if (SIZE_ONLY) {
830247a4 8463 RExC_size += ANYOF_SKIP;
3a15e693
KW
8464#ifdef ANYOF_ADD_LOC_SKIP
8465 if (LOC) {
8466 RExC_size += ANYOF_ADD_LOC_SKIP;
8467 }
8468#endif
73060fc4
JH
8469 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8470 }
936ed897 8471 else {
830247a4 8472 RExC_emit += ANYOF_SKIP;
3a15e693 8473 if (LOC) {
936ed897 8474 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693
KW
8475#ifdef ANYOF_ADD_LOC_SKIP
8476 RExC_emit += ANYOF_ADD_LOC_SKIP;
8477#endif
8478 }
ffc61ed2 8479 ANYOF_BITMAP_ZERO(ret);
396482e1 8480 listsv = newSVpvs("# comment\n");
a0d0e21e 8481 }
b8c5462f 8482
9a86a77b
JH
8483 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8484
b938889d 8485 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8486 checkposixcc(pRExC_state);
b8c5462f 8487
f064b6ad
HS
8488 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8489 if (UCHARAT(RExC_parse) == ']')
8490 goto charclassloop;
ffc61ed2 8491
fc8cd66c 8492parseit:
9a86a77b 8493 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8494
8495 charclassloop:
8496
8497 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8498
73b437c8 8499 if (!range)
830247a4 8500 rangebegin = RExC_parse;
ffc61ed2 8501 if (UTF) {
5e12f4fb 8502 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8503 RExC_end - RExC_parse,
9f7f3913 8504 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8505 RExC_parse += numlen;
8506 }
8507 else
8508 value = UCHARAT(RExC_parse++);
7f6f358c 8509
9a86a77b
JH
8510 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8511 if (value == '[' && POSIXCC(nextvalue))
830247a4 8512 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8513 else if (value == '\\') {
ffc61ed2 8514 if (UTF) {
5e12f4fb 8515 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8516 RExC_end - RExC_parse,
9f7f3913 8517 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8518 RExC_parse += numlen;
8519 }
8520 else
8521 value = UCHARAT(RExC_parse++);
470c3474 8522 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8523 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8524 * be a problem later if we want switch on Unicode.
8525 * A similar issue a little bit later when switching on
8526 * namedclass. --jhi */
ffc61ed2 8527 switch ((I32)value) {
b8c5462f
JH
8528 case 'w': namedclass = ANYOF_ALNUM; break;
8529 case 'W': namedclass = ANYOF_NALNUM; break;
8530 case 's': namedclass = ANYOF_SPACE; break;
8531 case 'S': namedclass = ANYOF_NSPACE; break;
8532 case 'd': namedclass = ANYOF_DIGIT; break;
8533 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8534 case 'v': namedclass = ANYOF_VERTWS; break;
8535 case 'V': namedclass = ANYOF_NVERTWS; break;
8536 case 'h': namedclass = ANYOF_HORIZWS; break;
8537 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8538 case 'N': /* Handle \N{NAME} in class */
8539 {
8540 /* We only pay attention to the first char of
8541 multichar strings being returned. I kinda wonder
8542 if this makes sense as it does change the behaviour
8543 from earlier versions, OTOH that behaviour was broken
8544 as well. */
8545 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8546 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8547 goto parseit;
8548 }
8549 value= v;
8550 }
8551 break;
ffc61ed2
JH
8552 case 'p':
8553 case 'P':
3dab1dad
YO
8554 {
8555 char *e;
af6f566e 8556 if (RExC_parse >= RExC_end)
2a4859cd 8557 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8558 if (*RExC_parse == '{') {
1df70142 8559 const U8 c = (U8)value;
ffc61ed2
JH
8560 e = strchr(RExC_parse++, '}');
8561 if (!e)
0da60cf5 8562 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8563 while (isSPACE(UCHARAT(RExC_parse)))
8564 RExC_parse++;
8565 if (e == RExC_parse)
0da60cf5 8566 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8567 n = e - RExC_parse;
ab13f0c7
JH
8568 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8569 n--;
ffc61ed2
JH
8570 }
8571 else {
8572 e = RExC_parse;
8573 n = 1;
8574 }
8575 if (!SIZE_ONLY) {
ab13f0c7
JH
8576 if (UCHARAT(RExC_parse) == '^') {
8577 RExC_parse++;
8578 n--;
8579 value = value == 'p' ? 'P' : 'p'; /* toggle */
8580 while (isSPACE(UCHARAT(RExC_parse))) {
8581 RExC_parse++;
8582 n--;
8583 }
8584 }
097eb12c
AL
8585 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8586 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8587 }
8588 RExC_parse = e + 1;
08fc12dd
KW
8589
8590 /* The \p could match something in the Latin1 range, hence
8591 * something that isn't utf8 */
3ff7ceb3 8592 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f56b6394
KW
8593 if (FOLD) { /* And one of these could have a multi-char fold */
8594 OP(ret) = ANYOFV;
8595 }
f81125e2 8596 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8597 }
f81125e2 8598 break;
b8c5462f
JH
8599 case 'n': value = '\n'; break;
8600 case 'r': value = '\r'; break;
8601 case 't': value = '\t'; break;
8602 case 'f': value = '\f'; break;
8603 case 'b': value = '\b'; break;
c7f1f016
NIS
8604 case 'e': value = ASCII_TO_NATIVE('\033');break;
8605 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8606 case 'o':
8607 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8608 {
8609 const char* error_msg;
8610 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8611 &value,
8612 &numlen,
454155d9
KW
8613 &error_msg,
8614 SIZE_ONLY);
8615 RExC_parse += numlen;
8616 if (! valid) {
8617 vFAIL(error_msg);
8618 }
f0a2b745 8619 }
f0a2b745
KW
8620 if (PL_encoding && value < 0x100) {
8621 goto recode_encoding;
8622 }
8623 break;
b8c5462f 8624 case 'x':
ffc61ed2 8625 if (*RExC_parse == '{') {
a4c04bdc
NC
8626 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8627 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8628 char * const e = strchr(RExC_parse++, '}');
b81d288d 8629 if (!e)
ffc61ed2 8630 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8631
8632 numlen = e - RExC_parse;
8633 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8634 RExC_parse = e + 1;
8635 }
8636 else {
a4c04bdc 8637 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8638 numlen = 2;
8639 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8640 RExC_parse += numlen;
8641 }
9e08bc66
TS
8642 if (PL_encoding && value < 0x100)
8643 goto recode_encoding;
b8c5462f
JH
8644 break;
8645 case 'c':
f9d13529 8646 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8647 break;
8648 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8649 case '5': case '6': case '7':
9e08bc66 8650 {
c99e91e9
KW
8651 /* Take 1-3 octal digits */
8652 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8653 numlen = 3;
8654 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8655 RExC_parse += numlen;
8656 if (PL_encoding && value < 0x100)
8657 goto recode_encoding;
8658 break;
8659 }
8660 recode_encoding:
8661 {
8662 SV* enc = PL_encoding;
8663 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8664 if (!enc && SIZE_ONLY)
8665 ckWARNreg(RExC_parse,
8666 "Invalid escape in the specified encoding");
9e08bc66
TS
8667 break;
8668 }
1028017a 8669 default:
c99e91e9
KW
8670 /* Allow \_ to not give an error */
8671 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8672 ckWARN2reg(RExC_parse,
8673 "Unrecognized escape \\%c in character class passed through",
8674 (int)value);
c99e91e9 8675 }
1028017a 8676 break;
b8c5462f 8677 }
ffc61ed2 8678 } /* end of \blah */
1b2d223b
JH
8679#ifdef EBCDIC
8680 else
8681 literal_endpoint++;
8682#endif
ffc61ed2
JH
8683
8684 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8685
2c63ecad
KW
8686 /* What matches in a locale is not known until runtime, so need to
8687 * (one time per class) allocate extra space to pass to regexec.
8688 * The space will contain a bit for each named class that is to be
8689 * matched against. This isn't needed for \p{} and pseudo-classes,
8690 * as they are not affected by locale, and hence are dealt with
8691 * separately */
8692 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8693 need_class = 1;
8694 if (SIZE_ONLY) {
3a15e693 8695#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8696 RExC_size += ANYOF_CLASS_ADD_SKIP;
3a15e693 8697#endif
2c63ecad
KW
8698 }
8699 else {
3a15e693 8700#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8701 RExC_emit += ANYOF_CLASS_ADD_SKIP;
3a15e693 8702#endif
2c63ecad
KW
8703 ANYOF_CLASS_ZERO(ret);
8704 }
9051cfd9 8705 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 8706 }
ffc61ed2 8707
d5788240
KW
8708 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
8709 * literal */
ffc61ed2 8710 if (range) {
73b437c8 8711 if (!SIZE_ONLY) {
668c081a
NC
8712 const int w =
8713 RExC_parse >= rangebegin ?
8714 RExC_parse - rangebegin : 0;
8715 ckWARN4reg(RExC_parse,
b45f050a 8716 "False [] range \"%*.*s\"",
097eb12c 8717 w, w, rangebegin);
668c081a 8718
3568d838 8719 if (prevvalue < 256) {
2283d326 8720 stored +=
62e21d5f 8721 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
2283d326
KW
8722 stored +=
8723 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
ffc61ed2
JH
8724 }
8725 else {
ef87b810 8726 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 8727 Perl_sv_catpvf(aTHX_ listsv,
a1f3213b 8728 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8729 }
b8c5462f 8730 }
ffc61ed2
JH
8731
8732 range = 0; /* this was not a true range */
73b437c8 8733 }
ffc61ed2 8734
89836f1f
YO
8735
8736
73b437c8 8737 if (!SIZE_ONLY) {
c49a72a9
NC
8738 const char *what = NULL;
8739 char yesno = 0;
8740
e2962f66
JH
8741 /* Possible truncation here but in some 64-bit environments
8742 * the compiler gets heartburn about switch on 64-bit values.
8743 * A similar issue a little earlier when switching on value.
98f323fa 8744 * --jhi */
e2962f66 8745 switch ((I32)namedclass) {
da7fcca4 8746
0399b215
KW
8747 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8748 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8749 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8750 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8751 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8752 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8753 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8754 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8755 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8756 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8757#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8758 /* \s, \w match all unicode if utf8. */
0399b215
KW
8759 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8760 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8761#else
a12cf05f 8762 /* \s, \w match ascii and locale only */
0399b215
KW
8763 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8764 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8765#endif
0399b215 8766 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8767 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8768 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8769 case ANYOF_ASCII:
8770 if (LOC)
936ed897 8771 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8772 else {
1ba5c669 8773 for (value = 0; value < 128; value++)
2283d326 8774 stored +=
62e21d5f 8775 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
73b437c8 8776 }
c49a72a9 8777 yesno = '+';
ce1c68b2
KW
8778 what = NULL; /* Doesn't match outside ascii, so
8779 don't want to add +utf8:: */
73b437c8
JH
8780 break;
8781 case ANYOF_NASCII:
8782 if (LOC)
936ed897 8783 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8784 else {
1ba5c669 8785 for (value = 128; value < 256; value++)
2283d326 8786 stored +=
62e21d5f 8787 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
73b437c8 8788 }
cfaf538b 8789 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
c49a72a9
NC
8790 yesno = '!';
8791 what = "ASCII";
89836f1f 8792 break;
ffc61ed2
JH
8793 case ANYOF_DIGIT:
8794 if (LOC)
8795 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8796 else {
8797 /* consecutive digits assumed */
8798 for (value = '0'; value <= '9'; value++)
2283d326 8799 stored +=
62e21d5f 8800 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8801 }
c49a72a9 8802 yesno = '+';
da7fcca4 8803 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8804 break;
8805 case ANYOF_NDIGIT:
8806 if (LOC)
8807 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8808 else {
8809 /* consecutive digits assumed */
8810 for (value = 0; value < '0'; value++)
2283d326 8811 stored +=
62e21d5f 8812 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8813 for (value = '9' + 1; value < 256; value++)
2283d326 8814 stored +=
62e21d5f 8815 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8816 }
c49a72a9 8817 yesno = '!';
da7fcca4 8818 what = POSIX_CC_UNI_NAME("Digit");
cfaf538b
KW
8819 if (ASCII_RESTRICTED ) {
8820 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8821 }
89836f1f 8822 break;
f81125e2
JP
8823 case ANYOF_MAX:
8824 /* this is to handle \p and \P */
8825 break;
73b437c8 8826 default:
b45f050a 8827 vFAIL("Invalid [::] class");
73b437c8 8828 break;
b8c5462f 8829 }
cfaf538b 8830 if (what && ! (ASCII_RESTRICTED)) {
c49a72a9
NC
8831 /* Strings such as "+utf8::isWord\n" */
8832 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810
KW
8833 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8834 }
ce1c68b2 8835
73b437c8 8836 continue;
a0d0e21e 8837 }
ffc61ed2
JH
8838 } /* end of namedclass \blah */
8839
a0d0e21e 8840 if (range) {
eb160463 8841 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8842 const int w = RExC_parse - rangebegin;
8843 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8844 range = 0; /* not a valid range */
73b437c8 8845 }
a0d0e21e
LW
8846 }
8847 else {
3568d838 8848 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8849 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8850 RExC_parse[1] != ']') {
8851 RExC_parse++;
ffc61ed2
JH
8852
8853 /* a bad range like \w-, [:word:]- ? */
8854 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8855 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8856 const int w =
afd78fd5
JH
8857 RExC_parse >= rangebegin ?
8858 RExC_parse - rangebegin : 0;
830247a4 8859 vWARN4(RExC_parse,
b45f050a 8860 "False [] range \"%*.*s\"",
097eb12c 8861 w, w, rangebegin);
afd78fd5 8862 }
73b437c8 8863 if (!SIZE_ONLY)
2283d326
KW
8864 stored +=
8865 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
73b437c8 8866 } else
ffc61ed2
JH
8867 range = 1; /* yeah, it's a range! */
8868 continue; /* but do it the next time */
a0d0e21e 8869 }
a687059c 8870 }
ffc61ed2 8871
93733859 8872 /* now is the next time */
ae5c130c 8873 if (!SIZE_ONLY) {
3568d838 8874 if (prevvalue < 256) {
1df70142 8875 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8876 IV i;
3568d838 8877#ifdef EBCDIC
1b2d223b
JH
8878 /* In EBCDIC [\x89-\x91] should include
8879 * the \x8e but [i-j] should not. */
8880 if (literal_endpoint == 2 &&
8881 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8882 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8883 {
3568d838
JH
8884 if (isLOWER(prevvalue)) {
8885 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8886 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 8887 stored +=
62e21d5f 8888 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
2670d666 8889 }
ffc61ed2 8890 } else {
3568d838 8891 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8892 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 8893 stored +=
62e21d5f 8894 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
2670d666 8895 }
ffc61ed2 8896 }
8ada0baa 8897 }
ffc61ed2 8898 else
8ada0baa 8899#endif
07be1b83 8900 for (i = prevvalue; i <= ceilvalue; i++) {
62e21d5f 8901 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
07be1b83 8902 }
3568d838 8903 }
a5961de5 8904 if (value > 255 || UTF) {
1df70142
AL
8905 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8906 const UV natvalue = NATIVE_TO_UNI(value);
ef87b810
KW
8907
8908 /* If the code point requires utf8 to represent, and we are not
8909 * folding, it can't match unless the target is in utf8. Only
8910 * a few code points above 255 fold to below it, so XXX an
8911 * optimization would be to know which ones and set the flag
8912 * appropriately. */
8913 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8914 ? ANYOF_NONBITMAP
8915 : ANYOF_UTF8;
d5788240
KW
8916 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
8917
8918 /* The \t sets the whole range */
ffc61ed2 8919 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7 8920 prevnatvalue, natvalue);
f56b6394
KW
8921
8922 /* Currently, we don't look at every value in the range.
8923 * Therefore we have to assume the worst case: that if
8924 * folding, it will match more than one character */
8925 if (FOLD) {
8926 OP(ret) = ANYOFV;
8927 }
b08decb7
JH
8928 }
8929 else if (prevnatvalue == natvalue) {
8930 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8931 if (FOLD) {
89ebb4a3 8932 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8933 STRLEN foldlen;
1df70142 8934 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8935
e294cc5d
JH
8936#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8937 if (RExC_precomp[0] == ':' &&
8938 RExC_precomp[1] == '[' &&
8939 (f == 0xDF || f == 0x92)) {
8940 f = NATIVE_TO_UNI(f);
8941 }
8942#endif
c840d2a2
JH
8943 /* If folding and foldable and a single
8944 * character, insert also the folded version
8945 * to the charclass. */
9e55ce06 8946 if (f != value) {
e294cc5d
JH
8947#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8948 if ((RExC_precomp[0] == ':' &&
8949 RExC_precomp[1] == '[' &&
8950 (f == 0xA2 &&
8951 (value == 0xFB05 || value == 0xFB06))) ?
8952 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8953 foldlen == (STRLEN)UNISKIP(f) )
8954#else
eb160463 8955 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8956#endif
9e55ce06
JH
8957 Perl_sv_catpvf(aTHX_ listsv,
8958 "%04"UVxf"\n", f);
8959 else {
8960 /* Any multicharacter foldings
8961 * require the following transform:
8962 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8963 * where E folds into "pq" and F folds
8964 * into "rst", all other characters
8965 * fold to single characters. We save
8966 * away these multicharacter foldings,
8967 * to be later saved as part of the
8968 * additional "s" data. */
8969 SV *sv;
8970
8971 if (!unicode_alternate)
8972 unicode_alternate = newAV();
740cce10
NC
8973 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8974 TRUE);
9e55ce06 8975 av_push(unicode_alternate, sv);
f56b6394 8976 OP(ret) = ANYOFV;
9e55ce06
JH
8977 }
8978 }
254ba52a 8979
60a8b682
JH
8980 /* If folding and the value is one of the Greek
8981 * sigmas insert a few more sigmas to make the
8982 * folding rules of the sigmas to work right.
8983 * Note that not all the possible combinations
8984 * are handled here: some of them are handled
9e55ce06
JH
8985 * by the standard folding rules, and some of
8986 * them (literal or EXACTF cases) are handled
8987 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8988 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8989 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8990 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8991 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8992 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8993 }
8994 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8995 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8996 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8997 }
8998 }
ffc61ed2 8999 }
1b2d223b
JH
9000#ifdef EBCDIC
9001 literal_endpoint = 0;
9002#endif
8ada0baa 9003 }
ffc61ed2
JH
9004
9005 range = 0; /* this range (if it was one) is done now */
a0d0e21e 9006 }
ffc61ed2 9007
ffc61ed2 9008
7f6f358c
YO
9009
9010 if (SIZE_ONLY)
9011 return ret;
9012 /****** !SIZE_ONLY AFTER HERE *********/
9013
f56b6394
KW
9014 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
9015 * set the FOLD flag yet, so this this does optimize those. It doesn't
40c78556
KW
9016 * optimize locale. Doing so perhaps could be done as long as there is
9017 * nothing like \w in it; some thought also would have to be given to the
9018 * interaction with above 0x100 chars */
f56b6394 9019 if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
40c78556
KW
9020 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9021 ANYOF_BITMAP(ret)[value] ^= 0xFF;
9022 stored = 256 - stored;
9023
d5788240
KW
9024 /* The inversion means that everything above 255 is matched; and at the
9025 * same time we clear the invert flag */
40c78556
KW
9026 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9027 }
9028
f56b6394
KW
9029 if (FOLD) {
9030 SV *sv;
9031
9032 /* This is the one character in the bitmap that needs special handling
9033 * under non-locale folding, as it folds to two characters 'ss'. This
9034 * happens if it is set and not inverting, or isn't set and are
9035 * inverting */
9036 if (! LOC
9037 && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9038 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9039 {
9040 OP(ret) = ANYOFV; /* Can match more than a single char */
9041
9042 /* Under Unicode semantics), it can do this when the target string
9043 * isn't in utf8 */
9044 if (UNI_SEMANTICS) {
9045 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9046 }
9047
9048 if (!unicode_alternate) {
9049 unicode_alternate = newAV();
9050 }
9051 sv = newSVpvn_utf8("ss", 2, TRUE);
9052 av_push(unicode_alternate, sv);
9053 }
9054
9055 /* Folding in the bitmap is taken care of above, but not for locale
9056 * (for which we have to wait to see what folding is in effect at
9057 * runtime), and for things not in the bitmap. Set run-time fold flag
9058 * for these */
9059 if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9060 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9061 }
9062 }
9063
2786be71
KW
9064 /* A single character class can be "optimized" into an EXACTish node.
9065 * Note that since we don't currently count how many characters there are
9066 * outside the bitmap, we are XXX missing optimization possibilities for
9067 * them. This optimization can't happen unless this is a truly single
9068 * character class, which means that it can't be an inversion into a
9069 * many-character class, and there must be no possibility of there being
9070 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
9071 * \w, etc, so have to make a special test that they aren't present
9072 *
9073 * Similarly A 2-character class of the very special form like [bB] can be
9074 * optimized into an EXACTFish node, but only for non-locales, and for
9075 * characters which only have the two folds; so things like 'fF' and 'Ii'
9076 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9077 * FI'. */
2786be71 9078 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
9079 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9080 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9081 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9082 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9083 /* If the latest code point has a fold whose
9084 * bit is set, it must be the only other one */
2dcac756 9085 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 9086 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
9087 {
9088 /* Note that the information needed to decide to do this optimization
9089 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
9090 * used EXACTish node takes less space than the calculated ANYOF node,
9091 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
9092 * than actually used, so this optimization doesn't gain us any space.
9093 * But an EXACT node is faster than an ANYOF node, and can be combined
9094 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
9095 * gains. The speed of executing an EXACTF is similar to an ANYOF
9096 * node, so the optimization advantage comes from the ability to join
9097 * it to adjacent EXACT nodes */
2786be71 9098
07be1b83 9099 const char * cur_parse= RExC_parse;
6da63e10 9100 U8 op;
07be1b83
YO
9101 RExC_emit = (regnode *)orig_emit;
9102 RExC_parse = (char *)orig_parse;
2786be71 9103
6da63e10
KW
9104 if (stored == 1) {
9105
9106 /* A locale node with one point can be folded; all the other cases
9107 * with folding will have two points, since we calculate them above
9108 */
39065660 9109 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
9110 op = EXACTFL;
9111 }
9112 else {
9113 op = EXACT;
9114 }
9115 } /* else 2 chars in the bit map: the folds of each other */
cfaf538b 9116 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
6da63e10
KW
9117
9118 /* To join adjacent nodes, they must be the exact EXACTish type.
9119 * Try to use the most likely type, by using EXACTFU if the regex
9120 * calls for them, or is required because the character is
9121 * non-ASCII */
9122 op = EXACTFU;
9123 }
9124 else { /* Otherwise, more likely to be EXACTF type */
9125 op = EXACTF;
9126 }
9127
9128 ret = reg_node(pRExC_state, op);
07be1b83 9129 RExC_parse = (char *)cur_parse;
2786be71
KW
9130 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9131 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9132 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9133 STR_LEN(ret)= 2;
9134 RExC_emit += STR_SZ(2);
9135 }
9136 else {
9137 *STRING(ret)= (char)value;
9138 STR_LEN(ret)= 1;
9139 RExC_emit += STR_SZ(1);
9140 }
ef8d46e8 9141 SvREFCNT_dec(listsv);
7f6f358c
YO
9142 return ret;
9143 }
ffc61ed2 9144
7f6f358c 9145 {
097eb12c 9146 AV * const av = newAV();
ffc61ed2 9147 SV *rv;
9e55ce06 9148 /* The 0th element stores the character class description
6a0407ee 9149 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
9150 * to initialize the appropriate swash (which gets stored in
9151 * the 1st element), and also useful for dumping the regnode.
9152 * The 2nd element stores the multicharacter foldings,
6a0407ee 9153 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
9154 av_store(av, 0, listsv);
9155 av_store(av, 1, NULL);
ad64d0ec
NC
9156 av_store(av, 2, MUTABLE_SV(unicode_alternate));
9157 rv = newRV_noinc(MUTABLE_SV(av));
19860706 9158 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 9159 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 9160 ARG_SET(ret, n);
a0ed51b3 9161 }
a0ed51b3
LW
9162 return ret;
9163}
89836f1f
YO
9164#undef _C_C_T_
9165
a0ed51b3 9166
bcdf7404
YO
9167/* reg_skipcomment()
9168
9169 Absorbs an /x style # comments from the input stream.
9170 Returns true if there is more text remaining in the stream.
9171 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9172 terminates the pattern without including a newline.
9173
9174 Note its the callers responsibility to ensure that we are
9175 actually in /x mode
9176
9177*/
9178
9179STATIC bool
9180S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9181{
9182 bool ended = 0;
7918f24d
NC
9183
9184 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9185
bcdf7404
YO
9186 while (RExC_parse < RExC_end)
9187 if (*RExC_parse++ == '\n') {
9188 ended = 1;
9189 break;
9190 }
9191 if (!ended) {
9192 /* we ran off the end of the pattern without ending
9193 the comment, so we have to add an \n when wrapping */
9194 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9195 return 0;
9196 } else
9197 return 1;
9198}
9199
9200/* nextchar()
9201
3b753521 9202 Advances the parse position, and optionally absorbs
bcdf7404
YO
9203 "whitespace" from the inputstream.
9204
9205 Without /x "whitespace" means (?#...) style comments only,
9206 with /x this means (?#...) and # comments and whitespace proper.
9207
9208 Returns the RExC_parse point from BEFORE the scan occurs.
9209
9210 This is the /x friendly way of saying RExC_parse++.
9211*/
9212
76e3520e 9213STATIC char*
830247a4 9214S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 9215{
097eb12c 9216 char* const retval = RExC_parse++;
a0d0e21e 9217
7918f24d
NC
9218 PERL_ARGS_ASSERT_NEXTCHAR;
9219
4633a7c4 9220 for (;;) {
830247a4
IZ
9221 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9222 RExC_parse[2] == '#') {
e994fd66
AE
9223 while (*RExC_parse != ')') {
9224 if (RExC_parse == RExC_end)
9225 FAIL("Sequence (?#... not terminated");
830247a4 9226 RExC_parse++;
e994fd66 9227 }
830247a4 9228 RExC_parse++;
4633a7c4
LW
9229 continue;
9230 }
bbe252da 9231 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
9232 if (isSPACE(*RExC_parse)) {
9233 RExC_parse++;
748a9306
LW
9234 continue;
9235 }
830247a4 9236 else if (*RExC_parse == '#') {
bcdf7404
YO
9237 if ( reg_skipcomment( pRExC_state ) )
9238 continue;
748a9306 9239 }
748a9306 9240 }
4633a7c4 9241 return retval;
a0d0e21e 9242 }
a687059c
LW
9243}
9244
9245/*
c277df42 9246- reg_node - emit a node
a0d0e21e 9247*/
76e3520e 9248STATIC regnode * /* Location. */
830247a4 9249S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 9250{
97aff369 9251 dVAR;
c277df42 9252 register regnode *ptr;
504618e9 9253 regnode * const ret = RExC_emit;
07be1b83 9254 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9255
7918f24d
NC
9256 PERL_ARGS_ASSERT_REG_NODE;
9257
c277df42 9258 if (SIZE_ONLY) {
830247a4
IZ
9259 SIZE_ALIGN(RExC_size);
9260 RExC_size += 1;
a0d0e21e
LW
9261 return(ret);
9262 }
3b57cd43
YO
9263 if (RExC_emit >= RExC_emit_bound)
9264 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9265
c277df42 9266 NODE_ALIGN_FILL(ret);
a0d0e21e 9267 ptr = ret;
c277df42 9268 FILL_ADVANCE_NODE(ptr, op);
7122b237 9269#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9270 if (RExC_offsets) { /* MJD */
07be1b83 9271 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 9272 "reg_node", __LINE__,
13d6edb4 9273 PL_reg_name[op],
07be1b83
YO
9274 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9275 ? "Overwriting end of array!\n" : "OK",
9276 (UV)(RExC_emit - RExC_emit_start),
9277 (UV)(RExC_parse - RExC_start),
9278 (UV)RExC_offsets[0]));
ccb2c380 9279 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 9280 }
7122b237 9281#endif
830247a4 9282 RExC_emit = ptr;
a0d0e21e 9283 return(ret);
a687059c
LW
9284}
9285
9286/*
a0d0e21e
LW
9287- reganode - emit a node with an argument
9288*/
76e3520e 9289STATIC regnode * /* Location. */
830247a4 9290S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 9291{
97aff369 9292 dVAR;
c277df42 9293 register regnode *ptr;
504618e9 9294 regnode * const ret = RExC_emit;
07be1b83 9295 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 9296
7918f24d
NC
9297 PERL_ARGS_ASSERT_REGANODE;
9298
c277df42 9299 if (SIZE_ONLY) {
830247a4
IZ
9300 SIZE_ALIGN(RExC_size);
9301 RExC_size += 2;
6bda09f9
YO
9302 /*
9303 We can't do this:
9304
9305 assert(2==regarglen[op]+1);
9306
9307 Anything larger than this has to allocate the extra amount.
9308 If we changed this to be:
9309
9310 RExC_size += (1 + regarglen[op]);
9311
9312 then it wouldn't matter. Its not clear what side effect
9313 might come from that so its not done so far.
9314 -- dmq
9315 */
a0d0e21e
LW
9316 return(ret);
9317 }
3b57cd43
YO
9318 if (RExC_emit >= RExC_emit_bound)
9319 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9320
c277df42 9321 NODE_ALIGN_FILL(ret);
a0d0e21e 9322 ptr = ret;
c277df42 9323 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 9324#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9325 if (RExC_offsets) { /* MJD */
07be1b83 9326 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9327 "reganode",
ccb2c380 9328 __LINE__,
13d6edb4 9329 PL_reg_name[op],
07be1b83 9330 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 9331 "Overwriting end of array!\n" : "OK",
07be1b83
YO
9332 (UV)(RExC_emit - RExC_emit_start),
9333 (UV)(RExC_parse - RExC_start),
9334 (UV)RExC_offsets[0]));
ccb2c380 9335 Set_Cur_Node_Offset;
fac92740 9336 }
7122b237 9337#endif
830247a4 9338 RExC_emit = ptr;
a0d0e21e 9339 return(ret);
fe14fcc3
LW
9340}
9341
9342/*
cd439c50 9343- reguni - emit (if appropriate) a Unicode character
a0ed51b3 9344*/
71207a34
AL
9345STATIC STRLEN
9346S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 9347{
97aff369 9348 dVAR;
7918f24d
NC
9349
9350 PERL_ARGS_ASSERT_REGUNI;
9351
71207a34 9352 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
9353}
9354
9355/*
a0d0e21e
LW
9356- reginsert - insert an operator in front of already-emitted operand
9357*
9358* Means relocating the operand.
9359*/
76e3520e 9360STATIC void
6bda09f9 9361S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 9362{
97aff369 9363 dVAR;
c277df42
IZ
9364 register regnode *src;
9365 register regnode *dst;
9366 register regnode *place;
504618e9 9367 const int offset = regarglen[(U8)op];
6bda09f9 9368 const int size = NODE_STEP_REGNODE + offset;
07be1b83 9369 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9370
9371 PERL_ARGS_ASSERT_REGINSERT;
def51078 9372 PERL_UNUSED_ARG(depth);
22c35a8c 9373/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 9374 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9375 if (SIZE_ONLY) {
6bda09f9 9376 RExC_size += size;
a0d0e21e
LW
9377 return;
9378 }
a687059c 9379
830247a4 9380 src = RExC_emit;
6bda09f9 9381 RExC_emit += size;
830247a4 9382 dst = RExC_emit;
40d049e4 9383 if (RExC_open_parens) {
6bda09f9 9384 int paren;
3b57cd43 9385 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9386 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9387 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9388 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9389 RExC_open_parens[paren] += size;
9390 } else {
3b57cd43 9391 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9392 }
9393 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9394 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9395 RExC_close_parens[paren] += size;
9396 } else {
3b57cd43 9397 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9398 }
9399 }
6bda09f9 9400 }
40d049e4 9401
fac92740 9402 while (src > opnd) {
c277df42 9403 StructCopy(--src, --dst, regnode);
7122b237 9404#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9405 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9406 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9407 "reg_insert",
ccb2c380 9408 __LINE__,
13d6edb4 9409 PL_reg_name[op],
07be1b83
YO
9410 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9411 ? "Overwriting end of array!\n" : "OK",
9412 (UV)(src - RExC_emit_start),
9413 (UV)(dst - RExC_emit_start),
9414 (UV)RExC_offsets[0]));
ccb2c380
MP
9415 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9416 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9417 }
7122b237 9418#endif
fac92740
MJD
9419 }
9420
a0d0e21e
LW
9421
9422 place = opnd; /* Op node, where operand used to be. */
7122b237 9423#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9424 if (RExC_offsets) { /* MJD */
07be1b83 9425 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9426 "reginsert",
ccb2c380 9427 __LINE__,
13d6edb4 9428 PL_reg_name[op],
07be1b83 9429 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9430 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9431 (UV)(place - RExC_emit_start),
9432 (UV)(RExC_parse - RExC_start),
786e8c11 9433 (UV)RExC_offsets[0]));
ccb2c380 9434 Set_Node_Offset(place, RExC_parse);
45948336 9435 Set_Node_Length(place, 1);
fac92740 9436 }
7122b237 9437#endif
c277df42
IZ
9438 src = NEXTOPER(place);
9439 FILL_ADVANCE_NODE(place, op);
9440 Zero(src, offset, regnode);
a687059c
LW
9441}
9442
9443/*
c277df42 9444- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9445- SEE ALSO: regtail_study
a0d0e21e 9446*/
097eb12c 9447/* TODO: All three parms should be const */
76e3520e 9448STATIC void
3dab1dad 9449S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9450{
97aff369 9451 dVAR;
c277df42 9452 register regnode *scan;
72f13be8 9453 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9454
9455 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9456#ifndef DEBUGGING
9457 PERL_UNUSED_ARG(depth);
9458#endif
a0d0e21e 9459
c277df42 9460 if (SIZE_ONLY)
a0d0e21e
LW
9461 return;
9462
9463 /* Find last node. */
9464 scan = p;
9465 for (;;) {
504618e9 9466 regnode * const temp = regnext(scan);
3dab1dad
YO
9467 DEBUG_PARSE_r({
9468 SV * const mysv=sv_newmortal();
9469 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9470 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9471 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9472 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9473 (temp == NULL ? "->" : ""),
13d6edb4 9474 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9475 );
3dab1dad
YO
9476 });
9477 if (temp == NULL)
9478 break;
9479 scan = temp;
9480 }
9481
9482 if (reg_off_by_arg[OP(scan)]) {
9483 ARG_SET(scan, val - scan);
9484 }
9485 else {
9486 NEXT_OFF(scan) = val - scan;
9487 }
9488}
9489
07be1b83 9490#ifdef DEBUGGING
3dab1dad
YO
9491/*
9492- regtail_study - set the next-pointer at the end of a node chain of p to val.
9493- Look for optimizable sequences at the same time.
9494- currently only looks for EXACT chains.
07be1b83 9495
486ec47a 9496This is experimental code. The idea is to use this routine to perform
07be1b83
YO
9497in place optimizations on branches and groups as they are constructed,
9498with the long term intention of removing optimization from study_chunk so
9499that it is purely analytical.
9500
9501Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9502to control which is which.
9503
3dab1dad
YO
9504*/
9505/* TODO: All four parms should be const */
07be1b83 9506
3dab1dad
YO
9507STATIC U8
9508S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9509{
9510 dVAR;
9511 register regnode *scan;
07be1b83
YO
9512 U8 exact = PSEUDO;
9513#ifdef EXPERIMENTAL_INPLACESCAN
9514 I32 min = 0;
9515#endif
3dab1dad
YO
9516 GET_RE_DEBUG_FLAGS_DECL;
9517
7918f24d
NC
9518 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9519
07be1b83 9520
3dab1dad
YO
9521 if (SIZE_ONLY)
9522 return exact;
9523
9524 /* Find last node. */
9525
9526 scan = p;
9527 for (;;) {
9528 regnode * const temp = regnext(scan);
07be1b83
YO
9529#ifdef EXPERIMENTAL_INPLACESCAN
9530 if (PL_regkind[OP(scan)] == EXACT)
9531 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9532 return EXACT;
9533#endif
3dab1dad
YO
9534 if ( exact ) {
9535 switch (OP(scan)) {
9536 case EXACT:
9537 case EXACTF:
2c2b7f86 9538 case EXACTFU:
3dab1dad
YO
9539 case EXACTFL:
9540 if( exact == PSEUDO )
9541 exact= OP(scan);
07be1b83
YO
9542 else if ( exact != OP(scan) )
9543 exact= 0;
3dab1dad
YO
9544 case NOTHING:
9545 break;
9546 default:
9547 exact= 0;
9548 }
9549 }
9550 DEBUG_PARSE_r({
9551 SV * const mysv=sv_newmortal();
9552 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9553 regprop(RExC_rx, mysv, scan);
eaf3ca90 9554 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9555 SvPV_nolen_const(mysv),
eaf3ca90 9556 REG_NODE_NUM(scan),
13d6edb4 9557 PL_reg_name[exact]);
3dab1dad 9558 });
a0d0e21e
LW
9559 if (temp == NULL)
9560 break;
9561 scan = temp;
9562 }
07be1b83
YO
9563 DEBUG_PARSE_r({
9564 SV * const mysv_val=sv_newmortal();
9565 DEBUG_PARSE_MSG("");
9566 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9567 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9568 SvPV_nolen_const(mysv_val),
9569 (IV)REG_NODE_NUM(val),
9570 (IV)(val - scan)
07be1b83
YO
9571 );
9572 });
c277df42
IZ
9573 if (reg_off_by_arg[OP(scan)]) {
9574 ARG_SET(scan, val - scan);
a0ed51b3
LW
9575 }
9576 else {
c277df42
IZ
9577 NEXT_OFF(scan) = val - scan;
9578 }
3dab1dad
YO
9579
9580 return exact;
a687059c 9581}
07be1b83 9582#endif
a687059c
LW
9583
9584/*
fd181c75 9585 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9586 */
f7819f85 9587#ifdef DEBUGGING
c33269f7 9588static void
7918f24d
NC
9589S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9590{
f7819f85
A
9591 int bit;
9592 int set=0;
a62b1201 9593 regex_charset cs;
7918f24d 9594
f7819f85
A
9595 for (bit=0; bit<32; bit++) {
9596 if (flags & (1<<bit)) {
a62b1201
KW
9597 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
9598 continue;
9599 }
f7819f85
A
9600 if (!set++ && lead)
9601 PerlIO_printf(Perl_debug_log, "%s",lead);
9602 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9603 }
9604 }
a62b1201
KW
9605 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
9606 if (!set++ && lead) {
9607 PerlIO_printf(Perl_debug_log, "%s",lead);
9608 }
9609 switch (cs) {
9610 case REGEX_UNICODE_CHARSET:
9611 PerlIO_printf(Perl_debug_log, "UNICODE");
9612 break;
9613 case REGEX_LOCALE_CHARSET:
9614 PerlIO_printf(Perl_debug_log, "LOCALE");
9615 break;
cfaf538b
KW
9616 case REGEX_ASCII_RESTRICTED_CHARSET:
9617 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
9618 break;
a62b1201
KW
9619 default:
9620 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
9621 break;
9622 }
9623 }
f7819f85
A
9624 if (lead) {
9625 if (set)
9626 PerlIO_printf(Perl_debug_log, "\n");
9627 else
9628 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9629 }
9630}
9631#endif
9632
a687059c 9633void
097eb12c 9634Perl_regdump(pTHX_ const regexp *r)
a687059c 9635{
35ff7856 9636#ifdef DEBUGGING
97aff369 9637 dVAR;
c445ea15 9638 SV * const sv = sv_newmortal();
ab3bbdeb 9639 SV *dsv= sv_newmortal();
f8fc2ecf 9640 RXi_GET_DECL(r,ri);
f7819f85 9641 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9642
7918f24d
NC
9643 PERL_ARGS_ASSERT_REGDUMP;
9644
f8fc2ecf 9645 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9646
9647 /* Header fields of interest. */
ab3bbdeb
YO
9648 if (r->anchored_substr) {
9649 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9650 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9651 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9652 "anchored %s%s at %"IVdf" ",
9653 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9654 (IV)r->anchored_offset);
ab3bbdeb
YO
9655 } else if (r->anchored_utf8) {
9656 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9657 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9658 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9659 "anchored utf8 %s%s at %"IVdf" ",
9660 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9661 (IV)r->anchored_offset);
ab3bbdeb
YO
9662 }
9663 if (r->float_substr) {
9664 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9665 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9666 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9667 "floating %s%s at %"IVdf"..%"UVuf" ",
9668 s, RE_SV_TAIL(r->float_substr),
7b0972df 9669 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9670 } else if (r->float_utf8) {
9671 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9672 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9673 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9674 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9675 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9676 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9677 }
33b8afdf 9678 if (r->check_substr || r->check_utf8)
b81d288d 9679 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9680 (const char *)
9681 (r->check_substr == r->float_substr
9682 && r->check_utf8 == r->float_utf8
9683 ? "(checking floating" : "(checking anchored"));
bbe252da 9684 if (r->extflags & RXf_NOSCAN)
c277df42 9685 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9686 if (r->extflags & RXf_CHECK_ALL)
c277df42 9687 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9688 if (r->check_substr || r->check_utf8)
c277df42
IZ
9689 PerlIO_printf(Perl_debug_log, ") ");
9690
f8fc2ecf
YO
9691 if (ri->regstclass) {
9692 regprop(r, sv, ri->regstclass);
1de06328 9693 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9694 }
bbe252da 9695 if (r->extflags & RXf_ANCH) {
774d564b 9696 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9697 if (r->extflags & RXf_ANCH_BOL)
774d564b 9698 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9699 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9700 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9701 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9702 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9703 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9704 PerlIO_printf(Perl_debug_log, "(GPOS)");
9705 PerlIO_putc(Perl_debug_log, ' ');
9706 }
bbe252da 9707 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9708 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9709 if (r->intflags & PREGf_SKIP)
760ac839 9710 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9711 if (r->intflags & PREGf_IMPLICIT)
760ac839 9712 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9713 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9714 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9715 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9716 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9717 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9718#else
7918f24d 9719 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9720 PERL_UNUSED_CONTEXT;
65e66c80 9721 PERL_UNUSED_ARG(r);
17c3b450 9722#endif /* DEBUGGING */
a687059c
LW
9723}
9724
9725/*
a0d0e21e
LW
9726- regprop - printable representation of opcode
9727*/
3339dfd8
YO
9728#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9729STMT_START { \
9730 if (do_sep) { \
9731 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9732 if (flags & ANYOF_INVERT) \
9733 /*make sure the invert info is in each */ \
9734 sv_catpvs(sv, "^"); \
9735 do_sep = 0; \
9736 } \
9737} STMT_END
9738
46fc3d4c 9739void
32fc9b6a 9740Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9741{
35ff7856 9742#ifdef DEBUGGING
97aff369 9743 dVAR;
9b155405 9744 register int k;
f8fc2ecf 9745 RXi_GET_DECL(prog,progi);
1de06328 9746 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9747
7918f24d 9748 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9749
76f68e9b 9750 sv_setpvs(sv, "");
8aa23a47 9751
03363afd 9752 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9753 /* It would be nice to FAIL() here, but this may be called from
9754 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9755 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9756 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9757
3dab1dad 9758 k = PL_regkind[OP(o)];
9b155405 9759
2a782b5b 9760 if (k == EXACT) {
f92a2122 9761 sv_catpvs(sv, " ");
ab3bbdeb
YO
9762 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9763 * is a crude hack but it may be the best for now since
9764 * we have no flag "this EXACTish node was UTF-8"
9765 * --jhi */
f92a2122
NC
9766 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9767 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 9768 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
9769 PERL_PV_PRETTY_ELLIPSES |
9770 PERL_PV_PRETTY_LTGT |
9771 PERL_PV_PRETTY_NOCLEAR
9772 );
bb263b4e 9773 } else if (k == TRIE) {
3dab1dad 9774 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9775 * progi->data isn't available here */
1de06328 9776 const char op = OP(o);
647f639f 9777 const U32 n = ARG(o);
1de06328 9778 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9779 (reg_ac_data *)progi->data->data[n] :
1de06328 9780 NULL;
3251b653
NC
9781 const reg_trie_data * const trie
9782 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9783
13d6edb4 9784 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9785 DEBUG_TRIE_COMPILE_r(
9786 Perl_sv_catpvf(aTHX_ sv,
9787 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9788 (UV)trie->startstate,
1e2e3d02 9789 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9790 (UV)trie->wordcount,
9791 (UV)trie->minlen,
9792 (UV)trie->maxlen,
9793 (UV)TRIE_CHARCOUNT(trie),
9794 (UV)trie->uniquecharcount
9795 )
9796 );
9797 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9798 int i;
9799 int rangestart = -1;
f46cb337 9800 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9801 sv_catpvs(sv, "[");
1de06328
YO
9802 for (i = 0; i <= 256; i++) {
9803 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9804 if (rangestart == -1)
9805 rangestart = i;
9806 } else if (rangestart != -1) {
9807 if (i <= rangestart + 3)
9808 for (; rangestart < i; rangestart++)
9809 put_byte(sv, rangestart);
9810 else {
9811 put_byte(sv, rangestart);
9812 sv_catpvs(sv, "-");
9813 put_byte(sv, i - 1);
9814 }
9815 rangestart = -1;
9816 }
9817 }
f3a2811a 9818 sv_catpvs(sv, "]");
1de06328
YO
9819 }
9820
a3621e74 9821 } else if (k == CURLY) {
cb434fcc 9822 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9823 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9824 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9825 }
2c2d71f5
JH
9826 else if (k == WHILEM && o->flags) /* Ordinal/of */
9827 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9828 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9829 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9830 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 9831 if ( k != REF || (OP(o) < NREF)) {
502c6561 9832 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9833 SV **name= av_fetch(list, ARG(o), 0 );
9834 if (name)
9835 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9836 }
9837 else {
502c6561 9838 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9839 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9840 I32 *nums=(I32*)SvPVX(sv_dat);
9841 SV **name= av_fetch(list, nums[0], 0 );
9842 I32 n;
9843 if (name) {
9844 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9845 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9846 (n ? "," : ""), (IV)nums[n]);
9847 }
9848 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9849 }
1f1031fe 9850 }
ee9b8eae 9851 }
1f1031fe 9852 } else if (k == GOSUB)
6bda09f9 9853 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9854 else if (k == VERB) {
9855 if (!o->flags)
9856 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9857 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9858 } else if (k == LOGICAL)
04ebc1ab 9859 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9860 else if (k == FOLDCHAR)
df44d732 9861 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9862 else if (k == ANYOF) {
9863 int i, rangestart = -1;
2d03de9c 9864 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9865 int do_sep = 0;
0bd48802
AL
9866
9867 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9868 static const char * const anyofs[] = {
653099ff
GS
9869 "\\w",
9870 "\\W",
9871 "\\s",
9872 "\\S",
9873 "\\d",
9874 "\\D",
9875 "[:alnum:]",
9876 "[:^alnum:]",
9877 "[:alpha:]",
9878 "[:^alpha:]",
9879 "[:ascii:]",
9880 "[:^ascii:]",
24d786f4
YO
9881 "[:cntrl:]",
9882 "[:^cntrl:]",
653099ff
GS
9883 "[:graph:]",
9884 "[:^graph:]",
9885 "[:lower:]",
9886 "[:^lower:]",
9887 "[:print:]",
9888 "[:^print:]",
9889 "[:punct:]",
9890 "[:^punct:]",
9891 "[:upper:]",
aaa51d5e 9892 "[:^upper:]",
653099ff 9893 "[:xdigit:]",
aaa51d5e
JF
9894 "[:^xdigit:]",
9895 "[:space:]",
9896 "[:^space:]",
9897 "[:blank:]",
9898 "[:^blank:]"
653099ff
GS
9899 };
9900
19860706 9901 if (flags & ANYOF_LOCALE)
396482e1 9902 sv_catpvs(sv, "{loc}");
39065660 9903 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 9904 sv_catpvs(sv, "{i}");
653099ff 9905 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9906 if (flags & ANYOF_INVERT)
396482e1 9907 sv_catpvs(sv, "^");
3339dfd8
YO
9908
9909 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9910 for (i = 0; i <= 256; i++) {
9911 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9912 if (rangestart == -1)
9913 rangestart = i;
9914 } else if (rangestart != -1) {
9915 if (i <= rangestart + 3)
9916 for (; rangestart < i; rangestart++)
653099ff 9917 put_byte(sv, rangestart);
ffc61ed2
JH
9918 else {
9919 put_byte(sv, rangestart);
396482e1 9920 sv_catpvs(sv, "-");
ffc61ed2 9921 put_byte(sv, i - 1);
653099ff 9922 }
24d786f4 9923 do_sep = 1;
ffc61ed2 9924 rangestart = -1;
653099ff 9925 }
847a199f 9926 }
3339dfd8
YO
9927
9928 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
9929 /* output any special charclass tests (used entirely under use locale) */
9930 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 9931 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9932 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9933 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9934 do_sep = 1;
9935 }
9936
3339dfd8
YO
9937 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9938
11454c59
KW
9939 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
9940 sv_catpvs(sv, "{non-utf8-latin1-all}");
9941 }
9942
3339dfd8 9943 /* output information about the unicode matching */
ef87b810 9944 if (flags & ANYOF_UNICODE_ALL)
396482e1 9945 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
9946 else if (flags & ANYOF_UTF8)
9947 sv_catpvs(sv, "{unicode}");
f5ecd18d 9948 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 9949 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
9950
9951 {
9952 SV *lv;
32fc9b6a 9953 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9954
ffc61ed2
JH
9955 if (lv) {
9956 if (sw) {
89ebb4a3 9957 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9958
ffc61ed2 9959 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9960 uvchr_to_utf8(s, i);
ffc61ed2 9961
3568d838 9962 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9963 if (rangestart == -1)
9964 rangestart = i;
9965 } else if (rangestart != -1) {
ffc61ed2
JH
9966 if (i <= rangestart + 3)
9967 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9968 const U8 * const e = uvchr_to_utf8(s,rangestart);
9969 U8 *p;
9970 for(p = s; p < e; p++)
ffc61ed2
JH
9971 put_byte(sv, *p);
9972 }
9973 else {
2d03de9c
AL
9974 const U8 *e = uvchr_to_utf8(s,rangestart);
9975 U8 *p;
9976 for (p = s; p < e; p++)
ffc61ed2 9977 put_byte(sv, *p);
396482e1 9978 sv_catpvs(sv, "-");
2d03de9c
AL
9979 e = uvchr_to_utf8(s, i-1);
9980 for (p = s; p < e; p++)
1df70142 9981 put_byte(sv, *p);
ffc61ed2
JH
9982 }
9983 rangestart = -1;
9984 }
19860706 9985 }
ffc61ed2 9986
396482e1 9987 sv_catpvs(sv, "..."); /* et cetera */
19860706 9988 }
fde631ed 9989
ffc61ed2 9990 {
2e0de35c 9991 char *s = savesvpv(lv);
c445ea15 9992 char * const origs = s;
b81d288d 9993
3dab1dad
YO
9994 while (*s && *s != '\n')
9995 s++;
b81d288d 9996
ffc61ed2 9997 if (*s == '\n') {
2d03de9c 9998 const char * const t = ++s;
ffc61ed2
JH
9999
10000 while (*s) {
10001 if (*s == '\n')
10002 *s = ' ';
10003 s++;
10004 }
10005 if (s[-1] == ' ')
10006 s[-1] = 0;
10007
10008 sv_catpv(sv, t);
fde631ed 10009 }
b81d288d 10010
ffc61ed2 10011 Safefree(origs);
fde631ed
JH
10012 }
10013 }
653099ff 10014 }
ffc61ed2 10015
653099ff
GS
10016 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10017 }
9b155405 10018 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 10019 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 10020#else
96a5add6 10021 PERL_UNUSED_CONTEXT;
65e66c80
SP
10022 PERL_UNUSED_ARG(sv);
10023 PERL_UNUSED_ARG(o);
f9049ba1 10024 PERL_UNUSED_ARG(prog);
17c3b450 10025#endif /* DEBUGGING */
35ff7856 10026}
a687059c 10027
cad2e5aa 10028SV *
288b8c02 10029Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 10030{ /* Assume that RE_INTUIT is set */
97aff369 10031 dVAR;
288b8c02 10032 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 10033 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10034
10035 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
10036 PERL_UNUSED_CONTEXT;
10037
a3621e74 10038 DEBUG_COMPILE_r(
cfd0369c 10039 {
2d03de9c 10040 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 10041 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
10042
10043 if (!PL_colorset) reginitcolors();
10044 PerlIO_printf(Perl_debug_log,
a0288114 10045 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
10046 PL_colors[4],
10047 prog->check_substr ? "" : "utf8 ",
10048 PL_colors[5],PL_colors[0],
cad2e5aa
JH
10049 s,
10050 PL_colors[1],
10051 (strlen(s) > 60 ? "..." : ""));
10052 } );
10053
33b8afdf 10054 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
10055}
10056
84da74a7 10057/*
f8149455 10058 pregfree()
84da74a7 10059
f8149455
YO
10060 handles refcounting and freeing the perl core regexp structure. When
10061 it is necessary to actually free the structure the first thing it
3b753521 10062 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
10063 the regexp, allowing the handling of the void *pprivate; member
10064 first. (This routine is not overridable by extensions, which is why
10065 the extensions free is called first.)
10066
10067 See regdupe and regdupe_internal if you change anything here.
84da74a7 10068*/
f8149455 10069#ifndef PERL_IN_XSUB_RE
2b69d0c2 10070void
84679df5 10071Perl_pregfree(pTHX_ REGEXP *r)
a687059c 10072{
288b8c02
NC
10073 SvREFCNT_dec(r);
10074}
10075
10076void
10077Perl_pregfree2(pTHX_ REGEXP *rx)
10078{
27da23d5 10079 dVAR;
288b8c02 10080 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 10081 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 10082
7918f24d
NC
10083 PERL_ARGS_ASSERT_PREGFREE2;
10084
28d8d7f4
YO
10085 if (r->mother_re) {
10086 ReREFCNT_dec(r->mother_re);
10087 } else {
288b8c02 10088 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 10089 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
10090 }
10091 if (r->substrs) {
ef8d46e8
VP
10092 SvREFCNT_dec(r->anchored_substr);
10093 SvREFCNT_dec(r->anchored_utf8);
10094 SvREFCNT_dec(r->float_substr);
10095 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
10096 Safefree(r->substrs);
10097 }
288b8c02 10098 RX_MATCH_COPY_FREE(rx);
f8c7b90f 10099#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 10100 SvREFCNT_dec(r->saved_copy);
ed252734 10101#endif
f0ab9afb 10102 Safefree(r->offs);
f8149455 10103}
28d8d7f4
YO
10104
10105/* reg_temp_copy()
10106
10107 This is a hacky workaround to the structural issue of match results
10108 being stored in the regexp structure which is in turn stored in
10109 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10110 could be PL_curpm in multiple contexts, and could require multiple
10111 result sets being associated with the pattern simultaneously, such
10112 as when doing a recursive match with (??{$qr})
10113
10114 The solution is to make a lightweight copy of the regexp structure
10115 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 10116 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
10117 the starp/end and the actual regexp structure itself.
10118
10119*/
10120
10121
84679df5 10122REGEXP *
f0826785 10123Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 10124{
f0826785 10125 struct regexp *ret;
288b8c02 10126 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 10127 register const I32 npar = r->nparens+1;
7918f24d
NC
10128
10129 PERL_ARGS_ASSERT_REG_TEMP_COPY;
10130
f0826785
BM
10131 if (!ret_x)
10132 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10133 ret = (struct regexp *)SvANY(ret_x);
10134
288b8c02 10135 (void)ReREFCNT_inc(rx);
f7c278bf
NC
10136 /* We can take advantage of the existing "copied buffer" mechanism in SVs
10137 by pointing directly at the buffer, but flagging that the allocated
10138 space in the copy is zero. As we've just done a struct copy, it's now
10139 a case of zero-ing that, rather than copying the current length. */
10140 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 10141 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
10142 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10143 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 10144 SvLEN_set(ret_x, 0);
b9ad13ac 10145 SvSTASH_set(ret_x, NULL);
703c388d 10146 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
10147 Newx(ret->offs, npar, regexp_paren_pair);
10148 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 10149 if (r->substrs) {
28d8d7f4 10150 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
10151 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10152
10153 SvREFCNT_inc_void(ret->anchored_substr);
10154 SvREFCNT_inc_void(ret->anchored_utf8);
10155 SvREFCNT_inc_void(ret->float_substr);
10156 SvREFCNT_inc_void(ret->float_utf8);
10157
10158 /* check_substr and check_utf8, if non-NULL, point to either their
10159 anchored or float namesakes, and don't hold a second reference. */
486913e4 10160 }
288b8c02 10161 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 10162#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 10163 ret->saved_copy = NULL;
28d8d7f4 10164#endif
288b8c02 10165 ret->mother_re = rx;
28d8d7f4 10166
288b8c02 10167 return ret_x;
28d8d7f4 10168}
f8149455
YO
10169#endif
10170
10171/* regfree_internal()
10172
10173 Free the private data in a regexp. This is overloadable by
10174 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 10175 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
10176 know about, however of course we have to handle the
10177 regexp_internal structure when no extension is in use.
10178
10179 Note this is called before freeing anything in the regexp
10180 structure.
10181 */
10182
10183void
288b8c02 10184Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
10185{
10186 dVAR;
288b8c02 10187 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10188 RXi_GET_DECL(r,ri);
10189 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10190
10191 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10192
f8149455
YO
10193 DEBUG_COMPILE_r({
10194 if (!PL_colorset)
10195 reginitcolors();
10196 {
10197 SV *dsv= sv_newmortal();
3c8556c3 10198 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 10199 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
10200 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10201 PL_colors[4],PL_colors[5],s);
10202 }
10203 });
7122b237
YO
10204#ifdef RE_TRACK_PATTERN_OFFSETS
10205 if (ri->u.offsets)
10206 Safefree(ri->u.offsets); /* 20010421 MJD */
10207#endif
f8fc2ecf
YO
10208 if (ri->data) {
10209 int n = ri->data->count;
f3548bdc
DM
10210 PAD* new_comppad = NULL;
10211 PAD* old_comppad;
4026c95a 10212 PADOFFSET refcnt;
dfad63ad 10213
c277df42 10214 while (--n >= 0) {
261faec3 10215 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 10216 switch (ri->data->what[n]) {
af534a04 10217 case 'a':
c277df42 10218 case 's':
81714fb9 10219 case 'S':
55eed653 10220 case 'u':
ad64d0ec 10221 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 10222 break;
653099ff 10223 case 'f':
f8fc2ecf 10224 Safefree(ri->data->data[n]);
653099ff 10225 break;
dfad63ad 10226 case 'p':
502c6561 10227 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 10228 break;
c277df42 10229 case 'o':
dfad63ad 10230 if (new_comppad == NULL)
cea2e8a9 10231 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
10232 PAD_SAVE_LOCAL(old_comppad,
10233 /* Watch out for global destruction's random ordering. */
c445ea15 10234 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 10235 );
b34c0dd4 10236 OP_REFCNT_LOCK;
f8fc2ecf 10237 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
10238 OP_REFCNT_UNLOCK;
10239 if (!refcnt)
f8fc2ecf 10240 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 10241
f3548bdc 10242 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 10243 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 10244 new_comppad = NULL;
c277df42
IZ
10245 break;
10246 case 'n':
9e55ce06 10247 break;
07be1b83 10248 case 'T':
be8e71aa
YO
10249 { /* Aho Corasick add-on structure for a trie node.
10250 Used in stclass optimization only */
07be1b83 10251 U32 refcount;
f8fc2ecf 10252 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
10253 OP_REFCNT_LOCK;
10254 refcount = --aho->refcount;
10255 OP_REFCNT_UNLOCK;
10256 if ( !refcount ) {
446bd890
NC
10257 PerlMemShared_free(aho->states);
10258 PerlMemShared_free(aho->fail);
446bd890
NC
10259 /* do this last!!!! */
10260 PerlMemShared_free(ri->data->data[n]);
10261 PerlMemShared_free(ri->regstclass);
07be1b83
YO
10262 }
10263 }
10264 break;
a3621e74 10265 case 't':
07be1b83 10266 {
be8e71aa 10267 /* trie structure. */
07be1b83 10268 U32 refcount;
f8fc2ecf 10269 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
10270 OP_REFCNT_LOCK;
10271 refcount = --trie->refcount;
10272 OP_REFCNT_UNLOCK;
10273 if ( !refcount ) {
446bd890 10274 PerlMemShared_free(trie->charmap);
446bd890
NC
10275 PerlMemShared_free(trie->states);
10276 PerlMemShared_free(trie->trans);
07be1b83 10277 if (trie->bitmap)
446bd890 10278 PerlMemShared_free(trie->bitmap);
786e8c11 10279 if (trie->jump)
446bd890 10280 PerlMemShared_free(trie->jump);
2e64971a 10281 PerlMemShared_free(trie->wordinfo);
446bd890
NC
10282 /* do this last!!!! */
10283 PerlMemShared_free(ri->data->data[n]);
a3621e74 10284 }
07be1b83
YO
10285 }
10286 break;
c277df42 10287 default:
f8fc2ecf 10288 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
10289 }
10290 }
f8fc2ecf
YO
10291 Safefree(ri->data->what);
10292 Safefree(ri->data);
a0d0e21e 10293 }
28d8d7f4 10294
f8fc2ecf 10295 Safefree(ri);
a687059c 10296}
c277df42 10297
a09252eb
NC
10298#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10299#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
10300#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10301
10302/*
32cd70f6 10303 re_dup - duplicate a regexp.
84da74a7 10304
8233f606
DM
10305 This routine is expected to clone a given regexp structure. It is only
10306 compiled under USE_ITHREADS.
32cd70f6 10307
f8149455
YO
10308 After all of the core data stored in struct regexp is duplicated
10309 the regexp_engine.dupe method is used to copy any private data
10310 stored in the *pprivate pointer. This allows extensions to handle
10311 any duplication it needs to do.
10312
10313 See pregfree() and regfree_internal() if you change anything here.
84da74a7 10314*/
a3c0e9ca 10315#if defined(USE_ITHREADS)
f8149455 10316#ifndef PERL_IN_XSUB_RE
288b8c02
NC
10317void
10318Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 10319{
84da74a7 10320 dVAR;
a86a1ca7 10321 I32 npar;
288b8c02
NC
10322 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10323 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 10324
7918f24d
NC
10325 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10326
84da74a7 10327 npar = r->nparens+1;
f0ab9afb
NC
10328 Newx(ret->offs, npar, regexp_paren_pair);
10329 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 10330 if(ret->swap) {
28d8d7f4 10331 /* no need to copy these */
f0ab9afb 10332 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 10333 }
84da74a7 10334
6057429f 10335 if (ret->substrs) {
32cd70f6
NC
10336 /* Do it this way to avoid reading from *r after the StructCopy().
10337 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10338 cache, it doesn't matter. */
66b1de87
NC
10339 const bool anchored = r->check_substr
10340 ? r->check_substr == r->anchored_substr
10341 : r->check_utf8 == r->anchored_utf8;
785a26d5 10342 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
10343 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10344
32cd70f6
NC
10345 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10346 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10347 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10348 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 10349
32cd70f6
NC
10350 /* check_substr and check_utf8, if non-NULL, point to either their
10351 anchored or float namesakes, and don't hold a second reference. */
10352
10353 if (ret->check_substr) {
10354 if (anchored) {
10355 assert(r->check_utf8 == r->anchored_utf8);
10356 ret->check_substr = ret->anchored_substr;
10357 ret->check_utf8 = ret->anchored_utf8;
10358 } else {
10359 assert(r->check_substr == r->float_substr);
10360 assert(r->check_utf8 == r->float_utf8);
10361 ret->check_substr = ret->float_substr;
10362 ret->check_utf8 = ret->float_utf8;
10363 }
66b1de87
NC
10364 } else if (ret->check_utf8) {
10365 if (anchored) {
10366 ret->check_utf8 = ret->anchored_utf8;
10367 } else {
10368 ret->check_utf8 = ret->float_utf8;
10369 }
32cd70f6 10370 }
6057429f 10371 }
f8149455 10372
5daac39c 10373 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 10374
6057429f 10375 if (ret->pprivate)
288b8c02 10376 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 10377
288b8c02 10378 if (RX_MATCH_COPIED(dstr))
6057429f 10379 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
10380 else
10381 ret->subbeg = NULL;
10382#ifdef PERL_OLD_COPY_ON_WRITE
10383 ret->saved_copy = NULL;
10384#endif
6057429f 10385
c2123ae3
NC
10386 if (ret->mother_re) {
10387 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10388 /* Our storage points directly to our mother regexp, but that's
10389 1: a buffer in a different thread
10390 2: something we no longer hold a reference on
10391 so we need to copy it locally. */
10392 /* Note we need to sue SvCUR() on our mother_re, because it, in
10393 turn, may well be pointing to its own mother_re. */
10394 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10395 SvCUR(ret->mother_re)+1));
10396 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10397 }
10398 ret->mother_re = NULL;
10399 }
6057429f 10400 ret->gofs = 0;
f8149455
YO
10401}
10402#endif /* PERL_IN_XSUB_RE */
10403
10404/*
10405 regdupe_internal()
10406
10407 This is the internal complement to regdupe() which is used to copy
10408 the structure pointed to by the *pprivate pointer in the regexp.
10409 This is the core version of the extension overridable cloning hook.
10410 The regexp structure being duplicated will be copied by perl prior
10411 to this and will be provided as the regexp *r argument, however
10412 with the /old/ structures pprivate pointer value. Thus this routine
10413 may override any copying normally done by perl.
10414
10415 It returns a pointer to the new regexp_internal structure.
10416*/
10417
10418void *
288b8c02 10419Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10420{
10421 dVAR;
288b8c02 10422 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10423 regexp_internal *reti;
10424 int len, npar;
10425 RXi_GET_DECL(r,ri);
7918f24d
NC
10426
10427 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10428
10429 npar = r->nparens+1;
7122b237 10430 len = ProgLen(ri);
f8149455 10431
45cf4570 10432 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10433 Copy(ri->program, reti->program, len+1, regnode);
10434
f8149455 10435
f8fc2ecf 10436 reti->regstclass = NULL;
bcdf7404 10437
f8fc2ecf 10438 if (ri->data) {
84da74a7 10439 struct reg_data *d;
f8fc2ecf 10440 const int count = ri->data->count;
84da74a7
YO
10441 int i;
10442
10443 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10444 char, struct reg_data);
10445 Newx(d->what, count, U8);
10446
10447 d->count = count;
10448 for (i = 0; i < count; i++) {
f8fc2ecf 10449 d->what[i] = ri->data->what[i];
84da74a7 10450 switch (d->what[i]) {
af534a04 10451 /* legal options are one of: sSfpontTua
84da74a7 10452 see also regcomp.h and pregfree() */
af534a04 10453 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10454 case 's':
81714fb9 10455 case 'S':
0536c0a7 10456 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10457 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10458 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10459 break;
84da74a7
YO
10460 case 'f':
10461 /* This is cheating. */
10462 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10463 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10464 struct regnode_charclass_class);
f8fc2ecf 10465 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10466 break;
10467 case 'o':
bbe252da
YO
10468 /* Compiled op trees are readonly and in shared memory,
10469 and can thus be shared without duplication. */
84da74a7 10470 OP_REFCNT_LOCK;
f8fc2ecf 10471 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10472 OP_REFCNT_UNLOCK;
10473 break;
23eab42c
NC
10474 case 'T':
10475 /* Trie stclasses are readonly and can thus be shared
10476 * without duplication. We free the stclass in pregfree
10477 * when the corresponding reg_ac_data struct is freed.
10478 */
10479 reti->regstclass= ri->regstclass;
10480 /* Fall through */
84da74a7 10481 case 't':
84da74a7 10482 OP_REFCNT_LOCK;
0536c0a7 10483 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10484 OP_REFCNT_UNLOCK;
0536c0a7
NC
10485 /* Fall through */
10486 case 'n':
10487 d->data[i] = ri->data->data[i];
84da74a7 10488 break;
84da74a7 10489 default:
f8fc2ecf 10490 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10491 }
10492 }
10493
f8fc2ecf 10494 reti->data = d;
84da74a7
YO
10495 }
10496 else
f8fc2ecf 10497 reti->data = NULL;
84da74a7 10498
cde0cee5
YO
10499 reti->name_list_idx = ri->name_list_idx;
10500
7122b237
YO
10501#ifdef RE_TRACK_PATTERN_OFFSETS
10502 if (ri->u.offsets) {
10503 Newx(reti->u.offsets, 2*len+1, U32);
10504 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10505 }
10506#else
10507 SetProgLen(reti,len);
10508#endif
10509
f8149455 10510 return (void*)reti;
84da74a7 10511}
f8149455
YO
10512
10513#endif /* USE_ITHREADS */
84da74a7 10514
f8149455 10515#ifndef PERL_IN_XSUB_RE
bcdf7404 10516
c277df42
IZ
10517/*
10518 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10519 */
10520regnode *
864dbfa3 10521Perl_regnext(pTHX_ register regnode *p)
c277df42 10522{
97aff369 10523 dVAR;
c277df42
IZ
10524 register I32 offset;
10525
f8fc2ecf 10526 if (!p)
c277df42
IZ
10527 return(NULL);
10528
35db910f
KW
10529 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10530 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10531 }
10532
c277df42
IZ
10533 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10534 if (offset == 0)
10535 return(NULL);
10536
c277df42 10537 return(p+offset);
c277df42 10538}
76234dfb 10539#endif
c277df42 10540
01f988be 10541STATIC void
cea2e8a9 10542S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10543{
10544 va_list args;
10545 STRLEN l1 = strlen(pat1);
10546 STRLEN l2 = strlen(pat2);
10547 char buf[512];
06bf62c7 10548 SV *msv;
73d840c0 10549 const char *message;
c277df42 10550
7918f24d
NC
10551 PERL_ARGS_ASSERT_RE_CROAK2;
10552
c277df42
IZ
10553 if (l1 > 510)
10554 l1 = 510;
10555 if (l1 + l2 > 510)
10556 l2 = 510 - l1;
10557 Copy(pat1, buf, l1 , char);
10558 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10559 buf[l1 + l2] = '\n';
10560 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10561#ifdef I_STDARG
10562 /* ANSI variant takes additional second argument */
c277df42 10563 va_start(args, pat2);
8736538c
AS
10564#else
10565 va_start(args);
10566#endif
5a844595 10567 msv = vmess(buf, &args);
c277df42 10568 va_end(args);
cfd0369c 10569 message = SvPV_const(msv,l1);
c277df42
IZ
10570 if (l1 > 512)
10571 l1 = 512;
10572 Copy(message, buf, l1 , char);
197cf9b9 10573 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10574 Perl_croak(aTHX_ "%s", buf);
c277df42 10575}
a0ed51b3
LW
10576
10577/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10578
76234dfb 10579#ifndef PERL_IN_XSUB_RE
a0ed51b3 10580void
864dbfa3 10581Perl_save_re_context(pTHX)
b81d288d 10582{
97aff369 10583 dVAR;
1ade1aa1
NC
10584
10585 struct re_save_state *state;
10586
10587 SAVEVPTR(PL_curcop);
10588 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10589
10590 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10591 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10592 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10593
46ab3289 10594 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10595
a0ed51b3 10596 PL_reg_start_tmp = 0;
a0ed51b3 10597 PL_reg_start_tmpl = 0;
c445ea15 10598 PL_reg_oldsaved = NULL;
a5db57d6 10599 PL_reg_oldsavedlen = 0;
a5db57d6 10600 PL_reg_maxiter = 0;
a5db57d6 10601 PL_reg_leftiter = 0;
c445ea15 10602 PL_reg_poscache = NULL;
a5db57d6 10603 PL_reg_poscache_size = 0;
1ade1aa1
NC
10604#ifdef PERL_OLD_COPY_ON_WRITE
10605 PL_nrs = NULL;
10606#endif
ada6e8a9 10607
c445ea15
AL
10608 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10609 if (PL_curpm) {
10610 const REGEXP * const rx = PM_GETRE(PL_curpm);
10611 if (rx) {
1df70142 10612 U32 i;
07bc277f 10613 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10614 char digits[TYPE_CHARS(long)];
d9fad198 10615 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10616 GV *const *const gvp
10617 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10618
b37c2d43
AL
10619 if (gvp) {
10620 GV * const gv = *gvp;
10621 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10622 save_scalar(gv);
49f27e4b 10623 }
ada6e8a9
AMS
10624 }
10625 }
10626 }
a0ed51b3 10627}
76234dfb 10628#endif
51371543 10629
51371543 10630static void
acfe0abc 10631clear_re(pTHX_ void *r)
51371543 10632{
97aff369 10633 dVAR;
84679df5 10634 ReREFCNT_dec((REGEXP *)r);
51371543 10635}
ffbc6a93 10636
a28509cc
AL
10637#ifdef DEBUGGING
10638
10639STATIC void
10640S_put_byte(pTHX_ SV *sv, int c)
10641{
7918f24d
NC
10642 PERL_ARGS_ASSERT_PUT_BYTE;
10643
7fddd944
NC
10644 /* Our definition of isPRINT() ignores locales, so only bytes that are
10645 not part of UTF-8 are considered printable. I assume that the same
10646 holds for UTF-EBCDIC.
10647 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10648 which Wikipedia says:
10649
10650 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10651 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10652 identical, to the ASCII delete (DEL) or rubout control character.
10653 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
10654 if (!isPRINT(c)) {
10655 if (c < 256) {
10656 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10657 }
10658 else {
10659 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10660 }
10661 }
5e7aa789 10662 else {
88c9ea1e 10663 const char string = c;
5e7aa789
NC
10664 if (c == '-' || c == ']' || c == '\\' || c == '^')
10665 sv_catpvs(sv, "\\");
10666 sv_catpvn(sv, &string, 1);
10667 }
a28509cc
AL
10668}
10669
786e8c11 10670
3dab1dad
YO
10671#define CLEAR_OPTSTART \
10672 if (optstart) STMT_START { \
70685ca0 10673 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10674 optstart=NULL; \
10675 } STMT_END
10676
786e8c11 10677#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10678
b5a2f8d8
NC
10679STATIC const regnode *
10680S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10681 const regnode *last, const regnode *plast,
10682 SV* sv, I32 indent, U32 depth)
a28509cc 10683{
97aff369 10684 dVAR;
786e8c11 10685 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10686 register const regnode *next;
3dab1dad 10687 const regnode *optstart= NULL;
1f1031fe 10688
f8fc2ecf 10689 RXi_GET_DECL(r,ri);
3dab1dad 10690 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10691
10692 PERL_ARGS_ASSERT_DUMPUNTIL;
10693
786e8c11
YO
10694#ifdef DEBUG_DUMPUNTIL
10695 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10696 last ? last-start : 0,plast ? plast-start : 0);
10697#endif
10698
10699 if (plast && plast < last)
10700 last= plast;
10701
10702 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10703 /* While that wasn't END last time... */
a28509cc
AL
10704 NODE_ALIGN(node);
10705 op = OP(node);
de734bd5 10706 if (op == CLOSE || op == WHILEM)
786e8c11 10707 indent--;
b5a2f8d8 10708 next = regnext((regnode *)node);
1f1031fe 10709
a28509cc 10710 /* Where, what. */
8e11feef 10711 if (OP(node) == OPTIMIZED) {
e68ec53f 10712 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10713 optstart = node;
3dab1dad 10714 else
8e11feef 10715 goto after_print;
3dab1dad
YO
10716 } else
10717 CLEAR_OPTSTART;
1f1031fe 10718
32fc9b6a 10719 regprop(r, sv, node);
a28509cc 10720 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10721 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10722
10723 if (OP(node) != OPTIMIZED) {
10724 if (next == NULL) /* Next ptr. */
10725 PerlIO_printf(Perl_debug_log, " (0)");
10726 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10727 PerlIO_printf(Perl_debug_log, " (FAIL)");
10728 else
10729 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10730 (void)PerlIO_putc(Perl_debug_log, '\n');
10731 }
10732
a28509cc
AL
10733 after_print:
10734 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10735 assert(next);
10736 {
10737 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10738 ? regnext((regnode *)next)
10739 : next);
be8e71aa
YO
10740 if (last && nnode > last)
10741 nnode = last;
786e8c11 10742 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10743 }
a28509cc
AL
10744 }
10745 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10746 assert(next);
786e8c11 10747 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10748 }
10749 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10750 const regnode *this_trie = node;
1de06328 10751 const char op = OP(node);
647f639f 10752 const U32 n = ARG(node);
1de06328 10753 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10754 (reg_ac_data *)ri->data->data[n] :
1de06328 10755 NULL;
3251b653
NC
10756 const reg_trie_data * const trie =
10757 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10758#ifdef DEBUGGING
502c6561 10759 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10760#endif
786e8c11 10761 const regnode *nextbranch= NULL;
a28509cc 10762 I32 word_idx;
76f68e9b 10763 sv_setpvs(sv, "");
786e8c11 10764 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10765 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10766
10767 PerlIO_printf(Perl_debug_log, "%*s%s ",
10768 (int)(2*(indent+3)), "",
10769 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10770 PL_colors[0], PL_colors[1],
10771 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10772 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10773 PERL_PV_PRETTY_LTGT
786e8c11
YO
10774 )
10775 : "???"
10776 );
10777 if (trie->jump) {
40d049e4 10778 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10779 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10780 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10781 if (dist) {
10782 if (!nextbranch)
24b23f37 10783 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10784 DUMPUNTIL(this_trie + dist, nextbranch);
10785 }
786e8c11
YO
10786 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10787 nextbranch= regnext((regnode *)nextbranch);
10788 } else {
10789 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10790 }
786e8c11
YO
10791 }
10792 if (last && next > last)
10793 node= last;
10794 else
10795 node= next;
a28509cc 10796 }
786e8c11
YO
10797 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10798 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10799 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10800 }
10801 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10802 assert(next);
786e8c11 10803 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10804 }
10805 else if ( op == PLUS || op == STAR) {
786e8c11 10806 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 10807 }
f56b6394 10808 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 10809 /* arglen 1 + class block */
4a3ee7a8 10810 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
10811 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10812 node = NEXTOPER(node);
10813 }
10814 else if (PL_regkind[(U8)op] == EXACT) {
10815 /* Literal string, where present. */
10816 node += NODE_SZ_STR(node) - 1;
10817 node = NEXTOPER(node);
10818 }
10819 else {
10820 node = NEXTOPER(node);
10821 node += regarglen[(U8)op];
10822 }
10823 if (op == CURLYX || op == OPEN)
786e8c11 10824 indent++;
a28509cc 10825 }
3dab1dad 10826 CLEAR_OPTSTART;
786e8c11 10827#ifdef DEBUG_DUMPUNTIL
70685ca0 10828 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10829#endif
1de06328 10830 return node;
a28509cc
AL
10831}
10832
10833#endif /* DEBUGGING */
10834
241d1a3b
NC
10835/*
10836 * Local variables:
10837 * c-indentation-style: bsd
10838 * c-basic-offset: 4
10839 * indent-tabs-mode: t
10840 * End:
10841 *
37442d52
RGS
10842 * ex: set ts=8 sts=4 sw=4 noet:
10843 */