This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlrequick tweaks
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
830247a4
IZ
145#if ADD_TO_REGEXEC
146 char *starttry; /* -Dr: where regtry was called. */
147#define RExC_starttry (pRExC_state->starttry)
148#endif
3dab1dad 149#ifdef DEBUGGING
be8e71aa 150 const char *lastparse;
3dab1dad 151 I32 lastnum;
1f1031fe 152 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
153#define RExC_lastparse (pRExC_state->lastparse)
154#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 155#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 156#endif
830247a4
IZ
157} RExC_state_t;
158
e2509266 159#define RExC_flags (pRExC_state->flags)
830247a4 160#define RExC_precomp (pRExC_state->precomp)
288b8c02 161#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 162#define RExC_rx (pRExC_state->rx)
f8fc2ecf 163#define RExC_rxi (pRExC_state->rxi)
fac92740 164#define RExC_start (pRExC_state->start)
830247a4
IZ
165#define RExC_end (pRExC_state->end)
166#define RExC_parse (pRExC_state->parse)
167#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
168#ifdef RE_TRACK_PATTERN_OFFSETS
169#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
170#endif
830247a4 171#define RExC_emit (pRExC_state->emit)
fac92740 172#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 173#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
174#define RExC_naughty (pRExC_state->naughty)
175#define RExC_sawback (pRExC_state->sawback)
176#define RExC_seen (pRExC_state->seen)
177#define RExC_size (pRExC_state->size)
178#define RExC_npar (pRExC_state->npar)
e2e6a0f1 179#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
180#define RExC_extralen (pRExC_state->extralen)
181#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 183#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 184#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 185#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
186#define RExC_open_parens (pRExC_state->open_parens)
187#define RExC_close_parens (pRExC_state->close_parens)
188#define RExC_opend (pRExC_state->opend)
81714fb9 189#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
190#define RExC_recurse (pRExC_state->recurse)
191#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 192#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
830247a4 193
cde0cee5 194
a687059c
LW
195#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
a687059c 198
35c8bce7
LW
199#ifdef SPSTART
200#undef SPSTART /* dratted cpp namespace... */
201#endif
a687059c
LW
202/*
203 * Flags to be passed up and down.
204 */
a687059c 205#define WORST 0 /* Worst case. */
a3b492c3 206#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
207
208/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 210#define SIMPLE 0x02
a3b492c3
YO
211#define SPSTART 0x04 /* Starts with * or +. */
212#define TRYAGAIN 0x08 /* Weeded out a declaration. */
213#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 214
3dab1dad
YO
215#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
216
07be1b83
YO
217/* whether trie related optimizations are enabled */
218#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219#define TRIE_STUDY_OPT
786e8c11 220#define FULL_TRIE_STUDY
07be1b83
YO
221#define TRIE_STCLASS
222#endif
1de06328
YO
223
224
40d049e4
YO
225
226#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227#define PBITVAL(paren) (1 << ((paren) & 7))
228#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
231
bbd61b5f
KW
232/* If not already in utf8, do a longjmp back to the beginning */
233#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234#define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
236 } STMT_END
40d049e4 237
1de06328
YO
238/* About scan_data_t.
239
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
3b753521 244 string that must appear at a fixed location, and we look for the
1de06328
YO
245 longest string that may appear at a floating location. So for instance
246 in the pattern:
247
248 /FOO[xX]A.*B[xX]BAR/
249
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
253
254 The strings can be composites, for instance
255
256 /(f)(o)(o)/
257
258 will result in a composite fixed substring 'foo'.
259
260 For each string some basic information is maintained:
261
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
1de06328
YO
267 tells us how many characters must appear after the string we have
268 found.
269
270 - max_offset
271 Only used for floating strings. This is the rightmost point that
3b753521 272 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
273 string can occur infinitely far to the right.
274
275 - minlenp
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
279 involved. Consider
280
281 /(?=FOO).*F/
282
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
486ec47a 289 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
293
294 - lookbehind
295
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
306 associated string.
307
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
310
311*/
2c2d71f5
JH
312
313typedef struct scan_data_t {
1de06328
YO
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
2c2d71f5
JH
316 I32 pos_min;
317 I32 pos_delta;
318 SV *last_found;
1de06328 319 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
320 I32 last_start_min;
321 I32 last_start_max;
1de06328
YO
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
486ec47a 325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 331 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
332 I32 flags;
333 I32 whilem_c;
cb434fcc 334 I32 *last_closep;
653099ff 335 struct regnode_charclass_class *start_class;
2c2d71f5
JH
336} scan_data_t;
337
a687059c 338/*
e50aee73 339 * Forward declarations for pregcomp()'s friends.
a687059c 340 */
a0d0e21e 341
27da23d5 342static const scan_data_t zero_scan_data =
1de06328 343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
344
345#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
346#define SF_BEFORE_SEOL 0x0001
347#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
348#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
350
09b7f37c
CB
351#ifdef NO_UNARY_PLUS
352# define SF_FIX_SHIFT_EOL (0+2)
353# define SF_FL_SHIFT_EOL (0+4)
354#else
355# define SF_FIX_SHIFT_EOL (+2)
356# define SF_FL_SHIFT_EOL (+4)
357#endif
c277df42
IZ
358
359#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
361
362#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
364#define SF_IS_INF 0x0040
365#define SF_HAS_PAR 0x0080
366#define SF_IN_PAR 0x0100
367#define SF_HAS_EVAL 0x0200
368#define SCF_DO_SUBSTR 0x0400
653099ff
GS
369#define SCF_DO_STCLASS_AND 0x0800
370#define SCF_DO_STCLASS_OR 0x1000
371#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 372#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 373
786e8c11 374#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 375#define SCF_SEEN_ACCEPT 0x8000
07be1b83 376
43fead97 377#define UTF cBOOL(RExC_utf8)
a62b1201
KW
378#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
380#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
383#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
384#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 385
43fead97 386#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 387
ffc61ed2 388#define OOB_UNICODE 12345678
93733859 389#define OOB_NAMEDCLASS -1
b8c5462f 390
a0ed51b3
LW
391#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
392#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
393
8615cb43 394
b45f050a
JF
395/* length of regex to show in messages that don't mark a position within */
396#define RegexLengthToShowInErrorMessages 127
397
398/*
399 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
400 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
401 * op/pragma/warn/regcomp.
402 */
7253e4e3
RK
403#define MARKER1 "<-- HERE" /* marker as it appears in the description */
404#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 405
7253e4e3 406#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
410 * arg. Show regex, up to a maximum length. If it's too long, chop and add
411 * "...".
412 */
58e23c8d 413#define _FAIL(code) STMT_START { \
bfed75c6 414 const char *ellipses = ""; \
ccb2c380
MP
415 IV len = RExC_end - RExC_precomp; \
416 \
417 if (!SIZE_ONLY) \
288b8c02 418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
419 if (len > RegexLengthToShowInErrorMessages) { \
420 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
421 len = RegexLengthToShowInErrorMessages - 10; \
422 ellipses = "..."; \
423 } \
58e23c8d 424 code; \
ccb2c380 425} STMT_END
8615cb43 426
58e23c8d
YO
427#define FAIL(msg) _FAIL( \
428 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
429 msg, (int)len, RExC_precomp, ellipses))
430
431#define FAIL2(msg,arg) _FAIL( \
432 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
433 arg, (int)len, RExC_precomp, ellipses))
434
b45f050a 435/*
b45f050a
JF
436 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
437 */
ccb2c380 438#define Simple_vFAIL(m) STMT_START { \
a28509cc 439 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
440 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
441 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
442} STMT_END
b45f050a
JF
443
444/*
445 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
446 */
ccb2c380
MP
447#define vFAIL(m) STMT_START { \
448 if (!SIZE_ONLY) \
288b8c02 449 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
450 Simple_vFAIL(m); \
451} STMT_END
b45f050a
JF
452
453/*
454 * Like Simple_vFAIL(), but accepts two arguments.
455 */
ccb2c380 456#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 457 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
458 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
459 (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
b45f050a
JF
461
462/*
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
464 */
ccb2c380
MP
465#define vFAIL2(m,a1) STMT_START { \
466 if (!SIZE_ONLY) \
288b8c02 467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
468 Simple_vFAIL2(m, a1); \
469} STMT_END
b45f050a
JF
470
471
472/*
473 * Like Simple_vFAIL(), but accepts three arguments.
474 */
ccb2c380 475#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 476 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
477 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
478 (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
b45f050a
JF
480
481/*
482 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
483 */
ccb2c380
MP
484#define vFAIL3(m,a1,a2) STMT_START { \
485 if (!SIZE_ONLY) \
288b8c02 486 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
487 Simple_vFAIL3(m, a1, a2); \
488} STMT_END
b45f050a
JF
489
490/*
491 * Like Simple_vFAIL(), but accepts four arguments.
492 */
ccb2c380 493#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 494 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
b45f050a 498
668c081a 499#define ckWARNreg(loc,m) STMT_START { \
a28509cc 500 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
501 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
503} STMT_END
504
668c081a 505#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 506 const IV offset = loc - RExC_precomp; \
d1d15184 507 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
508 m REPORT_LOCATION, \
509 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
510} STMT_END
511
2335b3d3
KW
512#define ckWARN2regdep(loc,m, a1) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
515 m REPORT_LOCATION, \
516 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
668c081a 519#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 520 const IV offset = loc - RExC_precomp; \
668c081a 521 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
522 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
523} STMT_END
524
525#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 526 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
527 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
528 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
529} STMT_END
530
668c081a
NC
531#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
535} STMT_END
536
ccb2c380 537#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 538 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
539 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
541} STMT_END
542
668c081a
NC
543#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
547} STMT_END
548
ccb2c380 549#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 550 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
551 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
553} STMT_END
9d1d55b5 554
8615cb43 555
cd439c50 556/* Allow for side effects in s */
ccb2c380
MP
557#define REGC(c,s) STMT_START { \
558 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
559} STMT_END
cd439c50 560
fac92740
MJD
561/* Macros for recording node offsets. 20001227 mjd@plover.com
562 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
563 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
564 * Element 0 holds the number n.
07be1b83 565 * Position is 1 indexed.
fac92740 566 */
7122b237
YO
567#ifndef RE_TRACK_PATTERN_OFFSETS
568#define Set_Node_Offset_To_R(node,byte)
569#define Set_Node_Offset(node,byte)
570#define Set_Cur_Node_Offset
571#define Set_Node_Length_To_R(node,len)
572#define Set_Node_Length(node,len)
573#define Set_Node_Cur_Length(node)
574#define Node_Offset(n)
575#define Node_Length(n)
576#define Set_Node_Offset_Length(node,offset,len)
577#define ProgLen(ri) ri->u.proglen
578#define SetProgLen(ri,x) ri->u.proglen = x
579#else
580#define ProgLen(ri) ri->u.offsets[0]
581#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
582#define Set_Node_Offset_To_R(node,byte) STMT_START { \
583 if (! SIZE_ONLY) { \
584 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 585 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 586 if((node) < 0) { \
551405c4 587 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
588 } else { \
589 RExC_offsets[2*(node)-1] = (byte); \
590 } \
591 } \
592} STMT_END
593
594#define Set_Node_Offset(node,byte) \
595 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
596#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
597
598#define Set_Node_Length_To_R(node,len) STMT_START { \
599 if (! SIZE_ONLY) { \
600 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 601 __LINE__, (int)(node), (int)(len))); \
ccb2c380 602 if((node) < 0) { \
551405c4 603 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
604 } else { \
605 RExC_offsets[2*(node)] = (len); \
606 } \
607 } \
608} STMT_END
609
610#define Set_Node_Length(node,len) \
611 Set_Node_Length_To_R((node)-RExC_emit_start, len)
612#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
613#define Set_Node_Cur_Length(node) \
614 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
615
616/* Get offsets and lengths */
617#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
618#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
619
07be1b83
YO
620#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
621 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
622 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
623} STMT_END
7122b237 624#endif
07be1b83
YO
625
626#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
627#define EXPERIMENTAL_INPLACESCAN
f427392e 628#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 629
304ee84b
YO
630#define DEBUG_STUDYDATA(str,data,depth) \
631DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 632 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
633 "%*s" str "Pos:%"IVdf"/%"IVdf \
634 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
635 (int)(depth)*2, "", \
636 (IV)((data)->pos_min), \
637 (IV)((data)->pos_delta), \
304ee84b 638 (UV)((data)->flags), \
1de06328 639 (IV)((data)->whilem_c), \
304ee84b
YO
640 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
641 is_inf ? "INF " : "" \
1de06328
YO
642 ); \
643 if ((data)->last_found) \
644 PerlIO_printf(Perl_debug_log, \
645 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
646 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
647 SvPVX_const((data)->last_found), \
648 (IV)((data)->last_end), \
649 (IV)((data)->last_start_min), \
650 (IV)((data)->last_start_max), \
651 ((data)->longest && \
652 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
653 SvPVX_const((data)->longest_fixed), \
654 (IV)((data)->offset_fixed), \
655 ((data)->longest && \
656 (data)->longest==&((data)->longest_float)) ? "*" : "", \
657 SvPVX_const((data)->longest_float), \
658 (IV)((data)->offset_float_min), \
659 (IV)((data)->offset_float_max) \
660 ); \
661 PerlIO_printf(Perl_debug_log,"\n"); \
662});
663
acfe0abc 664static void clear_re(pTHX_ void *r);
4327152a 665
653099ff 666/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 667 Update the longest found anchored substring and the longest found
653099ff
GS
668 floating substrings if needed. */
669
4327152a 670STATIC void
304ee84b 671S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 672{
e1ec3a88
AL
673 const STRLEN l = CHR_SVLEN(data->last_found);
674 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 675 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 676
7918f24d
NC
677 PERL_ARGS_ASSERT_SCAN_COMMIT;
678
c277df42 679 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 680 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
681 if (*data->longest == data->longest_fixed) {
682 data->offset_fixed = l ? data->last_start_min : data->pos_min;
683 if (data->flags & SF_BEFORE_EOL)
b81d288d 684 data->flags
c277df42
IZ
685 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
686 else
687 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
688 data->minlen_fixed=minlenp;
689 data->lookbehind_fixed=0;
a0ed51b3 690 }
304ee84b 691 else { /* *data->longest == data->longest_float */
c277df42 692 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
693 data->offset_float_max = (l
694 ? data->last_start_max
c277df42 695 : data->pos_min + data->pos_delta);
304ee84b 696 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 697 data->offset_float_max = I32_MAX;
c277df42 698 if (data->flags & SF_BEFORE_EOL)
b81d288d 699 data->flags
c277df42
IZ
700 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
701 else
702 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
703 data->minlen_float=minlenp;
704 data->lookbehind_float=0;
c277df42
IZ
705 }
706 }
707 SvCUR_set(data->last_found, 0);
0eda9292 708 {
a28509cc 709 SV * const sv = data->last_found;
097eb12c
AL
710 if (SvUTF8(sv) && SvMAGICAL(sv)) {
711 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
712 if (mg)
713 mg->mg_len = 0;
714 }
0eda9292 715 }
c277df42
IZ
716 data->last_end = -1;
717 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 718 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
719}
720
653099ff
GS
721/* Can match anything (initialization) */
722STATIC void
097eb12c 723S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 724{
7918f24d
NC
725 PERL_ARGS_ASSERT_CL_ANYTHING;
726
653099ff 727 ANYOF_CLASS_ZERO(cl);
f8bef550 728 ANYOF_BITMAP_SETALL(cl);
11454c59 729 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
730 if (LOC)
731 cl->flags |= ANYOF_LOCALE;
732}
733
734/* Can match anything (initialization) */
735STATIC int
5f66b61c 736S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
737{
738 int value;
739
7918f24d
NC
740 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
741
aaa51d5e 742 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
743 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
744 return 1;
1aa99e6b
IH
745 if (!(cl->flags & ANYOF_UNICODE_ALL))
746 return 0;
10edeb5d 747 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 748 return 0;
653099ff
GS
749 return 1;
750}
751
752/* Can match anything (initialization) */
753STATIC void
097eb12c 754S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 755{
7918f24d
NC
756 PERL_ARGS_ASSERT_CL_INIT;
757
8ecf7187 758 Zero(cl, 1, struct regnode_charclass_class);
653099ff 759 cl->type = ANYOF;
830247a4 760 cl_anything(pRExC_state, cl);
653099ff
GS
761}
762
763STATIC void
097eb12c 764S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 765{
7918f24d
NC
766 PERL_ARGS_ASSERT_CL_INIT_ZERO;
767
8ecf7187 768 Zero(cl, 1, struct regnode_charclass_class);
653099ff 769 cl->type = ANYOF;
830247a4 770 cl_anything(pRExC_state, cl);
653099ff
GS
771 if (LOC)
772 cl->flags |= ANYOF_LOCALE;
773}
774
775/* 'And' a given class with another one. Can create false positives */
776/* We assume that cl is not inverted */
777STATIC void
5f66b61c 778S_cl_and(struct regnode_charclass_class *cl,
a28509cc 779 const struct regnode_charclass_class *and_with)
653099ff 780{
7918f24d 781 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
782
783 assert(and_with->type == ANYOF);
1e6ade67
KW
784
785 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
786 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 787 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
788 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
789 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
790 int i;
791
792 if (and_with->flags & ANYOF_INVERT)
793 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
794 cl->bitmap[i] &= ~and_with->bitmap[i];
795 else
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] &= and_with->bitmap[i];
798 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
799 if (!(and_with->flags & ANYOF_EOS))
800 cl->flags &= ~ANYOF_EOS;
1aa99e6b 801
39065660
KW
802 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
803 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
804 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
805 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 806
4713bfe1 807 if (cl->flags & ANYOF_UNICODE_ALL
137165a6 808 && ANYOF_NONBITMAP(and_with)
4713bfe1
KW
809 && !(and_with->flags & ANYOF_INVERT))
810 {
811 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
812 cl->flags &= ~ANYOF_UNICODE_ALL;
813 }
137165a6
KW
814 else {
815
816 /* The intersection of all unicode with something that isn't all
817 * unicode is that something */
818 ARG_SET(cl, ARG(and_with));
819 }
1aa99e6b 820 }
14ebb1a2
JH
821 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
822 !(and_with->flags & ANYOF_INVERT))
137165a6 823 {
1aa99e6b 824 cl->flags &= ~ANYOF_UNICODE_ALL;
137165a6
KW
825 if (! ANYOF_NONBITMAP(and_with)) {
826 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
827 }
828 }
653099ff
GS
829}
830
831/* 'OR' a given class with another one. Can create false positives */
832/* We assume that cl is not inverted */
833STATIC void
097eb12c 834S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 835{
7918f24d
NC
836 PERL_ARGS_ASSERT_CL_OR;
837
653099ff
GS
838 if (or_with->flags & ANYOF_INVERT) {
839 /* We do not use
840 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
841 * <= (B1 | !B2) | (CL1 | !CL2)
842 * which is wasteful if CL2 is small, but we ignore CL2:
843 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
844 * XXXX Can we handle case-fold? Unclear:
845 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
846 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
847 */
848 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
849 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
850 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
851 int i;
852
853 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
854 cl->bitmap[i] |= ~or_with->bitmap[i];
855 } /* XXXX: logic is complicated otherwise */
856 else {
830247a4 857 cl_anything(pRExC_state, cl);
653099ff
GS
858 }
859 } else {
860 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
861 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
862 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
863 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
864 int i;
865
866 /* OR char bitmap and class bitmap separately */
867 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
868 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 869 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
870 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
871 cl->classflags[i] |= or_with->classflags[i];
872 cl->flags |= ANYOF_CLASS;
873 }
874 }
875 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 876 cl_anything(pRExC_state, cl);
653099ff
GS
877 }
878 }
879 if (or_with->flags & ANYOF_EOS)
880 cl->flags |= ANYOF_EOS;
11454c59
KW
881 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
882 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 883
39065660
KW
884 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
885 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 886
9826f543
KW
887 /* If both nodes match something outside the bitmap, but what they match
888 * outside is not the same pointer, and hence not easily compared, give up
889 * and allow the start class to match everything outside the bitmap */
137165a6 890 if (ANYOF_NONBITMAP(cl) && ANYOF_NONBITMAP(or_with) &&
1aa99e6b
IH
891 ARG(cl) != ARG(or_with)) {
892 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 893 }
9826f543 894
1aa99e6b
IH
895 if (or_with->flags & ANYOF_UNICODE_ALL) {
896 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 897 }
653099ff
GS
898}
899
a3621e74
YO
900#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
901#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
902#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
903#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
904
3dab1dad
YO
905
906#ifdef DEBUGGING
07be1b83 907/*
2b8b4781
NC
908 dump_trie(trie,widecharmap,revcharmap)
909 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
910 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
911
912 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
913 The _interim_ variants are used for debugging the interim
914 tables that are used to generate the final compressed
915 representation which is what dump_trie expects.
916
486ec47a 917 Part of the reason for their existence is to provide a form
3dab1dad 918 of documentation as to how the different representations function.
07be1b83
YO
919
920*/
3dab1dad
YO
921
922/*
3dab1dad
YO
923 Dumps the final compressed table form of the trie to Perl_debug_log.
924 Used for debugging make_trie().
925*/
b9a59e08 926
3dab1dad 927STATIC void
2b8b4781
NC
928S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
929 AV *revcharmap, U32 depth)
3dab1dad
YO
930{
931 U32 state;
ab3bbdeb 932 SV *sv=sv_newmortal();
55eed653 933 int colwidth= widecharmap ? 6 : 4;
2e64971a 934 U16 word;
3dab1dad
YO
935 GET_RE_DEBUG_FLAGS_DECL;
936
7918f24d 937 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 938
3dab1dad
YO
939 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
940 (int)depth * 2 + 2,"",
941 "Match","Base","Ofs" );
942
943 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 944 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 945 if ( tmp ) {
ab3bbdeb
YO
946 PerlIO_printf( Perl_debug_log, "%*s",
947 colwidth,
ddc5bc0f 948 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
949 PL_colors[0], PL_colors[1],
950 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
951 PERL_PV_ESCAPE_FIRSTCHAR
952 )
953 );
3dab1dad
YO
954 }
955 }
956 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
957 (int)depth * 2 + 2,"");
958
959 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 960 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
961 PerlIO_printf( Perl_debug_log, "\n");
962
1e2e3d02 963 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 964 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
965
966 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
967
968 if ( trie->states[ state ].wordnum ) {
969 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
970 } else {
971 PerlIO_printf( Perl_debug_log, "%6s", "" );
972 }
973
974 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
975
976 if ( base ) {
977 U32 ofs = 0;
978
979 while( ( base + ofs < trie->uniquecharcount ) ||
980 ( base + ofs - trie->uniquecharcount < trie->lasttrans
981 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
982 ofs++;
983
984 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
985
986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
987 if ( ( base + ofs >= trie->uniquecharcount ) &&
988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
990 {
ab3bbdeb
YO
991 PerlIO_printf( Perl_debug_log, "%*"UVXf,
992 colwidth,
3dab1dad
YO
993 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
994 } else {
ab3bbdeb 995 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
996 }
997 }
998
999 PerlIO_printf( Perl_debug_log, "]");
1000
1001 }
1002 PerlIO_printf( Perl_debug_log, "\n" );
1003 }
2e64971a
DM
1004 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1005 for (word=1; word <= trie->wordcount; word++) {
1006 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1007 (int)word, (int)(trie->wordinfo[word].prev),
1008 (int)(trie->wordinfo[word].len));
1009 }
1010 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1011}
1012/*
3dab1dad
YO
1013 Dumps a fully constructed but uncompressed trie in list form.
1014 List tries normally only are used for construction when the number of
1015 possible chars (trie->uniquecharcount) is very high.
1016 Used for debugging make_trie().
1017*/
1018STATIC void
55eed653 1019S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1020 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1021 U32 depth)
3dab1dad
YO
1022{
1023 U32 state;
ab3bbdeb 1024 SV *sv=sv_newmortal();
55eed653 1025 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1026 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1027
1028 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1029
3dab1dad 1030 /* print out the table precompression. */
ab3bbdeb
YO
1031 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1032 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1033 "------:-----+-----------------\n" );
3dab1dad
YO
1034
1035 for( state=1 ; state < next_alloc ; state ++ ) {
1036 U16 charid;
1037
ab3bbdeb 1038 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1039 (int)depth * 2 + 2,"", (UV)state );
1040 if ( ! trie->states[ state ].wordnum ) {
1041 PerlIO_printf( Perl_debug_log, "%5s| ","");
1042 } else {
1043 PerlIO_printf( Perl_debug_log, "W%4x| ",
1044 trie->states[ state ].wordnum
1045 );
1046 }
1047 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1048 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1049 if ( tmp ) {
1050 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1051 colwidth,
ddc5bc0f 1052 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1053 PL_colors[0], PL_colors[1],
1054 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1055 PERL_PV_ESCAPE_FIRSTCHAR
1056 ) ,
1e2e3d02
YO
1057 TRIE_LIST_ITEM(state,charid).forid,
1058 (UV)TRIE_LIST_ITEM(state,charid).newstate
1059 );
1060 if (!(charid % 10))
664e119d
RGS
1061 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1062 (int)((depth * 2) + 14), "");
1e2e3d02 1063 }
ab3bbdeb
YO
1064 }
1065 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1066 }
1067}
1068
1069/*
3dab1dad
YO
1070 Dumps a fully constructed but uncompressed trie in table form.
1071 This is the normal DFA style state transition table, with a few
1072 twists to facilitate compression later.
1073 Used for debugging make_trie().
1074*/
1075STATIC void
55eed653 1076S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1077 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1078 U32 depth)
3dab1dad
YO
1079{
1080 U32 state;
1081 U16 charid;
ab3bbdeb 1082 SV *sv=sv_newmortal();
55eed653 1083 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1084 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1085
1086 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1087
1088 /*
1089 print out the table precompression so that we can do a visual check
1090 that they are identical.
1091 */
1092
1093 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1094
1095 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1096 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1097 if ( tmp ) {
ab3bbdeb
YO
1098 PerlIO_printf( Perl_debug_log, "%*s",
1099 colwidth,
ddc5bc0f 1100 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1101 PL_colors[0], PL_colors[1],
1102 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1103 PERL_PV_ESCAPE_FIRSTCHAR
1104 )
1105 );
3dab1dad
YO
1106 }
1107 }
1108
1109 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1110
1111 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1112 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1113 }
1114
1115 PerlIO_printf( Perl_debug_log, "\n" );
1116
1117 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1118
1119 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1120 (int)depth * 2 + 2,"",
1121 (UV)TRIE_NODENUM( state ) );
1122
1123 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1124 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1125 if (v)
1126 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1127 else
1128 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1129 }
1130 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1131 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1132 } else {
1133 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1134 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1135 }
1136 }
07be1b83 1137}
3dab1dad
YO
1138
1139#endif
1140
2e64971a 1141
786e8c11
YO
1142/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1143 startbranch: the first branch in the whole branch sequence
1144 first : start branch of sequence of branch-exact nodes.
1145 May be the same as startbranch
1146 last : Thing following the last branch.
1147 May be the same as tail.
1148 tail : item following the branch sequence
1149 count : words in the sequence
1150 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1151 depth : indent depth
3dab1dad 1152
786e8c11 1153Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1154
786e8c11
YO
1155A trie is an N'ary tree where the branches are determined by digital
1156decomposition of the key. IE, at the root node you look up the 1st character and
1157follow that branch repeat until you find the end of the branches. Nodes can be
1158marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1159
786e8c11 1160 /he|she|his|hers/
72f13be8 1161
786e8c11
YO
1162would convert into the following structure. Numbers represent states, letters
1163following numbers represent valid transitions on the letter from that state, if
1164the number is in square brackets it represents an accepting state, otherwise it
1165will be in parenthesis.
07be1b83 1166
786e8c11
YO
1167 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1168 | |
1169 | (2)
1170 | |
1171 (1) +-i->(6)-+-s->[7]
1172 |
1173 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1174
786e8c11
YO
1175 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1176
1177This shows that when matching against the string 'hers' we will begin at state 1
1178read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1179then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1180is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1181single traverse. We store a mapping from accepting to state to which word was
1182matched, and then when we have multiple possibilities we try to complete the
1183rest of the regex in the order in which they occured in the alternation.
1184
1185The only prior NFA like behaviour that would be changed by the TRIE support is
1186the silent ignoring of duplicate alternations which are of the form:
1187
1188 / (DUPE|DUPE) X? (?{ ... }) Y /x
1189
4b714af6 1190Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1191and without the optimisation. With the optimisations dupes will be silently
486ec47a 1192ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1193the following demonstrates:
1194
1195 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1196
1197which prints out 'word' three times, but
1198
1199 'words'=~/(word|word|word)(?{ print $1 })S/
1200
1201which doesnt print it out at all. This is due to other optimisations kicking in.
1202
1203Example of what happens on a structural level:
1204
486ec47a 1205The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1206
1207 1: CURLYM[1] {1,32767}(18)
1208 5: BRANCH(8)
1209 6: EXACT <ac>(16)
1210 8: BRANCH(11)
1211 9: EXACT <ad>(16)
1212 11: BRANCH(14)
1213 12: EXACT <ab>(16)
1214 16: SUCCEED(0)
1215 17: NOTHING(18)
1216 18: END(0)
1217
1218This would be optimizable with startbranch=5, first=5, last=16, tail=16
1219and should turn into:
1220
1221 1: CURLYM[1] {1,32767}(18)
1222 5: TRIE(16)
1223 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1224 <ac>
1225 <ad>
1226 <ab>
1227 16: SUCCEED(0)
1228 17: NOTHING(18)
1229 18: END(0)
1230
1231Cases where tail != last would be like /(?foo|bar)baz/:
1232
1233 1: BRANCH(4)
1234 2: EXACT <foo>(8)
1235 4: BRANCH(7)
1236 5: EXACT <bar>(8)
1237 7: TAIL(8)
1238 8: EXACT <baz>(10)
1239 10: END(0)
1240
1241which would be optimizable with startbranch=1, first=1, last=7, tail=8
1242and would end up looking like:
1243
1244 1: TRIE(8)
1245 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1246 <foo>
1247 <bar>
1248 7: TAIL(8)
1249 8: EXACT <baz>(10)
1250 10: END(0)
1251
1252 d = uvuni_to_utf8_flags(d, uv, 0);
1253
1254is the recommended Unicode-aware way of saying
1255
1256 *(d++) = uv;
1257*/
1258
1e2e3d02 1259#define TRIE_STORE_REVCHAR \
786e8c11 1260 STMT_START { \
73031816
NC
1261 if (UTF) { \
1262 SV *zlopp = newSV(2); \
88c9ea1e
CB
1263 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1264 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1265 SvCUR_set(zlopp, kapow - flrbbbbb); \
1266 SvPOK_on(zlopp); \
1267 SvUTF8_on(zlopp); \
1268 av_push(revcharmap, zlopp); \
1269 } else { \
6bdeddd2 1270 char ooooff = (char)uvc; \
73031816
NC
1271 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1272 } \
1273 } STMT_END
786e8c11
YO
1274
1275#define TRIE_READ_CHAR STMT_START { \
1276 wordlen++; \
1277 if ( UTF ) { \
1278 if ( folder ) { \
1279 if ( foldlen > 0 ) { \
1280 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1281 foldlen -= len; \
1282 scan += len; \
1283 len = 0; \
1284 } else { \
1285 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1286 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1287 foldlen -= UNISKIP( uvc ); \
1288 scan = foldbuf + UNISKIP( uvc ); \
1289 } \
1290 } else { \
1291 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1292 } \
1293 } else { \
1294 uvc = (U32)*uc; \
1295 len = 1; \
1296 } \
1297} STMT_END
1298
1299
1300
1301#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1302 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1303 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1304 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1305 } \
1306 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1307 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1308 TRIE_LIST_CUR( state )++; \
1309} STMT_END
07be1b83 1310
786e8c11
YO
1311#define TRIE_LIST_NEW(state) STMT_START { \
1312 Newxz( trie->states[ state ].trans.list, \
1313 4, reg_trie_trans_le ); \
1314 TRIE_LIST_CUR( state ) = 1; \
1315 TRIE_LIST_LEN( state ) = 4; \
1316} STMT_END
07be1b83 1317
786e8c11
YO
1318#define TRIE_HANDLE_WORD(state) STMT_START { \
1319 U16 dupe= trie->states[ state ].wordnum; \
1320 regnode * const noper_next = regnext( noper ); \
1321 \
786e8c11
YO
1322 DEBUG_r({ \
1323 /* store the word for dumping */ \
1324 SV* tmp; \
1325 if (OP(noper) != NOTHING) \
740cce10 1326 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1327 else \
740cce10 1328 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1329 av_push( trie_words, tmp ); \
786e8c11
YO
1330 }); \
1331 \
1332 curword++; \
2e64971a
DM
1333 trie->wordinfo[curword].prev = 0; \
1334 trie->wordinfo[curword].len = wordlen; \
1335 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1336 \
1337 if ( noper_next < tail ) { \
1338 if (!trie->jump) \
c944940b 1339 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1340 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1341 if (!jumper) \
1342 jumper = noper_next; \
1343 if (!nextbranch) \
1344 nextbranch= regnext(cur); \
1345 } \
1346 \
1347 if ( dupe ) { \
2e64971a
DM
1348 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1349 /* chain, so that when the bits of chain are later */\
1350 /* linked together, the dups appear in the chain */\
1351 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1352 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1353 } else { \
1354 /* we haven't inserted this word yet. */ \
1355 trie->states[ state ].wordnum = curword; \
1356 } \
1357} STMT_END
07be1b83 1358
3dab1dad 1359
786e8c11
YO
1360#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1361 ( ( base + charid >= ucharcount \
1362 && base + charid < ubound \
1363 && state == trie->trans[ base - ucharcount + charid ].check \
1364 && trie->trans[ base - ucharcount + charid ].next ) \
1365 ? trie->trans[ base - ucharcount + charid ].next \
1366 : ( state==1 ? special : 0 ) \
1367 )
3dab1dad 1368
786e8c11
YO
1369#define MADE_TRIE 1
1370#define MADE_JUMP_TRIE 2
1371#define MADE_EXACT_TRIE 4
3dab1dad 1372
a3621e74 1373STATIC I32
786e8c11 1374S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1375{
27da23d5 1376 dVAR;
a3621e74
YO
1377 /* first pass, loop through and scan words */
1378 reg_trie_data *trie;
55eed653 1379 HV *widecharmap = NULL;
2b8b4781 1380 AV *revcharmap = newAV();
a3621e74 1381 regnode *cur;
9f7f3913 1382 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1383 STRLEN len = 0;
1384 UV uvc = 0;
1385 U16 curword = 0;
1386 U32 next_alloc = 0;
786e8c11
YO
1387 regnode *jumper = NULL;
1388 regnode *nextbranch = NULL;
7f69552c 1389 regnode *convert = NULL;
2e64971a 1390 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1391 /* we just use folder as a flag in utf8 */
1e696034 1392 const U8 * folder = NULL;
a3621e74 1393
2b8b4781
NC
1394#ifdef DEBUGGING
1395 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1396 AV *trie_words = NULL;
1397 /* along with revcharmap, this only used during construction but both are
1398 * useful during debugging so we store them in the struct when debugging.
8e11feef 1399 */
2b8b4781
NC
1400#else
1401 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1402 STRLEN trie_charcount=0;
3dab1dad 1403#endif
2b8b4781 1404 SV *re_trie_maxbuff;
a3621e74 1405 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1406
1407 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1408#ifndef DEBUGGING
1409 PERL_UNUSED_ARG(depth);
1410#endif
a3621e74 1411
1e696034 1412 switch (flags) {
2f7f8cb1 1413 case EXACTFA:
1e696034
KW
1414 case EXACTFU: folder = PL_fold_latin1; break;
1415 case EXACTF: folder = PL_fold; break;
1416 case EXACTFL: folder = PL_fold_locale; break;
1417 }
1418
c944940b 1419 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1420 trie->refcount = 1;
3dab1dad 1421 trie->startstate = 1;
786e8c11 1422 trie->wordcount = word_count;
f8fc2ecf 1423 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1424 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1425 if (!(UTF && folder))
c944940b 1426 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1427 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1428 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1429
a3621e74 1430 DEBUG_r({
2b8b4781 1431 trie_words = newAV();
a3621e74 1432 });
a3621e74 1433
0111c4fd 1434 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1435 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1436 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1437 }
3dab1dad
YO
1438 DEBUG_OPTIMISE_r({
1439 PerlIO_printf( Perl_debug_log,
786e8c11 1440 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1441 (int)depth * 2 + 2, "",
1442 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1443 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1444 (int)depth);
3dab1dad 1445 });
7f69552c
YO
1446
1447 /* Find the node we are going to overwrite */
1448 if ( first == startbranch && OP( last ) != BRANCH ) {
1449 /* whole branch chain */
1450 convert = first;
1451 } else {
1452 /* branch sub-chain */
1453 convert = NEXTOPER( first );
1454 }
1455
a3621e74
YO
1456 /* -- First loop and Setup --
1457
1458 We first traverse the branches and scan each word to determine if it
1459 contains widechars, and how many unique chars there are, this is
1460 important as we have to build a table with at least as many columns as we
1461 have unique chars.
1462
1463 We use an array of integers to represent the character codes 0..255
38a44b82 1464 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1465 native representation of the character value as the key and IV's for the
1466 coded index.
1467
1468 *TODO* If we keep track of how many times each character is used we can
1469 remap the columns so that the table compression later on is more
3b753521 1470 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1471 middle and the least common are on the outside. IMO this would be better
1472 than a most to least common mapping as theres a decent chance the most
1473 common letter will share a node with the least common, meaning the node
486ec47a 1474 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1475 case is when we have the least common nodes twice.
1476
1477 */
1478
a3621e74 1479 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1480 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1481 const U8 *uc = (U8*)STRING( noper );
a28509cc 1482 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1483 STRLEN foldlen = 0;
1484 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1485 const U8 *scan = (U8*)NULL;
07be1b83 1486 U32 wordlen = 0; /* required init */
02daf0ab
YO
1487 STRLEN chars = 0;
1488 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1489
3dab1dad
YO
1490 if (OP(noper) == NOTHING) {
1491 trie->minlen= 0;
1492 continue;
1493 }
02daf0ab
YO
1494 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1495 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1496 regardless of encoding */
1497
a3621e74 1498 for ( ; uc < e ; uc += len ) {
3dab1dad 1499 TRIE_CHARCOUNT(trie)++;
a3621e74 1500 TRIE_READ_CHAR;
3dab1dad 1501 chars++;
a3621e74
YO
1502 if ( uvc < 256 ) {
1503 if ( !trie->charmap[ uvc ] ) {
1504 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1505 if ( folder )
1506 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1507 TRIE_STORE_REVCHAR;
a3621e74 1508 }
02daf0ab 1509 if ( set_bit ) {
62012aee
KW
1510 /* store the codepoint in the bitmap, and its folded
1511 * equivalent. */
02daf0ab 1512 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1513
1514 /* store the folded codepoint */
1515 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1516
1517 if ( !UTF ) {
1518 /* store first byte of utf8 representation of
acdf4139
KW
1519 variant codepoints */
1520 if (! UNI_IS_INVARIANT(uvc)) {
1521 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1522 }
1523 }
02daf0ab
YO
1524 set_bit = 0; /* We've done our bit :-) */
1525 }
a3621e74
YO
1526 } else {
1527 SV** svpp;
55eed653
NC
1528 if ( !widecharmap )
1529 widecharmap = newHV();
a3621e74 1530
55eed653 1531 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1532
1533 if ( !svpp )
e4584336 1534 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1535
1536 if ( !SvTRUE( *svpp ) ) {
1537 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1538 TRIE_STORE_REVCHAR;
a3621e74
YO
1539 }
1540 }
1541 }
3dab1dad
YO
1542 if( cur == first ) {
1543 trie->minlen=chars;
1544 trie->maxlen=chars;
1545 } else if (chars < trie->minlen) {
1546 trie->minlen=chars;
1547 } else if (chars > trie->maxlen) {
1548 trie->maxlen=chars;
1549 }
1550
a3621e74
YO
1551 } /* end first pass */
1552 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1553 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1554 (int)depth * 2 + 2,"",
55eed653 1555 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1556 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1557 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1558 );
a3621e74
YO
1559
1560 /*
1561 We now know what we are dealing with in terms of unique chars and
1562 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1563 representation using a flat table will take. If it's over a reasonable
1564 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1565 conservative but potentially much slower representation using an array
1566 of lists.
1567
1568 At the end we convert both representations into the same compressed
1569 form that will be used in regexec.c for matching with. The latter
1570 is a form that cannot be used to construct with but has memory
1571 properties similar to the list form and access properties similar
1572 to the table form making it both suitable for fast searches and
1573 small enough that its feasable to store for the duration of a program.
1574
1575 See the comment in the code where the compressed table is produced
1576 inplace from the flat tabe representation for an explanation of how
1577 the compression works.
1578
1579 */
1580
1581
2e64971a
DM
1582 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1583 prev_states[1] = 0;
1584
3dab1dad 1585 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1586 /*
1587 Second Pass -- Array Of Lists Representation
1588
1589 Each state will be represented by a list of charid:state records
1590 (reg_trie_trans_le) the first such element holds the CUR and LEN
1591 points of the allocated array. (See defines above).
1592
1593 We build the initial structure using the lists, and then convert
1594 it into the compressed table form which allows faster lookups
1595 (but cant be modified once converted).
a3621e74
YO
1596 */
1597
a3621e74
YO
1598 STRLEN transcount = 1;
1599
1e2e3d02
YO
1600 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1601 "%*sCompiling trie using list compiler\n",
1602 (int)depth * 2 + 2, ""));
446bd890 1603
c944940b
JH
1604 trie->states = (reg_trie_state *)
1605 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1606 sizeof(reg_trie_state) );
a3621e74
YO
1607 TRIE_LIST_NEW(1);
1608 next_alloc = 2;
1609
1610 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1611
c445ea15
AL
1612 regnode * const noper = NEXTOPER( cur );
1613 U8 *uc = (U8*)STRING( noper );
1614 const U8 * const e = uc + STR_LEN( noper );
1615 U32 state = 1; /* required init */
1616 U16 charid = 0; /* sanity init */
1617 U8 *scan = (U8*)NULL; /* sanity init */
1618 STRLEN foldlen = 0; /* required init */
07be1b83 1619 U32 wordlen = 0; /* required init */
c445ea15
AL
1620 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1621
3dab1dad 1622 if (OP(noper) != NOTHING) {
786e8c11 1623 for ( ; uc < e ; uc += len ) {
c445ea15 1624
786e8c11 1625 TRIE_READ_CHAR;
c445ea15 1626
786e8c11
YO
1627 if ( uvc < 256 ) {
1628 charid = trie->charmap[ uvc ];
c445ea15 1629 } else {
55eed653 1630 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1631 if ( !svpp ) {
1632 charid = 0;
1633 } else {
1634 charid=(U16)SvIV( *svpp );
1635 }
c445ea15 1636 }
786e8c11
YO
1637 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1638 if ( charid ) {
a3621e74 1639
786e8c11
YO
1640 U16 check;
1641 U32 newstate = 0;
a3621e74 1642
786e8c11
YO
1643 charid--;
1644 if ( !trie->states[ state ].trans.list ) {
1645 TRIE_LIST_NEW( state );
c445ea15 1646 }
786e8c11
YO
1647 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1648 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1649 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1650 break;
1651 }
1652 }
1653 if ( ! newstate ) {
1654 newstate = next_alloc++;
2e64971a 1655 prev_states[newstate] = state;
786e8c11
YO
1656 TRIE_LIST_PUSH( state, charid, newstate );
1657 transcount++;
1658 }
1659 state = newstate;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1662 }
a28509cc 1663 }
c445ea15 1664 }
3dab1dad 1665 TRIE_HANDLE_WORD(state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
1e2e3d02
YO
1669 /* next alloc is the NEXT state to be allocated */
1670 trie->statecount = next_alloc;
c944940b
JH
1671 trie->states = (reg_trie_state *)
1672 PerlMemShared_realloc( trie->states,
1673 next_alloc
1674 * sizeof(reg_trie_state) );
a3621e74 1675
3dab1dad 1676 /* and now dump it out before we compress it */
2b8b4781
NC
1677 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1678 revcharmap, next_alloc,
1679 depth+1)
1e2e3d02 1680 );
a3621e74 1681
c944940b
JH
1682 trie->trans = (reg_trie_trans *)
1683 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1684 {
1685 U32 state;
a3621e74
YO
1686 U32 tp = 0;
1687 U32 zp = 0;
1688
1689
1690 for( state=1 ; state < next_alloc ; state ++ ) {
1691 U32 base=0;
1692
1693 /*
1694 DEBUG_TRIE_COMPILE_MORE_r(
1695 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1696 );
1697 */
1698
1699 if (trie->states[state].trans.list) {
1700 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1701 U16 maxid=minid;
a28509cc 1702 U16 idx;
a3621e74
YO
1703
1704 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1705 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1706 if ( forid < minid ) {
1707 minid=forid;
1708 } else if ( forid > maxid ) {
1709 maxid=forid;
1710 }
a3621e74
YO
1711 }
1712 if ( transcount < tp + maxid - minid + 1) {
1713 transcount *= 2;
c944940b
JH
1714 trie->trans = (reg_trie_trans *)
1715 PerlMemShared_realloc( trie->trans,
446bd890
NC
1716 transcount
1717 * sizeof(reg_trie_trans) );
a3621e74
YO
1718 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1719 }
1720 base = trie->uniquecharcount + tp - minid;
1721 if ( maxid == minid ) {
1722 U32 set = 0;
1723 for ( ; zp < tp ; zp++ ) {
1724 if ( ! trie->trans[ zp ].next ) {
1725 base = trie->uniquecharcount + zp - minid;
1726 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1727 trie->trans[ zp ].check = state;
1728 set = 1;
1729 break;
1730 }
1731 }
1732 if ( !set ) {
1733 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1734 trie->trans[ tp ].check = state;
1735 tp++;
1736 zp = tp;
1737 }
1738 } else {
1739 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1740 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1741 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1742 trie->trans[ tid ].check = state;
1743 }
1744 tp += ( maxid - minid + 1 );
1745 }
1746 Safefree(trie->states[ state ].trans.list);
1747 }
1748 /*
1749 DEBUG_TRIE_COMPILE_MORE_r(
1750 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1751 );
1752 */
1753 trie->states[ state ].trans.base=base;
1754 }
cc601c31 1755 trie->lasttrans = tp + 1;
a3621e74
YO
1756 }
1757 } else {
1758 /*
1759 Second Pass -- Flat Table Representation.
1760
1761 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1762 We know that we will need Charcount+1 trans at most to store the data
1763 (one row per char at worst case) So we preallocate both structures
1764 assuming worst case.
1765
1766 We then construct the trie using only the .next slots of the entry
1767 structs.
1768
3b753521 1769 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1770 make compression both faster and easier by keeping track of how many non
1771 zero fields are in the node.
1772
1773 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1774 transition.
1775
1776 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1777 number representing the first entry of the node, and state as a
1778 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1779 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1780 are 2 entrys per node. eg:
1781
1782 A B A B
1783 1. 2 4 1. 3 7
1784 2. 0 3 3. 0 5
1785 3. 0 0 5. 0 0
1786 4. 0 0 7. 0 0
1787
1788 The table is internally in the right hand, idx form. However as we also
1789 have to deal with the states array which is indexed by nodenum we have to
1790 use TRIE_NODENUM() to convert.
1791
1792 */
1e2e3d02
YO
1793 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1794 "%*sCompiling trie using table compiler\n",
1795 (int)depth * 2 + 2, ""));
3dab1dad 1796
c944940b
JH
1797 trie->trans = (reg_trie_trans *)
1798 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1799 * trie->uniquecharcount + 1,
1800 sizeof(reg_trie_trans) );
1801 trie->states = (reg_trie_state *)
1802 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1803 sizeof(reg_trie_state) );
a3621e74
YO
1804 next_alloc = trie->uniquecharcount + 1;
1805
3dab1dad 1806
a3621e74
YO
1807 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1808
c445ea15 1809 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1810 const U8 *uc = (U8*)STRING( noper );
1811 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1812
1813 U32 state = 1; /* required init */
1814
1815 U16 charid = 0; /* sanity init */
1816 U32 accept_state = 0; /* sanity init */
1817 U8 *scan = (U8*)NULL; /* sanity init */
1818
1819 STRLEN foldlen = 0; /* required init */
07be1b83 1820 U32 wordlen = 0; /* required init */
a3621e74
YO
1821 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1822
3dab1dad 1823 if ( OP(noper) != NOTHING ) {
786e8c11 1824 for ( ; uc < e ; uc += len ) {
a3621e74 1825
786e8c11 1826 TRIE_READ_CHAR;
a3621e74 1827
786e8c11
YO
1828 if ( uvc < 256 ) {
1829 charid = trie->charmap[ uvc ];
1830 } else {
55eed653 1831 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1832 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1833 }
786e8c11
YO
1834 if ( charid ) {
1835 charid--;
1836 if ( !trie->trans[ state + charid ].next ) {
1837 trie->trans[ state + charid ].next = next_alloc;
1838 trie->trans[ state ].check++;
2e64971a
DM
1839 prev_states[TRIE_NODENUM(next_alloc)]
1840 = TRIE_NODENUM(state);
786e8c11
YO
1841 next_alloc += trie->uniquecharcount;
1842 }
1843 state = trie->trans[ state + charid ].next;
1844 } else {
1845 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1846 }
1847 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1848 }
a3621e74 1849 }
3dab1dad
YO
1850 accept_state = TRIE_NODENUM( state );
1851 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1852
1853 } /* end second pass */
1854
3dab1dad 1855 /* and now dump it out before we compress it */
2b8b4781
NC
1856 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1857 revcharmap,
1858 next_alloc, depth+1));
a3621e74 1859
a3621e74
YO
1860 {
1861 /*
1862 * Inplace compress the table.*
1863
1864 For sparse data sets the table constructed by the trie algorithm will
1865 be mostly 0/FAIL transitions or to put it another way mostly empty.
1866 (Note that leaf nodes will not contain any transitions.)
1867
1868 This algorithm compresses the tables by eliminating most such
1869 transitions, at the cost of a modest bit of extra work during lookup:
1870
1871 - Each states[] entry contains a .base field which indicates the
1872 index in the state[] array wheres its transition data is stored.
1873
3b753521 1874 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1875
1876 - If .base is nonzero then charid is added to it to find an entry in
1877 the trans array.
1878
1879 -If trans[states[state].base+charid].check!=state then the
1880 transition is taken to be a 0/Fail transition. Thus if there are fail
1881 transitions at the front of the node then the .base offset will point
1882 somewhere inside the previous nodes data (or maybe even into a node
1883 even earlier), but the .check field determines if the transition is
1884 valid.
1885
786e8c11 1886 XXX - wrong maybe?
a3621e74 1887 The following process inplace converts the table to the compressed
3b753521 1888 table: We first do not compress the root node 1,and mark all its
a3621e74 1889 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1890 allows us to do a DFA construction from the compressed table later,
1891 and ensures that any .base pointers we calculate later are greater
1892 than 0.
a3621e74
YO
1893
1894 - We set 'pos' to indicate the first entry of the second node.
1895
1896 - We then iterate over the columns of the node, finding the first and
1897 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1898 and set the .check pointers accordingly, and advance pos
1899 appropriately and repreat for the next node. Note that when we copy
1900 the next pointers we have to convert them from the original
1901 NODEIDX form to NODENUM form as the former is not valid post
1902 compression.
1903
1904 - If a node has no transitions used we mark its base as 0 and do not
1905 advance the pos pointer.
1906
1907 - If a node only has one transition we use a second pointer into the
1908 structure to fill in allocated fail transitions from other states.
1909 This pointer is independent of the main pointer and scans forward
1910 looking for null transitions that are allocated to a state. When it
1911 finds one it writes the single transition into the "hole". If the
786e8c11 1912 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1913
1914 - Once compressed we can Renew/realloc the structures to release the
1915 excess space.
1916
1917 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1918 specifically Fig 3.47 and the associated pseudocode.
1919
1920 demq
1921 */
a3b680e6 1922 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1923 U32 state, charid;
a3621e74 1924 U32 pos = 0, zp=0;
1e2e3d02 1925 trie->statecount = laststate;
a3621e74
YO
1926
1927 for ( state = 1 ; state < laststate ; state++ ) {
1928 U8 flag = 0;
a28509cc
AL
1929 const U32 stateidx = TRIE_NODEIDX( state );
1930 const U32 o_used = trie->trans[ stateidx ].check;
1931 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1932 trie->trans[ stateidx ].check = 0;
1933
1934 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1935 if ( flag || trie->trans[ stateidx + charid ].next ) {
1936 if ( trie->trans[ stateidx + charid ].next ) {
1937 if (o_used == 1) {
1938 for ( ; zp < pos ; zp++ ) {
1939 if ( ! trie->trans[ zp ].next ) {
1940 break;
1941 }
1942 }
1943 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1944 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1945 trie->trans[ zp ].check = state;
1946 if ( ++zp > pos ) pos = zp;
1947 break;
1948 }
1949 used--;
1950 }
1951 if ( !flag ) {
1952 flag = 1;
1953 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1954 }
1955 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1956 trie->trans[ pos ].check = state;
1957 pos++;
1958 }
1959 }
1960 }
cc601c31 1961 trie->lasttrans = pos + 1;
c944940b
JH
1962 trie->states = (reg_trie_state *)
1963 PerlMemShared_realloc( trie->states, laststate
1964 * sizeof(reg_trie_state) );
a3621e74 1965 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1966 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1967 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1968 (int)depth * 2 + 2,"",
1969 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1970 (IV)next_alloc,
1971 (IV)pos,
a3621e74
YO
1972 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1973 );
1974
1975 } /* end table compress */
1976 }
1e2e3d02
YO
1977 DEBUG_TRIE_COMPILE_MORE_r(
1978 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1979 (int)depth * 2 + 2, "",
1980 (UV)trie->statecount,
1981 (UV)trie->lasttrans)
1982 );
cc601c31 1983 /* resize the trans array to remove unused space */
c944940b
JH
1984 trie->trans = (reg_trie_trans *)
1985 PerlMemShared_realloc( trie->trans, trie->lasttrans
1986 * sizeof(reg_trie_trans) );
a3621e74 1987
3b753521 1988 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1989 U8 nodetype =(U8)(flags & 0xFF);
1990 char *str=NULL;
786e8c11 1991
07be1b83 1992#ifdef DEBUGGING
e62cc96a 1993 regnode *optimize = NULL;
7122b237
YO
1994#ifdef RE_TRACK_PATTERN_OFFSETS
1995
b57a0404
JH
1996 U32 mjd_offset = 0;
1997 U32 mjd_nodelen = 0;
7122b237
YO
1998#endif /* RE_TRACK_PATTERN_OFFSETS */
1999#endif /* DEBUGGING */
a3621e74 2000 /*
3dab1dad
YO
2001 This means we convert either the first branch or the first Exact,
2002 depending on whether the thing following (in 'last') is a branch
2003 or not and whther first is the startbranch (ie is it a sub part of
2004 the alternation or is it the whole thing.)
3b753521 2005 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2006 the whole branch sequence, including the first.
a3621e74 2007 */
3dab1dad 2008 /* Find the node we are going to overwrite */
7f69552c 2009 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2010 /* branch sub-chain */
3dab1dad 2011 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2012#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2013 DEBUG_r({
2014 mjd_offset= Node_Offset((convert));
2015 mjd_nodelen= Node_Length((convert));
2016 });
7122b237 2017#endif
7f69552c 2018 /* whole branch chain */
7122b237
YO
2019 }
2020#ifdef RE_TRACK_PATTERN_OFFSETS
2021 else {
7f69552c
YO
2022 DEBUG_r({
2023 const regnode *nop = NEXTOPER( convert );
2024 mjd_offset= Node_Offset((nop));
2025 mjd_nodelen= Node_Length((nop));
2026 });
07be1b83
YO
2027 }
2028 DEBUG_OPTIMISE_r(
2029 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2030 (int)depth * 2 + 2, "",
786e8c11 2031 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2032 );
7122b237 2033#endif
3dab1dad
YO
2034 /* But first we check to see if there is a common prefix we can
2035 split out as an EXACT and put in front of the TRIE node. */
2036 trie->startstate= 1;
55eed653 2037 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2038 U32 state;
1e2e3d02 2039 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2040 U32 ofs = 0;
8e11feef
RGS
2041 I32 idx = -1;
2042 U32 count = 0;
2043 const U32 base = trie->states[ state ].trans.base;
a3621e74 2044
3dab1dad 2045 if ( trie->states[state].wordnum )
8e11feef 2046 count = 1;
a3621e74 2047
8e11feef 2048 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2049 if ( ( base + ofs >= trie->uniquecharcount ) &&
2050 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2051 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2052 {
3dab1dad 2053 if ( ++count > 1 ) {
2b8b4781 2054 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2055 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2056 if ( state == 1 ) break;
3dab1dad
YO
2057 if ( count == 2 ) {
2058 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2059 DEBUG_OPTIMISE_r(
8e11feef
RGS
2060 PerlIO_printf(Perl_debug_log,
2061 "%*sNew Start State=%"UVuf" Class: [",
2062 (int)depth * 2 + 2, "",
786e8c11 2063 (UV)state));
be8e71aa 2064 if (idx >= 0) {
2b8b4781 2065 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2066 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2067
3dab1dad 2068 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2069 if ( folder )
2070 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2071 DEBUG_OPTIMISE_r(
f1f66076 2072 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2073 );
8e11feef
RGS
2074 }
2075 }
2076 TRIE_BITMAP_SET(trie,*ch);
2077 if ( folder )
2078 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2079 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2080 }
2081 idx = ofs;
2082 }
3dab1dad
YO
2083 }
2084 if ( count == 1 ) {
2b8b4781 2085 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2086 STRLEN len;
2087 char *ch = SvPV( *tmp, len );
de734bd5
A
2088 DEBUG_OPTIMISE_r({
2089 SV *sv=sv_newmortal();
8e11feef
RGS
2090 PerlIO_printf( Perl_debug_log,
2091 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2092 (int)depth * 2 + 2, "",
de734bd5
A
2093 (UV)state, (UV)idx,
2094 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2095 PL_colors[0], PL_colors[1],
2096 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2097 PERL_PV_ESCAPE_FIRSTCHAR
2098 )
2099 );
2100 });
3dab1dad
YO
2101 if ( state==1 ) {
2102 OP( convert ) = nodetype;
2103 str=STRING(convert);
2104 STR_LEN(convert)=0;
2105 }
c490c714
YO
2106 STR_LEN(convert) += len;
2107 while (len--)
de734bd5 2108 *str++ = *ch++;
8e11feef 2109 } else {
f9049ba1 2110#ifdef DEBUGGING
8e11feef
RGS
2111 if (state>1)
2112 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2113#endif
8e11feef
RGS
2114 break;
2115 }
2116 }
2e64971a 2117 trie->prefixlen = (state-1);
3dab1dad 2118 if (str) {
8e11feef 2119 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2120 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2121 trie->startstate = state;
07be1b83
YO
2122 trie->minlen -= (state - 1);
2123 trie->maxlen -= (state - 1);
33809eae
JH
2124#ifdef DEBUGGING
2125 /* At least the UNICOS C compiler choked on this
2126 * being argument to DEBUG_r(), so let's just have
2127 * it right here. */
2128 if (
2129#ifdef PERL_EXT_RE_BUILD
2130 1
2131#else
2132 DEBUG_r_TEST
2133#endif
2134 ) {
2135 regnode *fix = convert;
2136 U32 word = trie->wordcount;
2137 mjd_nodelen++;
2138 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2139 while( ++fix < n ) {
2140 Set_Node_Offset_Length(fix, 0, 0);
2141 }
2142 while (word--) {
2143 SV ** const tmp = av_fetch( trie_words, word, 0 );
2144 if (tmp) {
2145 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2146 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2147 else
2148 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2149 }
2150 }
2151 }
2152#endif
8e11feef
RGS
2153 if (trie->maxlen) {
2154 convert = n;
2155 } else {
3dab1dad 2156 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2157 DEBUG_r(optimize= n);
3dab1dad
YO
2158 }
2159 }
2160 }
a5ca303d
YO
2161 if (!jumper)
2162 jumper = last;
3dab1dad 2163 if ( trie->maxlen ) {
8e11feef
RGS
2164 NEXT_OFF( convert ) = (U16)(tail - convert);
2165 ARG_SET( convert, data_slot );
786e8c11
YO
2166 /* Store the offset to the first unabsorbed branch in
2167 jump[0], which is otherwise unused by the jump logic.
2168 We use this when dumping a trie and during optimisation. */
2169 if (trie->jump)
7f69552c 2170 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2171
6c48061a
YO
2172 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2173 * and there is a bitmap
2174 * and the first "jump target" node we found leaves enough room
2175 * then convert the TRIE node into a TRIEC node, with the bitmap
2176 * embedded inline in the opcode - this is hypothetically faster.
2177 */
2178 if ( !trie->states[trie->startstate].wordnum
2179 && trie->bitmap
2180 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2181 {
2182 OP( convert ) = TRIEC;
2183 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2184 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2185 trie->bitmap= NULL;
2186 } else
2187 OP( convert ) = TRIE;
a3621e74 2188
3dab1dad
YO
2189 /* store the type in the flags */
2190 convert->flags = nodetype;
a5ca303d
YO
2191 DEBUG_r({
2192 optimize = convert
2193 + NODE_STEP_REGNODE
2194 + regarglen[ OP( convert ) ];
2195 });
2196 /* XXX We really should free up the resource in trie now,
2197 as we won't use them - (which resources?) dmq */
3dab1dad 2198 }
a3621e74 2199 /* needed for dumping*/
e62cc96a 2200 DEBUG_r(if (optimize) {
07be1b83 2201 regnode *opt = convert;
bcdf7404 2202
e62cc96a 2203 while ( ++opt < optimize) {
07be1b83
YO
2204 Set_Node_Offset_Length(opt,0,0);
2205 }
786e8c11
YO
2206 /*
2207 Try to clean up some of the debris left after the
2208 optimisation.
a3621e74 2209 */
786e8c11 2210 while( optimize < jumper ) {
07be1b83 2211 mjd_nodelen += Node_Length((optimize));
a3621e74 2212 OP( optimize ) = OPTIMIZED;
07be1b83 2213 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2214 optimize++;
2215 }
07be1b83 2216 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2217 });
2218 } /* end node insert */
2e64971a
DM
2219
2220 /* Finish populating the prev field of the wordinfo array. Walk back
2221 * from each accept state until we find another accept state, and if
2222 * so, point the first word's .prev field at the second word. If the
2223 * second already has a .prev field set, stop now. This will be the
2224 * case either if we've already processed that word's accept state,
3b753521
FN
2225 * or that state had multiple words, and the overspill words were
2226 * already linked up earlier.
2e64971a
DM
2227 */
2228 {
2229 U16 word;
2230 U32 state;
2231 U16 prev;
2232
2233 for (word=1; word <= trie->wordcount; word++) {
2234 prev = 0;
2235 if (trie->wordinfo[word].prev)
2236 continue;
2237 state = trie->wordinfo[word].accept;
2238 while (state) {
2239 state = prev_states[state];
2240 if (!state)
2241 break;
2242 prev = trie->states[state].wordnum;
2243 if (prev)
2244 break;
2245 }
2246 trie->wordinfo[word].prev = prev;
2247 }
2248 Safefree(prev_states);
2249 }
2250
2251
2252 /* and now dump out the compressed format */
2253 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2254
55eed653 2255 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2256#ifdef DEBUGGING
2257 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2258 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2259#else
2260 SvREFCNT_dec(revcharmap);
07be1b83 2261#endif
786e8c11
YO
2262 return trie->jump
2263 ? MADE_JUMP_TRIE
2264 : trie->startstate>1
2265 ? MADE_EXACT_TRIE
2266 : MADE_TRIE;
2267}
2268
2269STATIC void
2270S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2271{
3b753521 2272/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2273
2274 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2275 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2276 ISBN 0-201-10088-6
2277
2278 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2279 suffix of the current state's 'word' that is also a proper prefix of another word in our
2280 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2281 the DFA not to have to restart after its tried and failed a word at a given point, it
2282 simply continues as though it had been matching the other word in the first place.
2283 Consider
2284 'abcdgu'=~/abcdefg|cdgu/
2285 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2286 fail, which would bring us to the state representing 'd' in the second word where we would
2287 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2288 */
2289 /* add a fail transition */
3251b653
NC
2290 const U32 trie_offset = ARG(source);
2291 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2292 U32 *q;
2293 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2294 const U32 numstates = trie->statecount;
786e8c11
YO
2295 const U32 ubound = trie->lasttrans + ucharcount;
2296 U32 q_read = 0;
2297 U32 q_write = 0;
2298 U32 charid;
2299 U32 base = trie->states[ 1 ].trans.base;
2300 U32 *fail;
2301 reg_ac_data *aho;
2302 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2303 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2304
2305 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2306#ifndef DEBUGGING
2307 PERL_UNUSED_ARG(depth);
2308#endif
2309
2310
2311 ARG_SET( stclass, data_slot );
c944940b 2312 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2313 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2314 aho->trie=trie_offset;
446bd890
NC
2315 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2316 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2317 Newxz( q, numstates, U32);
c944940b 2318 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2319 aho->refcount = 1;
2320 fail = aho->fail;
2321 /* initialize fail[0..1] to be 1 so that we always have
2322 a valid final fail state */
2323 fail[ 0 ] = fail[ 1 ] = 1;
2324
2325 for ( charid = 0; charid < ucharcount ; charid++ ) {
2326 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2327 if ( newstate ) {
2328 q[ q_write ] = newstate;
2329 /* set to point at the root */
2330 fail[ q[ q_write++ ] ]=1;
2331 }
2332 }
2333 while ( q_read < q_write) {
2334 const U32 cur = q[ q_read++ % numstates ];
2335 base = trie->states[ cur ].trans.base;
2336
2337 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2338 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2339 if (ch_state) {
2340 U32 fail_state = cur;
2341 U32 fail_base;
2342 do {
2343 fail_state = fail[ fail_state ];
2344 fail_base = aho->states[ fail_state ].trans.base;
2345 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2346
2347 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2348 fail[ ch_state ] = fail_state;
2349 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2350 {
2351 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2352 }
2353 q[ q_write++ % numstates] = ch_state;
2354 }
2355 }
2356 }
2357 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2358 when we fail in state 1, this allows us to use the
2359 charclass scan to find a valid start char. This is based on the principle
2360 that theres a good chance the string being searched contains lots of stuff
2361 that cant be a start char.
2362 */
2363 fail[ 0 ] = fail[ 1 ] = 0;
2364 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2365 PerlIO_printf(Perl_debug_log,
2366 "%*sStclass Failtable (%"UVuf" states): 0",
2367 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2368 );
786e8c11
YO
2369 for( q_read=1; q_read<numstates; q_read++ ) {
2370 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2371 }
2372 PerlIO_printf(Perl_debug_log, "\n");
2373 });
2374 Safefree(q);
2375 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2376}
2377
786e8c11 2378
a3621e74 2379/*
5d1c421c
JH
2380 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2381 * These need to be revisited when a newer toolchain becomes available.
2382 */
2383#if defined(__sparc64__) && defined(__GNUC__)
2384# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2385# undef SPARC64_GCC_WORKAROUND
2386# define SPARC64_GCC_WORKAROUND 1
2387# endif
2388#endif
2389
07be1b83 2390#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2391 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2392 SV * const mysv=sv_newmortal(); \
2393 regnode *Next = regnext(scan); \
2394 regprop(RExC_rx, mysv, scan); \
7f69552c 2395 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2396 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2397 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2398 }});
07be1b83 2399
1de06328
YO
2400
2401
2402
2403
07be1b83
YO
2404#define JOIN_EXACT(scan,min,flags) \
2405 if (PL_regkind[OP(scan)] == EXACT) \
2406 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2407
be8e71aa 2408STATIC U32
07be1b83
YO
2409S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2410 /* Merge several consecutive EXACTish nodes into one. */
2411 regnode *n = regnext(scan);
2412 U32 stringok = 1;
2413 regnode *next = scan + NODE_SZ_STR(scan);
2414 U32 merged = 0;
2415 U32 stopnow = 0;
2416#ifdef DEBUGGING
2417 regnode *stop = scan;
72f13be8 2418 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2419#else
d47053eb
RGS
2420 PERL_UNUSED_ARG(depth);
2421#endif
7918f24d
NC
2422
2423 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2424#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2425 PERL_UNUSED_ARG(flags);
2426 PERL_UNUSED_ARG(val);
07be1b83 2427#endif
07be1b83
YO
2428 DEBUG_PEEP("join",scan,depth);
2429
2430 /* Skip NOTHING, merge EXACT*. */
2431 while (n &&
2432 ( PL_regkind[OP(n)] == NOTHING ||
2433 (stringok && (OP(n) == OP(scan))))
2434 && NEXT_OFF(n)
2435 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2436
2437 if (OP(n) == TAIL || n > next)
2438 stringok = 0;
2439 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2440 DEBUG_PEEP("skip:",n,depth);
2441 NEXT_OFF(scan) += NEXT_OFF(n);
2442 next = n + NODE_STEP_REGNODE;
2443#ifdef DEBUGGING
2444 if (stringok)
2445 stop = n;
2446#endif
2447 n = regnext(n);
2448 }
2449 else if (stringok) {
786e8c11 2450 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2451 regnode * const nnext = regnext(n);
2452
2453 DEBUG_PEEP("merg",n,depth);
2454
2455 merged++;
2456 if (oldl + STR_LEN(n) > U8_MAX)
2457 break;
2458 NEXT_OFF(scan) += NEXT_OFF(n);
2459 STR_LEN(scan) += STR_LEN(n);
2460 next = n + NODE_SZ_STR(n);
2461 /* Now we can overwrite *n : */
2462 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2463#ifdef DEBUGGING
2464 stop = next - 1;
2465#endif
2466 n = nnext;
2467 if (stopnow) break;
2468 }
2469
d47053eb
RGS
2470#ifdef EXPERIMENTAL_INPLACESCAN
2471 if (flags && !NEXT_OFF(n)) {
2472 DEBUG_PEEP("atch", val, depth);
2473 if (reg_off_by_arg[OP(n)]) {
2474 ARG_SET(n, val - n);
2475 }
2476 else {
2477 NEXT_OFF(n) = val - n;
2478 }
2479 stopnow = 1;
2480 }
07be1b83
YO
2481#endif
2482 }
ced7f090
KW
2483#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2484#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2485#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2486#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2487
2488 if (UTF
2f7f8cb1 2489 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2490 && ( STR_LEN(scan) >= 6 ) )
2491 {
07be1b83
YO
2492 /*
2493 Two problematic code points in Unicode casefolding of EXACT nodes:
2494
2495 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2496 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2497
2498 which casefold to
2499
2500 Unicode UTF-8
2501
2502 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2503 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2504
2505 This means that in case-insensitive matching (or "loose matching",
2506 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2507 length of the above casefolded versions) can match a target string
2508 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2509 This would rather mess up the minimum length computation.
2510
2511 What we'll do is to look for the tail four bytes, and then peek
2512 at the preceding two bytes to see whether we need to decrease
2513 the minimum length by four (six minus two).
2514
2515 Thanks to the design of UTF-8, there cannot be false matches:
2516 A sequence of valid UTF-8 bytes cannot be a subsequence of
2517 another valid sequence of UTF-8 bytes.
2518
2519 */
2520 char * const s0 = STRING(scan), *s, *t;
2521 char * const s1 = s0 + STR_LEN(scan) - 1;
2522 char * const s2 = s1 - 4;
e294cc5d
JH
2523#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2524 const char t0[] = "\xaf\x49\xaf\x42";
2525#else
07be1b83 2526 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2527#endif
07be1b83
YO
2528 const char * const t1 = t0 + 3;
2529
2530 for (s = s0 + 2;
2531 s < s2 && (t = ninstr(s, s1, t0, t1));
2532 s = t + 4) {
e294cc5d
JH
2533#ifdef EBCDIC
2534 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2535 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2536#else
07be1b83
YO
2537 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2538 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2539#endif
07be1b83
YO
2540 *min -= 4;
2541 }
2542 }
2543
2544#ifdef DEBUGGING
2545 /* Allow dumping */
2546 n = scan + NODE_SZ_STR(scan);
2547 while (n <= stop) {
2548 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2549 OP(n) = OPTIMIZED;
2550 NEXT_OFF(n) = 0;
2551 }
2552 n++;
2553 }
2554#endif
2555 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2556 return stopnow;
2557}
2558
486ec47a 2559/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2560 Finds fixed substrings. */
2561
a0288114 2562/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2563 to the position after last scanned or to NULL. */
2564
40d049e4
YO
2565#define INIT_AND_WITHP \
2566 assert(!and_withp); \
2567 Newx(and_withp,1,struct regnode_charclass_class); \
2568 SAVEFREEPV(and_withp)
07be1b83 2569
b515a41d 2570/* this is a chain of data about sub patterns we are processing that
486ec47a 2571 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2572 we can simulate recursion without losing state. */
2573struct scan_frame;
2574typedef struct scan_frame {
2575 regnode *last; /* last node to process in this frame */
2576 regnode *next; /* next node to process when last is reached */
2577 struct scan_frame *prev; /*previous frame*/
2578 I32 stop; /* what stopparen do we use */
2579} scan_frame;
2580
304ee84b
YO
2581
2582#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2583
e1d1eefb
YO
2584#define CASE_SYNST_FNC(nAmE) \
2585case nAmE: \
2586 if (flags & SCF_DO_STCLASS_AND) { \
2587 for (value = 0; value < 256; value++) \
2588 if (!is_ ## nAmE ## _cp(value)) \
2589 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2590 } \
2591 else { \
2592 for (value = 0; value < 256; value++) \
2593 if (is_ ## nAmE ## _cp(value)) \
2594 ANYOF_BITMAP_SET(data->start_class, value); \
2595 } \
2596 break; \
2597case N ## nAmE: \
2598 if (flags & SCF_DO_STCLASS_AND) { \
2599 for (value = 0; value < 256; value++) \
2600 if (is_ ## nAmE ## _cp(value)) \
2601 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2602 } \
2603 else { \
2604 for (value = 0; value < 256; value++) \
2605 if (!is_ ## nAmE ## _cp(value)) \
2606 ANYOF_BITMAP_SET(data->start_class, value); \
2607 } \
2608 break
2609
2610
2611
76e3520e 2612STATIC I32
40d049e4 2613S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2614 I32 *minlenp, I32 *deltap,
40d049e4
YO
2615 regnode *last,
2616 scan_data_t *data,
2617 I32 stopparen,
2618 U8* recursed,
2619 struct regnode_charclass_class *and_withp,
2620 U32 flags, U32 depth)
c277df42
IZ
2621 /* scanp: Start here (read-write). */
2622 /* deltap: Write maxlen-minlen here. */
2623 /* last: Stop before this one. */
40d049e4
YO
2624 /* data: string data about the pattern */
2625 /* stopparen: treat close N as END */
2626 /* recursed: which subroutines have we recursed into */
2627 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2628{
97aff369 2629 dVAR;
c277df42
IZ
2630 I32 min = 0, pars = 0, code;
2631 regnode *scan = *scanp, *next;
2632 I32 delta = 0;
2633 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2634 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2635 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2636 scan_data_t data_fake;
a3621e74 2637 SV *re_trie_maxbuff = NULL;
786e8c11 2638 regnode *first_non_open = scan;
e2e6a0f1 2639 I32 stopmin = I32_MAX;
8aa23a47 2640 scan_frame *frame = NULL;
a3621e74 2641 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2642
7918f24d
NC
2643 PERL_ARGS_ASSERT_STUDY_CHUNK;
2644
13a24bad 2645#ifdef DEBUGGING
40d049e4 2646 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2647#endif
40d049e4 2648
786e8c11 2649 if ( depth == 0 ) {
40d049e4 2650 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2651 first_non_open=regnext(first_non_open);
2652 }
2653
b81d288d 2654
8aa23a47
YO
2655 fake_study_recurse:
2656 while ( scan && OP(scan) != END && scan < last ){
2657 /* Peephole optimizer: */
304ee84b 2658 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2659 DEBUG_PEEP("Peep",scan,depth);
2660 JOIN_EXACT(scan,&min,0);
2661
2662 /* Follow the next-chain of the current node and optimize
2663 away all the NOTHINGs from it. */
2664 if (OP(scan) != CURLYX) {
2665 const int max = (reg_off_by_arg[OP(scan)]
2666 ? I32_MAX
2667 /* I32 may be smaller than U16 on CRAYs! */
2668 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2669 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2670 int noff;
2671 regnode *n = scan;
2672
2673 /* Skip NOTHING and LONGJMP. */
2674 while ((n = regnext(n))
2675 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2676 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2677 && off + noff < max)
2678 off += noff;
2679 if (reg_off_by_arg[OP(scan)])
2680 ARG(scan) = off;
2681 else
2682 NEXT_OFF(scan) = off;
2683 }
a3621e74 2684
c277df42 2685
8aa23a47
YO
2686
2687 /* The principal pseudo-switch. Cannot be a switch, since we
2688 look into several different things. */
2689 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2690 || OP(scan) == IFTHEN) {
2691 next = regnext(scan);
2692 code = OP(scan);
2693 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2694
2695 if (OP(next) == code || code == IFTHEN) {
2696 /* NOTE - There is similar code to this block below for handling
2697 TRIE nodes on a re-study. If you change stuff here check there
2698 too. */
2699 I32 max1 = 0, min1 = I32_MAX, num = 0;
2700 struct regnode_charclass_class accum;
2701 regnode * const startbranch=scan;
2702
2703 if (flags & SCF_DO_SUBSTR)
304ee84b 2704 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2705 if (flags & SCF_DO_STCLASS)
2706 cl_init_zero(pRExC_state, &accum);
2707
2708 while (OP(scan) == code) {
2709 I32 deltanext, minnext, f = 0, fake;
2710 struct regnode_charclass_class this_class;
2711
2712 num++;
2713 data_fake.flags = 0;
2714 if (data) {
2715 data_fake.whilem_c = data->whilem_c;
2716 data_fake.last_closep = data->last_closep;
2717 }
2718 else
2719 data_fake.last_closep = &fake;
58e23c8d
YO
2720
2721 data_fake.pos_delta = delta;
8aa23a47
YO
2722 next = regnext(scan);
2723 scan = NEXTOPER(scan);
2724 if (code != BRANCH)
c277df42 2725 scan = NEXTOPER(scan);
8aa23a47
YO
2726 if (flags & SCF_DO_STCLASS) {
2727 cl_init(pRExC_state, &this_class);
2728 data_fake.start_class = &this_class;
2729 f = SCF_DO_STCLASS_AND;
58e23c8d 2730 }
8aa23a47
YO
2731 if (flags & SCF_WHILEM_VISITED_POS)
2732 f |= SCF_WHILEM_VISITED_POS;
2733
2734 /* we suppose the run is continuous, last=next...*/
2735 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2736 next, &data_fake,
2737 stopparen, recursed, NULL, f,depth+1);
2738 if (min1 > minnext)
2739 min1 = minnext;
2740 if (max1 < minnext + deltanext)
2741 max1 = minnext + deltanext;
2742 if (deltanext == I32_MAX)
2743 is_inf = is_inf_internal = 1;
2744 scan = next;
2745 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2746 pars++;
2747 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2748 if ( stopmin > minnext)
2749 stopmin = min + min1;
2750 flags &= ~SCF_DO_SUBSTR;
2751 if (data)
2752 data->flags |= SCF_SEEN_ACCEPT;
2753 }
2754 if (data) {
2755 if (data_fake.flags & SF_HAS_EVAL)
2756 data->flags |= SF_HAS_EVAL;
2757 data->whilem_c = data_fake.whilem_c;
3dab1dad 2758 }
8aa23a47
YO
2759 if (flags & SCF_DO_STCLASS)
2760 cl_or(pRExC_state, &accum, &this_class);
2761 }
2762 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2763 min1 = 0;
2764 if (flags & SCF_DO_SUBSTR) {
2765 data->pos_min += min1;
2766 data->pos_delta += max1 - min1;
2767 if (max1 != min1 || is_inf)
2768 data->longest = &(data->longest_float);
2769 }
2770 min += min1;
2771 delta += max1 - min1;
2772 if (flags & SCF_DO_STCLASS_OR) {
2773 cl_or(pRExC_state, data->start_class, &accum);
2774 if (min1) {
2775 cl_and(data->start_class, and_withp);
2776 flags &= ~SCF_DO_STCLASS;
653099ff 2777 }
8aa23a47
YO
2778 }
2779 else if (flags & SCF_DO_STCLASS_AND) {
2780 if (min1) {
2781 cl_and(data->start_class, &accum);
2782 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2783 }
8aa23a47
YO
2784 else {
2785 /* Switch to OR mode: cache the old value of
2786 * data->start_class */
2787 INIT_AND_WITHP;
2788 StructCopy(data->start_class, and_withp,
2789 struct regnode_charclass_class);
2790 flags &= ~SCF_DO_STCLASS_AND;
2791 StructCopy(&accum, data->start_class,
2792 struct regnode_charclass_class);
2793 flags |= SCF_DO_STCLASS_OR;
2794 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2795 }
8aa23a47 2796 }
a3621e74 2797
8aa23a47
YO
2798 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2799 /* demq.
a3621e74 2800
8aa23a47
YO
2801 Assuming this was/is a branch we are dealing with: 'scan' now
2802 points at the item that follows the branch sequence, whatever
2803 it is. We now start at the beginning of the sequence and look
2804 for subsequences of
a3621e74 2805
8aa23a47
YO
2806 BRANCH->EXACT=>x1
2807 BRANCH->EXACT=>x2
2808 tail
a3621e74 2809
8aa23a47 2810 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2811
486ec47a 2812 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2813 element into a trie and then add the subsequent branch exact
2814 strings to the trie.
a3621e74 2815
8aa23a47 2816 We have two cases
a3621e74 2817
3b753521 2818 1. patterns where the whole set of branches can be converted.
a3621e74 2819
8aa23a47 2820 2. patterns where only a subset can be converted.
a3621e74 2821
8aa23a47
YO
2822 In case 1 we can replace the whole set with a single regop
2823 for the trie. In case 2 we need to keep the start and end
3b753521 2824 branches so
a3621e74 2825
8aa23a47
YO
2826 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2827 becomes BRANCH TRIE; BRANCH X;
786e8c11 2828
8aa23a47
YO
2829 There is an additional case, that being where there is a
2830 common prefix, which gets split out into an EXACT like node
2831 preceding the TRIE node.
a3621e74 2832
8aa23a47
YO
2833 If x(1..n)==tail then we can do a simple trie, if not we make
2834 a "jump" trie, such that when we match the appropriate word
486ec47a 2835 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2836 a nested if into a case structure of sorts.
b515a41d 2837
8aa23a47
YO
2838 */
2839
2840 int made=0;
2841 if (!re_trie_maxbuff) {
2842 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2843 if (!SvIOK(re_trie_maxbuff))
2844 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2845 }
2846 if ( SvIV(re_trie_maxbuff)>=0 ) {
2847 regnode *cur;
2848 regnode *first = (regnode *)NULL;
2849 regnode *last = (regnode *)NULL;
2850 regnode *tail = scan;
2851 U8 optype = 0;
2852 U32 count=0;
a3621e74
YO
2853
2854#ifdef DEBUGGING
8aa23a47 2855 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2856#endif
8aa23a47
YO
2857 /* var tail is used because there may be a TAIL
2858 regop in the way. Ie, the exacts will point to the
2859 thing following the TAIL, but the last branch will
2860 point at the TAIL. So we advance tail. If we
2861 have nested (?:) we may have to move through several
2862 tails.
2863 */
2864
2865 while ( OP( tail ) == TAIL ) {
2866 /* this is the TAIL generated by (?:) */
2867 tail = regnext( tail );
2868 }
a3621e74 2869
8aa23a47
YO
2870
2871 DEBUG_OPTIMISE_r({
2872 regprop(RExC_rx, mysv, tail );
2873 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2874 (int)depth * 2 + 2, "",
2875 "Looking for TRIE'able sequences. Tail node is: ",
2876 SvPV_nolen_const( mysv )
2877 );
2878 });
2879
2880 /*
2881
2882 step through the branches, cur represents each
2883 branch, noper is the first thing to be matched
2884 as part of that branch and noper_next is the
2885 regnext() of that node. if noper is an EXACT
2886 and noper_next is the same as scan (our current
2887 position in the regex) then the EXACT branch is
2888 a possible optimization target. Once we have
486ec47a 2889 two or more consecutive such branches we can
8aa23a47
YO
2890 create a trie of the EXACT's contents and stich
2891 it in place. If the sequence represents all of
2892 the branches we eliminate the whole thing and
2893 replace it with a single TRIE. If it is a
2894 subsequence then we need to stitch it in. This
2895 means the first branch has to remain, and needs
2896 to be repointed at the item on the branch chain
2897 following the last branch optimized. This could
2898 be either a BRANCH, in which case the
2899 subsequence is internal, or it could be the
2900 item following the branch sequence in which
2901 case the subsequence is at the end.
2902
2903 */
2904
2905 /* dont use tail as the end marker for this traverse */
2906 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2907 regnode * const noper = NEXTOPER( cur );
b515a41d 2908#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2909 regnode * const noper_next = regnext( noper );
b515a41d
YO
2910#endif
2911
8aa23a47
YO
2912 DEBUG_OPTIMISE_r({
2913 regprop(RExC_rx, mysv, cur);
2914 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2915 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2916
2917 regprop(RExC_rx, mysv, noper);
2918 PerlIO_printf( Perl_debug_log, " -> %s",
2919 SvPV_nolen_const(mysv));
2920
2921 if ( noper_next ) {
2922 regprop(RExC_rx, mysv, noper_next );
2923 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2924 SvPV_nolen_const(mysv));
2925 }
2926 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2927 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2928 });
2929 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2930 : PL_regkind[ OP( noper ) ] == EXACT )
2931 || OP(noper) == NOTHING )
786e8c11 2932#ifdef NOJUMPTRIE
8aa23a47 2933 && noper_next == tail
786e8c11 2934#endif
8aa23a47
YO
2935 && count < U16_MAX)
2936 {
2937 count++;
2938 if ( !first || optype == NOTHING ) {
2939 if (!first) first = cur;
2940 optype = OP( noper );
2941 } else {
2942 last = cur;
2943 }
2944 } else {
a0a388a1 2945/*
0abd0d78
YO
2946 Currently we do not believe that the trie logic can
2947 handle case insensitive matching properly when the
2948 pattern is not unicode (thus forcing unicode semantics).
2949
2950 If/when this is fixed the following define can be swapped
2951 in below to fully enable trie logic.
2952
a0a388a1 2953#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2954
2955*/
2956#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2957
a0a388a1 2958 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2959 make_trie( pRExC_state,
2960 startbranch, first, cur, tail, count,
2961 optype, depth+1 );
2962 }
2963 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2964#ifdef NOJUMPTRIE
8aa23a47 2965 && noper_next == tail
786e8c11 2966#endif
8aa23a47
YO
2967 ){
2968 count = 1;
2969 first = cur;
2970 optype = OP( noper );
2971 } else {
2972 count = 0;
2973 first = NULL;
2974 optype = 0;
2975 }
2976 last = NULL;
2977 }
2978 }
2979 DEBUG_OPTIMISE_r({
2980 regprop(RExC_rx, mysv, cur);
2981 PerlIO_printf( Perl_debug_log,
2982 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2983 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2984
2985 });
a0a388a1
YO
2986
2987 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2988 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2989#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2990 if ( ((made == MADE_EXACT_TRIE &&
2991 startbranch == first)
2992 || ( first_non_open == first )) &&
2993 depth==0 ) {
2994 flags |= SCF_TRIE_RESTUDY;
2995 if ( startbranch == first
2996 && scan == tail )
2997 {
2998 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2999 }
3000 }
3dab1dad 3001#endif
8aa23a47
YO
3002 }
3003 }
3004
3005 } /* do trie */
3006
653099ff 3007 }
8aa23a47
YO
3008 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3009 scan = NEXTOPER(NEXTOPER(scan));
3010 } else /* single branch is optimized. */
3011 scan = NEXTOPER(scan);
3012 continue;
3013 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3014 scan_frame *newframe = NULL;
3015 I32 paren;
3016 regnode *start;
3017 regnode *end;
3018
3019 if (OP(scan) != SUSPEND) {
3020 /* set the pointer */
3021 if (OP(scan) == GOSUB) {
3022 paren = ARG(scan);
3023 RExC_recurse[ARG2L(scan)] = scan;
3024 start = RExC_open_parens[paren-1];
3025 end = RExC_close_parens[paren-1];
3026 } else {
3027 paren = 0;
f8fc2ecf 3028 start = RExC_rxi->program + 1;
8aa23a47
YO
3029 end = RExC_opend;
3030 }
3031 if (!recursed) {
3032 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3033 SAVEFREEPV(recursed);
3034 }
3035 if (!PAREN_TEST(recursed,paren+1)) {
3036 PAREN_SET(recursed,paren+1);
3037 Newx(newframe,1,scan_frame);
3038 } else {
3039 if (flags & SCF_DO_SUBSTR) {
304ee84b 3040 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3041 data->longest = &(data->longest_float);
3042 }
3043 is_inf = is_inf_internal = 1;
3044 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3045 cl_anything(pRExC_state, data->start_class);
3046 flags &= ~SCF_DO_STCLASS;
3047 }
3048 } else {
3049 Newx(newframe,1,scan_frame);
3050 paren = stopparen;
3051 start = scan+2;
3052 end = regnext(scan);
3053 }
3054 if (newframe) {
3055 assert(start);
3056 assert(end);
3057 SAVEFREEPV(newframe);
3058 newframe->next = regnext(scan);
3059 newframe->last = last;
3060 newframe->stop = stopparen;
3061 newframe->prev = frame;
3062
3063 frame = newframe;
3064 scan = start;
3065 stopparen = paren;
3066 last = end;
3067
3068 continue;
3069 }
3070 }
3071 else if (OP(scan) == EXACT) {
3072 I32 l = STR_LEN(scan);
3073 UV uc;
3074 if (UTF) {
3075 const U8 * const s = (U8*)STRING(scan);
3076 l = utf8_length(s, s + l);
3077 uc = utf8_to_uvchr(s, NULL);
3078 } else {
3079 uc = *((U8*)STRING(scan));
3080 }
3081 min += l;
3082 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3083 /* The code below prefers earlier match for fixed
3084 offset, later match for variable offset. */
3085 if (data->last_end == -1) { /* Update the start info. */
3086 data->last_start_min = data->pos_min;
3087 data->last_start_max = is_inf
3088 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3089 }
8aa23a47
YO
3090 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3091 if (UTF)
3092 SvUTF8_on(data->last_found);
3093 {
3094 SV * const sv = data->last_found;
3095 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3096 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3097 if (mg && mg->mg_len >= 0)
3098 mg->mg_len += utf8_length((U8*)STRING(scan),
3099 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3100 }
8aa23a47
YO
3101 data->last_end = data->pos_min + l;
3102 data->pos_min += l; /* As in the first entry. */
3103 data->flags &= ~SF_BEFORE_EOL;
3104 }
3105 if (flags & SCF_DO_STCLASS_AND) {
3106 /* Check whether it is compatible with what we know already! */
3107 int compat = 1;
3108
54251c2e 3109
486ec47a 3110 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3111 * in the bitmp and either 1) its bit or its fold is set, or 2)
3112 * it's for a locale. Even if there isn't unicode semantics
3113 * here, at runtime there may be because of matching against a
3114 * utf8 string, so accept a possible false positive for
3115 * latin1-range folds */
8aa23a47
YO
3116 if (uc >= 0x100 ||
3117 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3118 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3119 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3120 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3121 )
d18bf9dc 3122 {
8aa23a47 3123 compat = 0;
d18bf9dc 3124 }
8aa23a47
YO
3125 ANYOF_CLASS_ZERO(data->start_class);
3126 ANYOF_BITMAP_ZERO(data->start_class);
3127 if (compat)
3128 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3129 else if (uc >= 0x100) {
3130 int i;
3131
3132 /* Some Unicode code points fold to the Latin1 range; as
3133 * XXX temporary code, instead of figuring out if this is
3134 * one, just assume it is and set all the start class bits
3135 * that could be some such above 255 code point's fold
3136 * which will generate fals positives. As the code
3137 * elsewhere that does compute the fold settles down, it
3138 * can be extracted out and re-used here */
3139 for (i = 0; i < 256; i++){
3140 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3141 ANYOF_BITMAP_SET(data->start_class, i);
3142 }
3143 }
3144 }
8aa23a47
YO
3145 data->start_class->flags &= ~ANYOF_EOS;
3146 if (uc < 0x100)
3147 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3148 }
3149 else if (flags & SCF_DO_STCLASS_OR) {
3150 /* false positive possible if the class is case-folded */
3151 if (uc < 0x100)
3152 ANYOF_BITMAP_SET(data->start_class, uc);
3153 else
3154 data->start_class->flags |= ANYOF_UNICODE_ALL;
3155 data->start_class->flags &= ~ANYOF_EOS;
3156 cl_and(data->start_class, and_withp);
3157 }
3158 flags &= ~SCF_DO_STCLASS;
3159 }
3160 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3161 I32 l = STR_LEN(scan);
3162 UV uc = *((U8*)STRING(scan));
3163
3164 /* Search for fixed substrings supports EXACT only. */
3165 if (flags & SCF_DO_SUBSTR) {
3166 assert(data);
304ee84b 3167 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3168 }
3169 if (UTF) {
3170 const U8 * const s = (U8 *)STRING(scan);
3171 l = utf8_length(s, s + l);
3172 uc = utf8_to_uvchr(s, NULL);
3173 }
3174 min += l;
3175 if (flags & SCF_DO_SUBSTR)
3176 data->pos_min += l;
3177 if (flags & SCF_DO_STCLASS_AND) {
3178 /* Check whether it is compatible with what we know already! */
3179 int compat = 1;
8aa23a47 3180 if (uc >= 0x100 ||
54251c2e
KW
3181 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3182 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3183 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3184 {
8aa23a47 3185 compat = 0;
54251c2e 3186 }
8aa23a47
YO
3187 ANYOF_CLASS_ZERO(data->start_class);
3188 ANYOF_BITMAP_ZERO(data->start_class);
3189 if (compat) {
3190 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3191 data->start_class->flags &= ~ANYOF_EOS;
39065660 3192 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3193 if (OP(scan) == EXACTFL) {
8aa23a47 3194 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3195 }
3196 else {
3197
54251c2e
KW
3198 /* Also set the other member of the fold pair. In case
3199 * that unicode semantics is called for at runtime, use
3200 * the full latin1 fold. (Can't do this for locale,
3201 * because not known until runtime */
3202 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3203 }
653099ff 3204 }
d18bf9dc
KW
3205 else if (uc >= 0x100) {
3206 int i;
3207 for (i = 0; i < 256; i++){
3208 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3209 ANYOF_BITMAP_SET(data->start_class, i);
3210 }
3211 }
3212 }
8aa23a47
YO
3213 }
3214 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3215 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3216 /* false positive possible if the class is case-folded.
3217 Assume that the locale settings are the same... */
970c8436 3218 if (uc < 0x100) {
1aa99e6b 3219 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3220 if (OP(scan) != EXACTFL) {
3221
3222 /* And set the other member of the fold pair, but
3223 * can't do that in locale because not known until
3224 * run-time */
3225 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3226 PL_fold_latin1[uc]);
970c8436
KW
3227 }
3228 }
653099ff
GS
3229 data->start_class->flags &= ~ANYOF_EOS;
3230 }
8aa23a47 3231 cl_and(data->start_class, and_withp);
653099ff 3232 }
8aa23a47
YO
3233 flags &= ~SCF_DO_STCLASS;
3234 }
e52fc539 3235 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3236 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3237 I32 f = flags, pos_before = 0;
3238 regnode * const oscan = scan;
3239 struct regnode_charclass_class this_class;
3240 struct regnode_charclass_class *oclass = NULL;
3241 I32 next_is_eval = 0;
3242
3243 switch (PL_regkind[OP(scan)]) {
3244 case WHILEM: /* End of (?:...)* . */
3245 scan = NEXTOPER(scan);
3246 goto finish;
3247 case PLUS:
3248 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3249 next = NEXTOPER(scan);
3250 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3251 mincount = 1;
3252 maxcount = REG_INFTY;
3253 next = regnext(scan);
3254 scan = NEXTOPER(scan);
3255 goto do_curly;
3256 }
3257 }
3258 if (flags & SCF_DO_SUBSTR)
3259 data->pos_min++;
3260 min++;
3261 /* Fall through. */
3262 case STAR:
3263 if (flags & SCF_DO_STCLASS) {
3264 mincount = 0;
3265 maxcount = REG_INFTY;
3266 next = regnext(scan);
3267 scan = NEXTOPER(scan);
3268 goto do_curly;
3269 }
3270 is_inf = is_inf_internal = 1;
3271 scan = regnext(scan);
c277df42 3272 if (flags & SCF_DO_SUBSTR) {
304ee84b 3273 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3274 data->longest = &(data->longest_float);
c277df42 3275 }
8aa23a47
YO
3276 goto optimize_curly_tail;
3277 case CURLY:
3278 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3279 && (scan->flags == stopparen))
3280 {
3281 mincount = 1;
3282 maxcount = 1;
3283 } else {
3284 mincount = ARG1(scan);
3285 maxcount = ARG2(scan);
653099ff 3286 }
8aa23a47
YO
3287 next = regnext(scan);
3288 if (OP(scan) == CURLYX) {
3289 I32 lp = (data ? *(data->last_closep) : 0);
3290 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3291 }
8aa23a47
YO
3292 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3293 next_is_eval = (OP(scan) == EVAL);
3294 do_curly:
3295 if (flags & SCF_DO_SUBSTR) {
304ee84b 3296 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3297 pos_before = data->pos_min;
b45f050a 3298 }
8aa23a47
YO
3299 if (data) {
3300 fl = data->flags;
3301 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3302 if (is_inf)
3303 data->flags |= SF_IS_INF;
3304 }
3305 if (flags & SCF_DO_STCLASS) {
3306 cl_init(pRExC_state, &this_class);
3307 oclass = data->start_class;
3308 data->start_class = &this_class;
3309 f |= SCF_DO_STCLASS_AND;
3310 f &= ~SCF_DO_STCLASS_OR;
3311 }
779bcb7d
NC
3312 /* Exclude from super-linear cache processing any {n,m}
3313 regops for which the combination of input pos and regex
3314 pos is not enough information to determine if a match
3315 will be possible.
3316
3317 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3318 regex pos at the \s*, the prospects for a match depend not
3319 only on the input position but also on how many (bar\s*)
3320 repeats into the {4,8} we are. */
3321 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3322 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3323
8aa23a47
YO
3324 /* This will finish on WHILEM, setting scan, or on NULL: */
3325 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3326 last, data, stopparen, recursed, NULL,
3327 (mincount == 0
3328 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3329
8aa23a47
YO
3330 if (flags & SCF_DO_STCLASS)
3331 data->start_class = oclass;
3332 if (mincount == 0 || minnext == 0) {
3333 if (flags & SCF_DO_STCLASS_OR) {
3334 cl_or(pRExC_state, data->start_class, &this_class);
3335 }
3336 else if (flags & SCF_DO_STCLASS_AND) {
3337 /* Switch to OR mode: cache the old value of
3338 * data->start_class */
3339 INIT_AND_WITHP;
3340 StructCopy(data->start_class, and_withp,
3341 struct regnode_charclass_class);
3342 flags &= ~SCF_DO_STCLASS_AND;
3343 StructCopy(&this_class, data->start_class,
3344 struct regnode_charclass_class);
3345 flags |= SCF_DO_STCLASS_OR;
3346 data->start_class->flags |= ANYOF_EOS;
3347 }
3348 } else { /* Non-zero len */
3349 if (flags & SCF_DO_STCLASS_OR) {
3350 cl_or(pRExC_state, data->start_class, &this_class);
3351 cl_and(data->start_class, and_withp);
3352 }
3353 else if (flags & SCF_DO_STCLASS_AND)
3354 cl_and(data->start_class, &this_class);
3355 flags &= ~SCF_DO_STCLASS;
3356 }
3357 if (!scan) /* It was not CURLYX, but CURLY. */
3358 scan = next;
3359 if ( /* ? quantifier ok, except for (?{ ... }) */
3360 (next_is_eval || !(mincount == 0 && maxcount == 1))
3361 && (minnext == 0) && (deltanext == 0)
3362 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3363 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3364 {
668c081a
NC
3365 ckWARNreg(RExC_parse,
3366 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3367 }
3368
3369 min += minnext * mincount;
3370 is_inf_internal |= ((maxcount == REG_INFTY
3371 && (minnext + deltanext) > 0)
3372 || deltanext == I32_MAX);
3373 is_inf |= is_inf_internal;
3374 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3375
3376 /* Try powerful optimization CURLYX => CURLYN. */
3377 if ( OP(oscan) == CURLYX && data
3378 && data->flags & SF_IN_PAR
3379 && !(data->flags & SF_HAS_EVAL)
3380 && !deltanext && minnext == 1 ) {
3381 /* Try to optimize to CURLYN. */
3382 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3383 regnode * const nxt1 = nxt;
497b47a8 3384#ifdef DEBUGGING
8aa23a47 3385 regnode *nxt2;
497b47a8 3386#endif
c277df42 3387
8aa23a47
YO
3388 /* Skip open. */
3389 nxt = regnext(nxt);
e52fc539 3390 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3391 && !(PL_regkind[OP(nxt)] == EXACT
3392 && STR_LEN(nxt) == 1))
3393 goto nogo;
497b47a8 3394#ifdef DEBUGGING
8aa23a47 3395 nxt2 = nxt;
497b47a8 3396#endif
8aa23a47
YO
3397 nxt = regnext(nxt);
3398 if (OP(nxt) != CLOSE)
3399 goto nogo;
3400 if (RExC_open_parens) {
3401 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3402 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3403 }
3404 /* Now we know that nxt2 is the only contents: */
3405 oscan->flags = (U8)ARG(nxt);
3406 OP(oscan) = CURLYN;
3407 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3408
c277df42 3409#ifdef DEBUGGING
8aa23a47 3410 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3411 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3412 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3413 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3414 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3415 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3416#endif
8aa23a47
YO
3417 }
3418 nogo:
3419
3420 /* Try optimization CURLYX => CURLYM. */
3421 if ( OP(oscan) == CURLYX && data
3422 && !(data->flags & SF_HAS_PAR)
3423 && !(data->flags & SF_HAS_EVAL)
3424 && !deltanext /* atom is fixed width */
3425 && minnext != 0 /* CURLYM can't handle zero width */
3426 ) {
3427 /* XXXX How to optimize if data == 0? */
3428 /* Optimize to a simpler form. */
3429 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3430 regnode *nxt2;
3431
3432 OP(oscan) = CURLYM;
3433 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3434 && (OP(nxt2) != WHILEM))
3435 nxt = nxt2;
3436 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3437 /* Need to optimize away parenths. */
b3c0965f 3438 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3439 /* Set the parenth number. */
3440 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3441
8aa23a47
YO
3442 oscan->flags = (U8)ARG(nxt);
3443 if (RExC_open_parens) {
3444 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3445 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3446 }
8aa23a47
YO
3447 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3448 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3449
c277df42 3450#ifdef DEBUGGING
8aa23a47
YO
3451 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3452 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3453 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3454 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3455#endif
c277df42 3456#if 0
8aa23a47
YO
3457 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3458 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3459 if (nnxt == nxt) {
3460 if (reg_off_by_arg[OP(nxt1)])
3461 ARG_SET(nxt1, nxt2 - nxt1);
3462 else if (nxt2 - nxt1 < U16_MAX)
3463 NEXT_OFF(nxt1) = nxt2 - nxt1;
3464 else
3465 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3466 }
8aa23a47 3467 nxt1 = nnxt;
c277df42 3468 }
5d1c421c 3469#endif
8aa23a47
YO
3470 /* Optimize again: */
3471 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3472 NULL, stopparen, recursed, NULL, 0,depth+1);
3473 }
3474 else
3475 oscan->flags = 0;
3476 }
3477 else if ((OP(oscan) == CURLYX)
3478 && (flags & SCF_WHILEM_VISITED_POS)
3479 /* See the comment on a similar expression above.
3b753521 3480 However, this time it's not a subexpression
8aa23a47
YO
3481 we care about, but the expression itself. */
3482 && (maxcount == REG_INFTY)
3483 && data && ++data->whilem_c < 16) {
3484 /* This stays as CURLYX, we can put the count/of pair. */
3485 /* Find WHILEM (as in regexec.c) */
3486 regnode *nxt = oscan + NEXT_OFF(oscan);
3487
3488 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3489 nxt += ARG(nxt);
3490 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3491 | (RExC_whilem_seen << 4)); /* On WHILEM */
3492 }
3493 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3494 pars++;
3495 if (flags & SCF_DO_SUBSTR) {
3496 SV *last_str = NULL;
3497 int counted = mincount != 0;
a0ed51b3 3498
8aa23a47
YO
3499 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3500#if defined(SPARC64_GCC_WORKAROUND)
3501 I32 b = 0;
3502 STRLEN l = 0;
3503 const char *s = NULL;
3504 I32 old = 0;
b515a41d 3505
8aa23a47
YO
3506 if (pos_before >= data->last_start_min)
3507 b = pos_before;
3508 else
3509 b = data->last_start_min;
b515a41d 3510
8aa23a47
YO
3511 l = 0;
3512 s = SvPV_const(data->last_found, l);
3513 old = b - data->last_start_min;
3514
3515#else
3516 I32 b = pos_before >= data->last_start_min
3517 ? pos_before : data->last_start_min;
3518 STRLEN l;
3519 const char * const s = SvPV_const(data->last_found, l);
3520 I32 old = b - data->last_start_min;
3521#endif
3522
3523 if (UTF)
3524 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3525 l -= old;
3526 /* Get the added string: */
740cce10 3527 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3528 if (deltanext == 0 && pos_before == b) {
3529 /* What was added is a constant string */
3530 if (mincount > 1) {
3531 SvGROW(last_str, (mincount * l) + 1);
3532 repeatcpy(SvPVX(last_str) + l,
3533 SvPVX_const(last_str), l, mincount - 1);
3534 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3535 /* Add additional parts. */
3536 SvCUR_set(data->last_found,
3537 SvCUR(data->last_found) - l);
3538 sv_catsv(data->last_found, last_str);
3539 {
3540 SV * sv = data->last_found;
3541 MAGIC *mg =
3542 SvUTF8(sv) && SvMAGICAL(sv) ?
3543 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3544 if (mg && mg->mg_len >= 0)
bd94e887 3545 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3546 }
8aa23a47 3547 data->last_end += l * (mincount - 1);
b515a41d 3548 }
8aa23a47
YO
3549 } else {
3550 /* start offset must point into the last copy */
3551 data->last_start_min += minnext * (mincount - 1);
3552 data->last_start_max += is_inf ? I32_MAX
3553 : (maxcount - 1) * (minnext + data->pos_delta);
3554 }
c277df42 3555 }
8aa23a47
YO
3556 /* It is counted once already... */
3557 data->pos_min += minnext * (mincount - counted);
3558 data->pos_delta += - counted * deltanext +
3559 (minnext + deltanext) * maxcount - minnext * mincount;
3560 if (mincount != maxcount) {
3561 /* Cannot extend fixed substrings found inside
3562 the group. */
304ee84b 3563 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3564 if (mincount && last_str) {
3565 SV * const sv = data->last_found;
3566 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3567 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3568
3569 if (mg)
3570 mg->mg_len = -1;
3571 sv_setsv(sv, last_str);
3572 data->last_end = data->pos_min;
3573 data->last_start_min =
3574 data->pos_min - CHR_SVLEN(last_str);
3575 data->last_start_max = is_inf
3576 ? I32_MAX
3577 : data->pos_min + data->pos_delta
3578 - CHR_SVLEN(last_str);
3579 }
3580 data->longest = &(data->longest_float);
3581 }
3582 SvREFCNT_dec(last_str);
c277df42 3583 }
8aa23a47
YO
3584 if (data && (fl & SF_HAS_EVAL))
3585 data->flags |= SF_HAS_EVAL;
3586 optimize_curly_tail:
3587 if (OP(oscan) != CURLYX) {
3588 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3589 && NEXT_OFF(next))
3590 NEXT_OFF(oscan) += NEXT_OFF(next);
3591 }
3592 continue;
f56b6394 3593 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3594 if (flags & SCF_DO_SUBSTR) {
304ee84b 3595 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3596 data->longest = &(data->longest_float);
3597 }
3598 is_inf = is_inf_internal = 1;
3599 if (flags & SCF_DO_STCLASS_OR)
3600 cl_anything(pRExC_state, data->start_class);
3601 flags &= ~SCF_DO_STCLASS;
3602 break;
c277df42 3603 }
8aa23a47 3604 }
e1d1eefb
YO
3605 else if (OP(scan) == LNBREAK) {
3606 if (flags & SCF_DO_STCLASS) {
3607 int value = 0;
3608 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3609 if (flags & SCF_DO_STCLASS_AND) {
3610 for (value = 0; value < 256; value++)
e64b1bd1 3611 if (!is_VERTWS_cp(value))
b9a59e08
KW
3612 ANYOF_BITMAP_CLEAR(data->start_class, value);
3613 }
3614 else {
e1d1eefb 3615 for (value = 0; value < 256; value++)
e64b1bd1 3616 if (is_VERTWS_cp(value))
b9a59e08
KW
3617 ANYOF_BITMAP_SET(data->start_class, value);
3618 }
e1d1eefb
YO
3619 if (flags & SCF_DO_STCLASS_OR)
3620 cl_and(data->start_class, and_withp);
3621 flags &= ~SCF_DO_STCLASS;
3622 }
3623 min += 1;
f9a79580 3624 delta += 1;
e1d1eefb
YO
3625 if (flags & SCF_DO_SUBSTR) {
3626 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3627 data->pos_min += 1;
f9a79580 3628 data->pos_delta += 1;
e1d1eefb
YO
3629 data->longest = &(data->longest_float);
3630 }
e1d1eefb 3631 }
f9a79580 3632 else if (OP(scan) == FOLDCHAR) {
ced7f090 3633 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3634 flags &= ~SCF_DO_STCLASS;
3635 min += 1;
3636 delta += d;
3637 if (flags & SCF_DO_SUBSTR) {
3638 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3639 data->pos_min += 1;
3640 data->pos_delta += d;
3641 data->longest = &(data->longest_float);
3642 }
3643 }
e52fc539 3644 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3645 int value = 0;
653099ff 3646
8aa23a47 3647 if (flags & SCF_DO_SUBSTR) {
304ee84b 3648 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3649 data->pos_min++;
3650 }
3651 min++;
3652 if (flags & SCF_DO_STCLASS) {
3653 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3654
8aa23a47
YO
3655 /* Some of the logic below assumes that switching
3656 locale on will only add false positives. */
3657 switch (PL_regkind[OP(scan)]) {
3658 case SANY:
3659 default:
3660 do_default:
3661 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3662 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3663 cl_anything(pRExC_state, data->start_class);
3664 break;
3665 case REG_ANY:
3666 if (OP(scan) == SANY)
3667 goto do_default;
3668 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3669 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3670 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3671 cl_anything(pRExC_state, data->start_class);
653099ff 3672 }
8aa23a47
YO
3673 if (flags & SCF_DO_STCLASS_AND || !value)
3674 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3675 break;
3676 case ANYOF:
3677 if (flags & SCF_DO_STCLASS_AND)
3678 cl_and(data->start_class,
3679 (struct regnode_charclass_class*)scan);
653099ff 3680 else
8aa23a47
YO
3681 cl_or(pRExC_state, data->start_class,
3682 (struct regnode_charclass_class*)scan);
3683 break;
3684 case ALNUM:
3685 if (flags & SCF_DO_STCLASS_AND) {
3686 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3687 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3688 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3689 for (value = 0; value < 256; value++) {
3690 if (!isWORDCHAR_L1(value)) {
3691 ANYOF_BITMAP_CLEAR(data->start_class, value);
3692 }
3693 }
3694 } else {
3695 for (value = 0; value < 256; value++) {
3696 if (!isALNUM(value)) {
3697 ANYOF_BITMAP_CLEAR(data->start_class, value);
3698 }
3699 }
3700 }
8aa23a47 3701 }
653099ff 3702 }
8aa23a47
YO
3703 else {
3704 if (data->start_class->flags & ANYOF_LOCALE)
3705 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
980866de 3706 else if (OP(scan) == ALNUMU) {
a12cf05f
KW
3707 for (value = 0; value < 256; value++) {
3708 if (isWORDCHAR_L1(value)) {
3709 ANYOF_BITMAP_SET(data->start_class, value);
3710 }
3711 }
3712 } else {
3713 for (value = 0; value < 256; value++) {
3714 if (isALNUM(value)) {
3715 ANYOF_BITMAP_SET(data->start_class, value);
3716 }
3717 }
3718 }
8aa23a47
YO
3719 }
3720 break;
8aa23a47
YO
3721 case NALNUM:
3722 if (flags & SCF_DO_STCLASS_AND) {
3723 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3724 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 3725 if (OP(scan) == NALNUMU) {
a12cf05f
KW
3726 for (value = 0; value < 256; value++) {
3727 if (isWORDCHAR_L1(value)) {
3728 ANYOF_BITMAP_CLEAR(data->start_class, value);
3729 }
3730 }
3731 } else {
3732 for (value = 0; value < 256; value++) {
3733 if (isALNUM(value)) {
3734 ANYOF_BITMAP_CLEAR(data->start_class, value);
3735 }
3736 }
3737 }
653099ff
GS
3738 }
3739 }
8aa23a47
YO
3740 else {
3741 if (data->start_class->flags & ANYOF_LOCALE)
3742 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3743 else {
980866de 3744 if (OP(scan) == NALNUMU) {
e9a9c1bc
KW
3745 for (value = 0; value < 256; value++) {
3746 if (! isWORDCHAR_L1(value)) {
3747 ANYOF_BITMAP_SET(data->start_class, value);
3748 }
3749 }
3750 } else {
3751 for (value = 0; value < 256; value++) {
3752 if (! isALNUM(value)) {
3753 ANYOF_BITMAP_SET(data->start_class, value);
3754 }
3755 }
3756 }
8aa23a47 3757 }
653099ff 3758 }
8aa23a47 3759 break;
8aa23a47
YO
3760 case SPACE:
3761 if (flags & SCF_DO_STCLASS_AND) {
3762 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3763 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 3764 if (OP(scan) == SPACEU) {
a12cf05f
KW
3765 for (value = 0; value < 256; value++) {
3766 if (!isSPACE_L1(value)) {
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3768 }
3769 }
3770 } else {
3771 for (value = 0; value < 256; value++) {
3772 if (!isSPACE(value)) {
3773 ANYOF_BITMAP_CLEAR(data->start_class, value);
3774 }
3775 }
3776 }
653099ff
GS
3777 }
3778 }
8aa23a47 3779 else {
a12cf05f 3780 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3781 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 3782 }
980866de 3783 else if (OP(scan) == SPACEU) {
a12cf05f
KW
3784 for (value = 0; value < 256; value++) {
3785 if (isSPACE_L1(value)) {
3786 ANYOF_BITMAP_SET(data->start_class, value);
3787 }
3788 }
3789 } else {
3790 for (value = 0; value < 256; value++) {
3791 if (isSPACE(value)) {
3792 ANYOF_BITMAP_SET(data->start_class, value);
3793 }
3794 }
8aa23a47 3795 }
653099ff 3796 }
8aa23a47 3797 break;
8aa23a47
YO
3798 case NSPACE:
3799 if (flags & SCF_DO_STCLASS_AND) {
3800 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3801 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 3802 if (OP(scan) == NSPACEU) {
a12cf05f
KW
3803 for (value = 0; value < 256; value++) {
3804 if (isSPACE_L1(value)) {
3805 ANYOF_BITMAP_CLEAR(data->start_class, value);
3806 }
3807 }
3808 } else {
3809 for (value = 0; value < 256; value++) {
3810 if (isSPACE(value)) {
3811 ANYOF_BITMAP_CLEAR(data->start_class, value);
3812 }
3813 }
3814 }
653099ff 3815 }
8aa23a47
YO
3816 }
3817 else {
3818 if (data->start_class->flags & ANYOF_LOCALE)
3819 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
980866de 3820 else if (OP(scan) == NSPACEU) {
a12cf05f
KW
3821 for (value = 0; value < 256; value++) {
3822 if (!isSPACE_L1(value)) {
3823 ANYOF_BITMAP_SET(data->start_class, value);
3824 }
3825 }
3826 }
3827 else {
3828 for (value = 0; value < 256; value++) {
3829 if (!isSPACE(value)) {
3830 ANYOF_BITMAP_SET(data->start_class, value);
3831 }
3832 }
3833 }
653099ff 3834 }
8aa23a47 3835 break;
8aa23a47
YO
3836 case DIGIT:
3837 if (flags & SCF_DO_STCLASS_AND) {
3838 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3839 for (value = 0; value < 256; value++)
3840 if (!isDIGIT(value))
3841 ANYOF_BITMAP_CLEAR(data->start_class, value);
3842 }
3843 else {
3844 if (data->start_class->flags & ANYOF_LOCALE)
3845 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3846 else {
3847 for (value = 0; value < 256; value++)
3848 if (isDIGIT(value))
b9a59e08 3849 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3850 }
3851 }
3852 break;
3853 case NDIGIT:
3854 if (flags & SCF_DO_STCLASS_AND) {
3855 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3856 for (value = 0; value < 256; value++)
3857 if (isDIGIT(value))
3858 ANYOF_BITMAP_CLEAR(data->start_class, value);
3859 }
3860 else {
3861 if (data->start_class->flags & ANYOF_LOCALE)
3862 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3863 else {
3864 for (value = 0; value < 256; value++)
3865 if (!isDIGIT(value))
b9a59e08 3866 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3867 }
3868 }
8aa23a47 3869 break;
e1d1eefb
YO
3870 CASE_SYNST_FNC(VERTWS);
3871 CASE_SYNST_FNC(HORIZWS);
3872
8aa23a47
YO
3873 }
3874 if (flags & SCF_DO_STCLASS_OR)
3875 cl_and(data->start_class, and_withp);
3876 flags &= ~SCF_DO_STCLASS;
3877 }
3878 }
3879 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3880 data->flags |= (OP(scan) == MEOL
3881 ? SF_BEFORE_MEOL
3882 : SF_BEFORE_SEOL);
3883 }
3884 else if ( PL_regkind[OP(scan)] == BRANCHJ
3885 /* Lookbehind, or need to calculate parens/evals/stclass: */
3886 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3887 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3888 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3889 || OP(scan) == UNLESSM )
3890 {
3891 /* Negative Lookahead/lookbehind
3892 In this case we can't do fixed string optimisation.
3893 */
1de06328 3894
8aa23a47
YO
3895 I32 deltanext, minnext, fake = 0;
3896 regnode *nscan;
3897 struct regnode_charclass_class intrnl;
3898 int f = 0;
1de06328 3899
8aa23a47
YO
3900 data_fake.flags = 0;
3901 if (data) {
3902 data_fake.whilem_c = data->whilem_c;
3903 data_fake.last_closep = data->last_closep;
c277df42 3904 }
8aa23a47
YO
3905 else
3906 data_fake.last_closep = &fake;
58e23c8d 3907 data_fake.pos_delta = delta;
8aa23a47
YO
3908 if ( flags & SCF_DO_STCLASS && !scan->flags
3909 && OP(scan) == IFMATCH ) { /* Lookahead */
3910 cl_init(pRExC_state, &intrnl);
3911 data_fake.start_class = &intrnl;
3912 f |= SCF_DO_STCLASS_AND;
3913 }
3914 if (flags & SCF_WHILEM_VISITED_POS)
3915 f |= SCF_WHILEM_VISITED_POS;
3916 next = regnext(scan);
3917 nscan = NEXTOPER(NEXTOPER(scan));
3918 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3919 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3920 if (scan->flags) {
3921 if (deltanext) {
58e23c8d 3922 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3923 }
3924 else if (minnext > (I32)U8_MAX) {
58e23c8d 3925 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3926 }
3927 scan->flags = (U8)minnext;
3928 }
3929 if (data) {
3930 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3931 pars++;
3932 if (data_fake.flags & SF_HAS_EVAL)
3933 data->flags |= SF_HAS_EVAL;
3934 data->whilem_c = data_fake.whilem_c;
3935 }
3936 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3937 if (flags & SCF_DO_STCLASS_OR) {
3938 /* OR before, AND after: ideally we would recurse with
3939 * data_fake to get the AND applied by study of the
3940 * remainder of the pattern, and then derecurse;
3941 * *** HACK *** for now just treat as "no information".
3942 * See [perl #56690].
3943 */
3944 cl_init(pRExC_state, data->start_class);
3945 } else {
3946 /* AND before and after: combine and continue */
3947 const int was = (data->start_class->flags & ANYOF_EOS);
3948
3949 cl_and(data->start_class, &intrnl);
3950 if (was)
3951 data->start_class->flags |= ANYOF_EOS;
3952 }
8aa23a47 3953 }
cb434fcc 3954 }
8aa23a47
YO
3955#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3956 else {
3957 /* Positive Lookahead/lookbehind
3958 In this case we can do fixed string optimisation,
3959 but we must be careful about it. Note in the case of
3960 lookbehind the positions will be offset by the minimum
3961 length of the pattern, something we won't know about
3962 until after the recurse.
3963 */
3964 I32 deltanext, fake = 0;
3965 regnode *nscan;
3966 struct regnode_charclass_class intrnl;
3967 int f = 0;
3968 /* We use SAVEFREEPV so that when the full compile
3969 is finished perl will clean up the allocated
3b753521 3970 minlens when it's all done. This way we don't
8aa23a47
YO
3971 have to worry about freeing them when we know
3972 they wont be used, which would be a pain.
3973 */
3974 I32 *minnextp;
3975 Newx( minnextp, 1, I32 );
3976 SAVEFREEPV(minnextp);
3977
3978 if (data) {
3979 StructCopy(data, &data_fake, scan_data_t);
3980 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3981 f |= SCF_DO_SUBSTR;
3982 if (scan->flags)
304ee84b 3983 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3984 data_fake.last_found=newSVsv(data->last_found);
3985 }
3986 }
3987 else
3988 data_fake.last_closep = &fake;
3989 data_fake.flags = 0;
58e23c8d 3990 data_fake.pos_delta = delta;
8aa23a47
YO
3991 if (is_inf)
3992 data_fake.flags |= SF_IS_INF;
3993 if ( flags & SCF_DO_STCLASS && !scan->flags
3994 && OP(scan) == IFMATCH ) { /* Lookahead */
3995 cl_init(pRExC_state, &intrnl);
3996 data_fake.start_class = &intrnl;
3997 f |= SCF_DO_STCLASS_AND;
3998 }
3999 if (flags & SCF_WHILEM_VISITED_POS)
4000 f |= SCF_WHILEM_VISITED_POS;
4001 next = regnext(scan);
4002 nscan = NEXTOPER(NEXTOPER(scan));
4003
4004 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4005 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4006 if (scan->flags) {
4007 if (deltanext) {
58e23c8d 4008 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4009 }
4010 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 4011 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4012 }
4013 scan->flags = (U8)*minnextp;
4014 }
4015
4016 *minnextp += min;
4017
4018 if (f & SCF_DO_STCLASS_AND) {
4019 const int was = (data->start_class->flags & ANYOF_EOS);
4020
4021 cl_and(data->start_class, &intrnl);
4022 if (was)
4023 data->start_class->flags |= ANYOF_EOS;
4024 }
4025 if (data) {
4026 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4027 pars++;
4028 if (data_fake.flags & SF_HAS_EVAL)
4029 data->flags |= SF_HAS_EVAL;
4030 data->whilem_c = data_fake.whilem_c;
4031 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4032 if (RExC_rx->minlen<*minnextp)
4033 RExC_rx->minlen=*minnextp;
304ee84b 4034 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4035 SvREFCNT_dec(data_fake.last_found);
4036
4037 if ( data_fake.minlen_fixed != minlenp )
4038 {
4039 data->offset_fixed= data_fake.offset_fixed;
4040 data->minlen_fixed= data_fake.minlen_fixed;
4041 data->lookbehind_fixed+= scan->flags;
4042 }
4043 if ( data_fake.minlen_float != minlenp )
4044 {
4045 data->minlen_float= data_fake.minlen_float;
4046 data->offset_float_min=data_fake.offset_float_min;
4047 data->offset_float_max=data_fake.offset_float_max;
4048 data->lookbehind_float+= scan->flags;
4049 }
4050 }
4051 }
4052
4053
40d049e4 4054 }
8aa23a47
YO
4055#endif
4056 }
4057 else if (OP(scan) == OPEN) {
4058 if (stopparen != (I32)ARG(scan))
4059 pars++;
4060 }
4061 else if (OP(scan) == CLOSE) {
4062 if (stopparen == (I32)ARG(scan)) {
4063 break;
4064 }
4065 if ((I32)ARG(scan) == is_par) {
4066 next = regnext(scan);
b515a41d 4067
8aa23a47
YO
4068 if ( next && (OP(next) != WHILEM) && next < last)
4069 is_par = 0; /* Disable optimization */
40d049e4 4070 }
8aa23a47
YO
4071 if (data)
4072 *(data->last_closep) = ARG(scan);
4073 }
4074 else if (OP(scan) == EVAL) {
c277df42
IZ
4075 if (data)
4076 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4077 }
4078 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4079 if (flags & SCF_DO_SUBSTR) {
304ee84b 4080 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4081 flags &= ~SCF_DO_SUBSTR;
40d049e4 4082 }
8aa23a47
YO
4083 if (data && OP(scan)==ACCEPT) {
4084 data->flags |= SCF_SEEN_ACCEPT;
4085 if (stopmin > min)
4086 stopmin = min;
e2e6a0f1 4087 }
8aa23a47
YO
4088 }
4089 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4090 {
0f5d15d6 4091 if (flags & SCF_DO_SUBSTR) {
304ee84b 4092 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4093 data->longest = &(data->longest_float);
4094 }
4095 is_inf = is_inf_internal = 1;
653099ff 4096 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4097 cl_anything(pRExC_state, data->start_class);
96776eda 4098 flags &= ~SCF_DO_STCLASS;
8aa23a47 4099 }
58e23c8d 4100 else if (OP(scan) == GPOS) {
bbe252da 4101 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4102 !(delta || is_inf || (data && data->pos_delta)))
4103 {
bbe252da
YO
4104 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4105 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4106 if (RExC_rx->gofs < (U32)min)
4107 RExC_rx->gofs = min;
4108 } else {
bbe252da 4109 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4110 RExC_rx->gofs = 0;
4111 }
4112 }
786e8c11 4113#ifdef TRIE_STUDY_OPT
40d049e4 4114#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4115 else if (PL_regkind[OP(scan)] == TRIE) {
4116 /* NOTE - There is similar code to this block above for handling
4117 BRANCH nodes on the initial study. If you change stuff here
4118 check there too. */
4119 regnode *trie_node= scan;
4120 regnode *tail= regnext(scan);
f8fc2ecf 4121 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4122 I32 max1 = 0, min1 = I32_MAX;
4123 struct regnode_charclass_class accum;
4124
4125 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4126 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4127 if (flags & SCF_DO_STCLASS)
4128 cl_init_zero(pRExC_state, &accum);
4129
4130 if (!trie->jump) {
4131 min1= trie->minlen;
4132 max1= trie->maxlen;
4133 } else {
4134 const regnode *nextbranch= NULL;
4135 U32 word;
4136
4137 for ( word=1 ; word <= trie->wordcount ; word++)
4138 {
4139 I32 deltanext=0, minnext=0, f = 0, fake;
4140 struct regnode_charclass_class this_class;
4141
4142 data_fake.flags = 0;
4143 if (data) {
4144 data_fake.whilem_c = data->whilem_c;
4145 data_fake.last_closep = data->last_closep;
4146 }
4147 else
4148 data_fake.last_closep = &fake;
58e23c8d 4149 data_fake.pos_delta = delta;
8aa23a47
YO
4150 if (flags & SCF_DO_STCLASS) {
4151 cl_init(pRExC_state, &this_class);
4152 data_fake.start_class = &this_class;
4153 f = SCF_DO_STCLASS_AND;
4154 }
4155 if (flags & SCF_WHILEM_VISITED_POS)
4156 f |= SCF_WHILEM_VISITED_POS;
4157
4158 if (trie->jump[word]) {
4159 if (!nextbranch)
4160 nextbranch = trie_node + trie->jump[0];
4161 scan= trie_node + trie->jump[word];
4162 /* We go from the jump point to the branch that follows
4163 it. Note this means we need the vestigal unused branches
4164 even though they arent otherwise used.
4165 */
4166 minnext = study_chunk(pRExC_state, &scan, minlenp,
4167 &deltanext, (regnode *)nextbranch, &data_fake,
4168 stopparen, recursed, NULL, f,depth+1);
4169 }
4170 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4171 nextbranch= regnext((regnode*)nextbranch);
4172
4173 if (min1 > (I32)(minnext + trie->minlen))
4174 min1 = minnext + trie->minlen;
4175 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4176 max1 = minnext + deltanext + trie->maxlen;
4177 if (deltanext == I32_MAX)
4178 is_inf = is_inf_internal = 1;
4179
4180 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4181 pars++;
4182 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4183 if ( stopmin > min + min1)
4184 stopmin = min + min1;
4185 flags &= ~SCF_DO_SUBSTR;
4186 if (data)
4187 data->flags |= SCF_SEEN_ACCEPT;
4188 }
4189 if (data) {
4190 if (data_fake.flags & SF_HAS_EVAL)
4191 data->flags |= SF_HAS_EVAL;
4192 data->whilem_c = data_fake.whilem_c;
4193 }
4194 if (flags & SCF_DO_STCLASS)
4195 cl_or(pRExC_state, &accum, &this_class);
4196 }
4197 }
4198 if (flags & SCF_DO_SUBSTR) {
4199 data->pos_min += min1;
4200 data->pos_delta += max1 - min1;
4201 if (max1 != min1 || is_inf)
4202 data->longest = &(data->longest_float);
4203 }
4204 min += min1;
4205 delta += max1 - min1;
4206 if (flags & SCF_DO_STCLASS_OR) {
4207 cl_or(pRExC_state, data->start_class, &accum);
4208 if (min1) {
4209 cl_and(data->start_class, and_withp);
4210 flags &= ~SCF_DO_STCLASS;
4211 }
4212 }
4213 else if (flags & SCF_DO_STCLASS_AND) {
4214 if (min1) {
4215 cl_and(data->start_class, &accum);
4216 flags &= ~SCF_DO_STCLASS;
4217 }
4218 else {
4219 /* Switch to OR mode: cache the old value of
4220 * data->start_class */
4221 INIT_AND_WITHP;
4222 StructCopy(data->start_class, and_withp,
4223 struct regnode_charclass_class);
4224 flags &= ~SCF_DO_STCLASS_AND;
4225 StructCopy(&accum, data->start_class,
4226 struct regnode_charclass_class);
4227 flags |= SCF_DO_STCLASS_OR;
4228 data->start_class->flags |= ANYOF_EOS;
4229 }
4230 }
4231 scan= tail;
4232 continue;
4233 }
786e8c11 4234#else
8aa23a47 4235 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4236 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4237 U8*bang=NULL;
4238
4239 min += trie->minlen;
4240 delta += (trie->maxlen - trie->minlen);
4241 flags &= ~SCF_DO_STCLASS; /* xxx */
4242 if (flags & SCF_DO_SUBSTR) {
304ee84b 4243 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4244 data->pos_min += trie->minlen;
4245 data->pos_delta += (trie->maxlen - trie->minlen);
4246 if (trie->maxlen != trie->minlen)
4247 data->longest = &(data->longest_float);
4248 }
4249 if (trie->jump) /* no more substrings -- for now /grr*/
4250 flags &= ~SCF_DO_SUBSTR;
b515a41d 4251 }
8aa23a47
YO
4252#endif /* old or new */
4253#endif /* TRIE_STUDY_OPT */
e1d1eefb 4254
8aa23a47
YO
4255 /* Else: zero-length, ignore. */
4256 scan = regnext(scan);
4257 }
4258 if (frame) {
4259 last = frame->last;
4260 scan = frame->next;
4261 stopparen = frame->stop;
4262 frame = frame->prev;
4263 goto fake_study_recurse;
c277df42
IZ
4264 }
4265
4266 finish:
8aa23a47 4267 assert(!frame);
304ee84b 4268 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4269
c277df42 4270 *scanp = scan;
aca2d497 4271 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4272 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4273 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4274 if (is_par > (I32)U8_MAX)
c277df42
IZ
4275 is_par = 0;
4276 if (is_par && pars==1 && data) {
4277 data->flags |= SF_IN_PAR;
4278 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4279 }
4280 else if (pars && data) {
c277df42
IZ
4281 data->flags |= SF_HAS_PAR;
4282 data->flags &= ~SF_IN_PAR;
4283 }
653099ff 4284 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4285 cl_and(data->start_class, and_withp);
786e8c11
YO
4286 if (flags & SCF_TRIE_RESTUDY)
4287 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4288
304ee84b 4289 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4290
e2e6a0f1 4291 return min < stopmin ? min : stopmin;
c277df42
IZ
4292}
4293
2eccd3b2
NC
4294STATIC U32
4295S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4296{
4a4e7719
NC
4297 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4298
7918f24d
NC
4299 PERL_ARGS_ASSERT_ADD_DATA;
4300
4a4e7719
NC
4301 Renewc(RExC_rxi->data,
4302 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4303 char, struct reg_data);
4304 if(count)
f8fc2ecf 4305 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4306 else
f8fc2ecf 4307 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4308 RExC_rxi->data->count = count + n;
4309 Copy(s, RExC_rxi->data->what + count, n, U8);
4310 return count;
c277df42
IZ
4311}
4312
f8149455 4313/*XXX: todo make this not included in a non debugging perl */
76234dfb 4314#ifndef PERL_IN_XSUB_RE
d88dccdf 4315void
864dbfa3 4316Perl_reginitcolors(pTHX)
d88dccdf 4317{
97aff369 4318 dVAR;
1df70142 4319 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4320 if (s) {
1df70142
AL
4321 char *t = savepv(s);
4322 int i = 0;
4323 PL_colors[0] = t;
d88dccdf 4324 while (++i < 6) {
1df70142
AL
4325 t = strchr(t, '\t');
4326 if (t) {
4327 *t = '\0';
4328 PL_colors[i] = ++t;
d88dccdf
IZ
4329 }
4330 else
1df70142 4331 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4332 }
4333 } else {
1df70142 4334 int i = 0;
b81d288d 4335 while (i < 6)
06b5626a 4336 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4337 }
4338 PL_colorset = 1;
4339}
76234dfb 4340#endif
8615cb43 4341
07be1b83 4342
786e8c11
YO
4343#ifdef TRIE_STUDY_OPT
4344#define CHECK_RESTUDY_GOTO \
4345 if ( \
4346 (data.flags & SCF_TRIE_RESTUDY) \
4347 && ! restudied++ \
4348 ) goto reStudy
4349#else
4350#define CHECK_RESTUDY_GOTO
4351#endif
f9f4320a 4352
a687059c 4353/*
e50aee73 4354 - pregcomp - compile a regular expression into internal code
a687059c
LW
4355 *
4356 * We can't allocate space until we know how big the compiled form will be,
4357 * but we can't compile it (and thus know how big it is) until we've got a
4358 * place to put the code. So we cheat: we compile it twice, once with code
4359 * generation turned off and size counting turned on, and once "for real".
4360 * This also means that we don't allocate space until we are sure that the
4361 * thing really will compile successfully, and we never have to move the
4362 * code and thus invalidate pointers into it. (Note that it has to be in
4363 * one piece because free() must be able to free it all.) [NB: not true in perl]
4364 *
4365 * Beware that the optimization-preparation code in here knows about some
4366 * of the structure of the compiled regexp. [I'll say.]
4367 */
b9b4dddf
YO
4368
4369
4370
f9f4320a 4371#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4372#define RE_ENGINE_PTR &PL_core_reg_engine
4373#else
f9f4320a
YO
4374extern const struct regexp_engine my_reg_engine;
4375#define RE_ENGINE_PTR &my_reg_engine
4376#endif
6d5c990f
RGS
4377
4378#ifndef PERL_IN_XSUB_RE
3ab4a224 4379REGEXP *
1593ad57 4380Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4381{
97aff369 4382 dVAR;
6d5c990f 4383 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4384
4385 PERL_ARGS_ASSERT_PREGCOMP;
4386
f9f4320a
YO
4387 /* Dispatch a request to compile a regexp to correct
4388 regexp engine. */
f9f4320a
YO
4389 if (table) {
4390 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4391 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4392 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4393 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4394 DEBUG_COMPILE_r({
8d8756e7 4395 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4396 SvIV(*ptr));
4397 });
3ab4a224 4398 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4399 }
b9b4dddf 4400 }
3ab4a224 4401 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4402}
6d5c990f 4403#endif
2a5d9b1d 4404
3ab4a224 4405REGEXP *
29b09c41 4406Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4407{
4408 dVAR;
288b8c02
NC
4409 REGEXP *rx;
4410 struct regexp *r;
f8fc2ecf 4411 register regexp_internal *ri;
3ab4a224 4412 STRLEN plen;
5d51ce98
KW
4413 char *exp;
4414 char* xend;
c277df42 4415 regnode *scan;
a0d0e21e 4416 I32 flags;
a0d0e21e 4417 I32 minlen = 0;
29b09c41 4418 U32 pm_flags;
e7f38d0f
YO
4419
4420 /* these are all flags - maybe they should be turned
4421 * into a single int with different bit masks */
4422 I32 sawlookahead = 0;
a0d0e21e
LW
4423 I32 sawplus = 0;
4424 I32 sawopen = 0;
29b09c41 4425 bool used_setjump = FALSE;
e7f38d0f 4426
bbd61b5f
KW
4427 U8 jump_ret = 0;
4428 dJMPENV;
2c2d71f5 4429 scan_data_t data;
830247a4 4430 RExC_state_t RExC_state;
be8e71aa 4431 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4432#ifdef TRIE_STUDY_OPT
5d51ce98 4433 int restudied;
07be1b83
YO
4434 RExC_state_t copyRExC_state;
4435#endif
2a5d9b1d 4436 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4437
4438 PERL_ARGS_ASSERT_RE_COMPILE;
4439
6d5c990f 4440 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4441
29b09c41 4442 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
e40e74fe 4443 RExC_uni_semantics = 0;
7b597bb8 4444
d6bd454d 4445 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4446 /* Longjmp back to here if have to switch in midstream to utf8 */
4447 if (! RExC_orig_utf8) {
4448 JMPENV_PUSH(jump_ret);
29b09c41 4449 used_setjump = TRUE;
bbd61b5f
KW
4450 }
4451
5d51ce98 4452 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4453 exp = SvPV(pattern, plen);
4454 xend = exp + plen;
4455 /* ignore the utf8ness if the pattern is 0 length */
4456 if (plen == 0) {
4457 RExC_utf8 = RExC_orig_utf8 = 0;
4458 }
4459
5d51ce98
KW
4460 DEBUG_COMPILE_r({
4461 SV *dsv= sv_newmortal();
4462 RE_PV_QUOTED_DECL(s, RExC_utf8,
4463 dsv, exp, plen, 60);
4464 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4465 PL_colors[4],PL_colors[5],s);
4466 });
4467 }
4468 else { /* longjumped back */
bbd61b5f
KW
4469 STRLEN len = plen;
4470
5d51ce98
KW
4471 /* If the cause for the longjmp was other than changing to utf8, pop
4472 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4473 if (jump_ret != UTF8_LONGJMP) {
4474 JMPENV_POP;
4475 JMPENV_JUMP(jump_ret);
4476 }
4477
595598ee
KW
4478 GET_RE_DEBUG_FLAGS;
4479
bbd61b5f
KW
4480 /* It's possible to write a regexp in ascii that represents Unicode
4481 codepoints outside of the byte range, such as via \x{100}. If we
4482 detect such a sequence we have to convert the entire pattern to utf8
4483 and then recompile, as our sizing calculation will have been based
4484 on 1 byte == 1 character, but we will need to use utf8 to encode
4485 at least some part of the pattern, and therefore must convert the whole
4486 thing.
4487 -- dmq */
4488 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4489 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4490 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4491 xend = exp + len;
4492 RExC_orig_utf8 = RExC_utf8 = 1;
4493 SAVEFREEPV(exp);
4494 }
4495
5d51ce98
KW
4496#ifdef TRIE_STUDY_OPT
4497 restudied = 0;
4498#endif
4499
29b09c41 4500 /* Set to use unicode semantics if the pattern is in utf8 and has the
50e91148 4501 * 'depends' charset specified, as it means unicode when utf8 */
29b09c41 4502 pm_flags = orig_pm_flags;
a62b1201
KW
4503
4504 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4505 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
4506 }
4507
02daf0ab 4508 RExC_precomp = exp;
c737faaf 4509 RExC_flags = pm_flags;
830247a4 4510 RExC_sawback = 0;
bbce6d69 4511
830247a4 4512 RExC_seen = 0;
b57e4118 4513 RExC_in_lookbehind = 0;
830247a4
IZ
4514 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4515 RExC_seen_evals = 0;
4516 RExC_extralen = 0;
c277df42 4517
bbce6d69 4518 /* First pass: determine size, legality. */
830247a4 4519 RExC_parse = exp;
fac92740 4520 RExC_start = exp;
830247a4
IZ
4521 RExC_end = xend;
4522 RExC_naughty = 0;
4523 RExC_npar = 1;
e2e6a0f1 4524 RExC_nestroot = 0;
830247a4
IZ
4525 RExC_size = 0L;
4526 RExC_emit = &PL_regdummy;
4527 RExC_whilem_seen = 0;
40d049e4
YO
4528 RExC_open_parens = NULL;
4529 RExC_close_parens = NULL;
4530 RExC_opend = NULL;
81714fb9 4531 RExC_paren_names = NULL;
1f1031fe
YO
4532#ifdef DEBUGGING
4533 RExC_paren_name_list = NULL;
4534#endif
40d049e4
YO
4535 RExC_recurse = NULL;
4536 RExC_recurse_count = 0;
81714fb9 4537
85ddcde9
JH
4538#if 0 /* REGC() is (currently) a NOP at the first pass.
4539 * Clever compilers notice this and complain. --jhi */
830247a4 4540 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4541#endif
3dab1dad
YO
4542 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4543 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4544 RExC_precomp = NULL;
a0d0e21e
LW
4545 return(NULL);
4546 }
bbd61b5f 4547
29b09c41
KW
4548 /* Here, finished first pass. Get rid of any added setjmp */
4549 if (used_setjump) {
bbd61b5f 4550 JMPENV_POP;
02daf0ab 4551 }
e40e74fe 4552
07be1b83 4553 DEBUG_PARSE_r({
81714fb9
YO
4554 PerlIO_printf(Perl_debug_log,
4555 "Required size %"IVdf" nodes\n"
4556 "Starting second pass (creation)\n",
4557 (IV)RExC_size);
07be1b83
YO
4558 RExC_lastnum=0;
4559 RExC_lastparse=NULL;
4560 });
e40e74fe
KW
4561
4562 /* The first pass could have found things that force Unicode semantics */
4563 if ((RExC_utf8 || RExC_uni_semantics)
4564 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4565 {
4566 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4567 }
4568
c277df42
IZ
4569 /* Small enough for pointer-storage convention?
4570 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4571 if (RExC_size >= 0x10000L && RExC_extralen)
4572 RExC_size += RExC_extralen;
c277df42 4573 else
830247a4
IZ
4574 RExC_extralen = 0;
4575 if (RExC_whilem_seen > 15)
4576 RExC_whilem_seen = 15;
a0d0e21e 4577
f9f4320a
YO
4578 /* Allocate space and zero-initialize. Note, the two step process
4579 of zeroing when in debug mode, thus anything assigned has to
4580 happen after that */
d2f13c59 4581 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4582 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4583 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4584 char, regexp_internal);
4585 if ( r == NULL || ri == NULL )
b45f050a 4586 FAIL("Regexp out of space");
0f79a09d
GS
4587#ifdef DEBUGGING
4588 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4589 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4590#else
f8fc2ecf
YO
4591 /* bulk initialize base fields with 0. */
4592 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4593#endif
58e23c8d
YO
4594
4595 /* non-zero initialization begins here */
f8fc2ecf 4596 RXi_SET( r, ri );
f9f4320a 4597 r->engine= RE_ENGINE_PTR;
c737faaf 4598 r->extflags = pm_flags;
bcdf7404 4599 {
f7819f85 4600 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 4601 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
4602
4603 /* The caret is output if there are any defaults: if not all the STD
4604 * flags are set, or if no character set specifier is needed */
4605 bool has_default =
4606 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4607 || ! has_charset);
bcdf7404 4608 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4609 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4610 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4611 const char *fptr = STD_PAT_MODS; /*"msix"*/
4612 char *p;
fb85c044 4613 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4614 * on. If more precision is desired, we could do a population count of
4615 * the flags set. This could be done with a small lookup table, or by
4616 * shifting, masking and adding, or even, when available, assembly
4617 * language for a machine-language population count.
4618 * We never output a minus, as all those are defaults, so are
4619 * covered by the caret */
fb85c044 4620 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 4621 + has_default /* If needs a caret */
a62b1201
KW
4622
4623 /* If needs a character set specifier */
4624 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
4625 + (sizeof(STD_PAT_MODS) - 1)
4626 + (sizeof("(?:)") - 1);
4627
c5ea2ffa 4628 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4629 SvPOK_on(rx);
8f6ae13c 4630 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4631 *p++='('; *p++='?';
9de15fec
KW
4632
4633 /* If a default, cover it using the caret */
c5ea2ffa 4634 if (has_default) {
85508812 4635 *p++= DEFAULT_PAT_MOD;
fb85c044 4636 }
c5ea2ffa 4637 if (has_charset) {
a62b1201
KW
4638 STRLEN len;
4639 const char* const name = get_regex_charset_name(r->extflags, &len);
4640 Copy(name, p, len, char);
4641 p += len;
9de15fec 4642 }
f7819f85
A
4643 if (has_p)
4644 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4645 {
bcdf7404 4646 char ch;
bcdf7404
YO
4647 while((ch = *fptr++)) {
4648 if(reganch & 1)
4649 *p++ = ch;
bcdf7404
YO
4650 reganch >>= 1;
4651 }
bcdf7404
YO
4652 }
4653
28d8d7f4 4654 *p++ = ':';
bb661a58 4655 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4656 assert ((RX_WRAPPED(rx) - p) < 16);
4657 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4658 p += plen;
bcdf7404 4659 if (has_runon)
28d8d7f4
YO
4660 *p++ = '\n';
4661 *p++ = ')';
4662 *p = 0;
fb85c044 4663 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4664 }
4665
bbe252da 4666 r->intflags = 0;
830247a4 4667 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4668
6bda09f9 4669 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4670 Newxz(RExC_open_parens, RExC_npar,regnode *);
4671 SAVEFREEPV(RExC_open_parens);
4672 Newxz(RExC_close_parens,RExC_npar,regnode *);
4673 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4674 }
4675
4676 /* Useful during FAIL. */
7122b237
YO
4677#ifdef RE_TRACK_PATTERN_OFFSETS
4678 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4679 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4680 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4681 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4682 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4683#endif
4684 SetProgLen(ri,RExC_size);
288b8c02 4685 RExC_rx_sv = rx;
830247a4 4686 RExC_rx = r;
f8fc2ecf 4687 RExC_rxi = ri;
bbce6d69 4688
4689 /* Second pass: emit code. */
c737faaf 4690 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4691 RExC_parse = exp;
4692 RExC_end = xend;
4693 RExC_naughty = 0;
4694 RExC_npar = 1;
f8fc2ecf
YO
4695 RExC_emit_start = ri->program;
4696 RExC_emit = ri->program;
3b57cd43
YO
4697 RExC_emit_bound = ri->program + RExC_size + 1;
4698
2cd61cdb 4699 /* Store the count of eval-groups for security checks: */
f8149455 4700 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4701 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4702 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4703 ReREFCNT_dec(rx);
a0d0e21e 4704 return(NULL);
80757612 4705 }
07be1b83
YO
4706 /* XXXX To minimize changes to RE engine we always allocate
4707 3-units-long substrs field. */
4708 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4709 if (RExC_recurse_count) {
4710 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4711 SAVEFREEPV(RExC_recurse);
4712 }
a0d0e21e 4713
07be1b83 4714reStudy:
e7f38d0f 4715 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4716 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4717
07be1b83 4718#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4719 if (!restudied) {
4720 StructCopy(&zero_scan_data, &data, scan_data_t);
4721 copyRExC_state = RExC_state;
4722 } else {
5d458dd8 4723 U32 seen=RExC_seen;
07be1b83 4724 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4725
4726 RExC_state = copyRExC_state;
4727 if (seen & REG_TOP_LEVEL_BRANCHES)
4728 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4729 else
4730 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4731 if (data.last_found) {
07be1b83 4732 SvREFCNT_dec(data.longest_fixed);
07be1b83 4733 SvREFCNT_dec(data.longest_float);
07be1b83 4734 SvREFCNT_dec(data.last_found);
1de06328 4735 }
40d049e4 4736 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4737 }
40d049e4
YO
4738#else
4739 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4740#endif
fc8cd66c 4741
a0d0e21e 4742 /* Dig out information for optimizations. */
f7819f85 4743 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4744 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4745
a0ed51b3 4746 if (UTF)
8f6ae13c 4747 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4748 ri->regstclass = NULL;
830247a4 4749 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4750 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4751 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4752
1de06328
YO
4753 /* testing for BRANCH here tells us whether there is "must appear"
4754 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4755 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4756 I32 fake;
c5254dd6 4757 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4758 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4759 int stclass_flag;
07be1b83 4760 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4761 regnode *first= scan;
4762 regnode *first_next= regnext(first);
639081d6
YO
4763 /*
4764 * Skip introductions and multiplicators >= 1
4765 * so that we can extract the 'meat' of the pattern that must
4766 * match in the large if() sequence following.
4767 * NOTE that EXACT is NOT covered here, as it is normally
4768 * picked up by the optimiser separately.
4769 *
4770 * This is unfortunate as the optimiser isnt handling lookahead
4771 * properly currently.
4772 *
4773 */
a0d0e21e 4774 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4775 /* An OR of *one* alternative - should not happen now. */
5339e136 4776 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4777 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4778 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4779 (OP(first) == PLUS) ||
4780 (OP(first) == MINMOD) ||
653099ff 4781 /* An {n,m} with n>0 */
5339e136
YO
4782 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4783 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4784 {
639081d6
YO
4785 /*
4786 * the only op that could be a regnode is PLUS, all the rest
4787 * will be regnode_1 or regnode_2.
4788 *
4789 */
a0d0e21e
LW
4790 if (OP(first) == PLUS)
4791 sawplus = 1;
4792 else
3dab1dad 4793 first += regarglen[OP(first)];
639081d6
YO
4794
4795 first = NEXTOPER(first);
5339e136 4796 first_next= regnext(first);
a687059c
LW
4797 }
4798
a0d0e21e
LW
4799 /* Starting-point info. */
4800 again:
786e8c11 4801 DEBUG_PEEP("first:",first,0);
07be1b83 4802 /* Ignore EXACT as we deal with it later. */
3dab1dad 4803 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4804 if (OP(first) == EXACT)
6f207bd3 4805 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4806 else
f8fc2ecf 4807 ri->regstclass = first;
b3c9acc1 4808 }
07be1b83 4809#ifdef TRIE_STCLASS
786e8c11 4810 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4811 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4812 {
786e8c11 4813 regnode *trie_op;
07be1b83 4814 /* this can happen only on restudy */
786e8c11 4815 if ( OP(first) == TRIE ) {
c944940b 4816 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4817 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4818 StructCopy(first,trieop,struct regnode_1);
4819 trie_op=(regnode *)trieop;
4820 } else {
c944940b 4821 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4822 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4823 StructCopy(first,trieop,struct regnode_charclass);
4824 trie_op=(regnode *)trieop;
4825 }
1de06328 4826 OP(trie_op)+=2;
786e8c11 4827 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4828 ri->regstclass = trie_op;
07be1b83
YO
4829 }
4830#endif
e52fc539 4831 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4832 ri->regstclass = first;
3dab1dad
YO
4833 else if (PL_regkind[OP(first)] == BOUND ||
4834 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4835 ri->regstclass = first;
3dab1dad 4836 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4837 r->extflags |= (OP(first) == MBOL
4838 ? RXf_ANCH_MBOL
cad2e5aa 4839 : (OP(first) == SBOL
bbe252da
YO
4840 ? RXf_ANCH_SBOL
4841 : RXf_ANCH_BOL));
a0d0e21e 4842 first = NEXTOPER(first);
774d564b 4843 goto again;
4844 }
4845 else if (OP(first) == GPOS) {
bbe252da 4846 r->extflags |= RXf_ANCH_GPOS;
774d564b 4847 first = NEXTOPER(first);
4848 goto again;
a0d0e21e 4849 }
cf2a2b69
YO
4850 else if ((!sawopen || !RExC_sawback) &&
4851 (OP(first) == STAR &&
3dab1dad 4852 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4853 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4854 {
4855 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4856 const int type =
4857 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4858 ? RXf_ANCH_MBOL
4859 : RXf_ANCH_SBOL;
4860 r->extflags |= type;
4861 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4862 first = NEXTOPER(first);
774d564b 4863 goto again;
a0d0e21e 4864 }
e7f38d0f 4865 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4866 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4867 /* x+ must match at the 1st pos of run of x's */
bbe252da 4868 r->intflags |= PREGf_SKIP;
a0d0e21e 4869
c277df42 4870 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4871#ifdef TRIE_STUDY_OPT
81714fb9 4872 DEBUG_PARSE_r(
be8e71aa
YO
4873 if (!restudied)
4874 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4875 (IV)(first - scan + 1))
4876 );
4877#else
81714fb9 4878 DEBUG_PARSE_r(
be8e71aa
YO
4879 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4880 (IV)(first - scan + 1))
4881 );
4882#endif
4883
4884
a0d0e21e
LW
4885 /*
4886 * If there's something expensive in the r.e., find the
4887 * longest literal string that must appear and make it the
4888 * regmust. Resolve ties in favor of later strings, since
4889 * the regstart check works with the beginning of the r.e.
4890 * and avoiding duplication strengthens checking. Not a
4891 * strong reason, but sufficient in the absence of others.
4892 * [Now we resolve ties in favor of the earlier string if
c277df42 4893 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4894 * earlier string may buy us something the later one won't.]
4895 */
de8c5301 4896
396482e1
GA
4897 data.longest_fixed = newSVpvs("");
4898 data.longest_float = newSVpvs("");
4899 data.last_found = newSVpvs("");
c277df42
IZ
4900 data.longest = &(data.longest_fixed);
4901 first = scan;
f8fc2ecf 4902 if (!ri->regstclass) {
830247a4 4903 cl_init(pRExC_state, &ch_class);
653099ff
GS
4904 data.start_class = &ch_class;
4905 stclass_flag = SCF_DO_STCLASS_AND;
4906 } else /* XXXX Check for BOUND? */
4907 stclass_flag = 0;
cb434fcc 4908 data.last_closep = &last_close;
de8c5301 4909
1de06328 4910 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4911 &data, -1, NULL, NULL,
4912 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4913
07be1b83 4914
786e8c11
YO
4915 CHECK_RESTUDY_GOTO;
4916
4917
830247a4 4918 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4919 && data.last_start_min == 0 && data.last_end > 0
830247a4 4920 && !RExC_seen_zerolen
2bf803e2 4921 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4922 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4923 r->extflags |= RXf_CHECK_ALL;
304ee84b 4924 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4925 SvREFCNT_dec(data.last_found);
4926
1de06328
YO
4927 /* Note that code very similar to this but for anchored string
4928 follows immediately below, changes may need to be made to both.
4929 Be careful.
4930 */
a0ed51b3 4931 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4932 if (longest_float_length
c277df42
IZ
4933 || (data.flags & SF_FL_BEFORE_EOL
4934 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4935 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4936 {
1182767e 4937 I32 t,ml;
cf93c79d 4938
1de06328 4939 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4940 && data.offset_fixed == data.offset_float_min
4941 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4942 goto remove_float; /* As in (a)+. */
4943
1de06328
YO
4944 /* copy the information about the longest float from the reg_scan_data
4945 over to the program. */
33b8afdf
JH
4946 if (SvUTF8(data.longest_float)) {
4947 r->float_utf8 = data.longest_float;
c445ea15 4948 r->float_substr = NULL;
33b8afdf
JH
4949 } else {
4950 r->float_substr = data.longest_float;
c445ea15 4951 r->float_utf8 = NULL;
33b8afdf 4952 }
1de06328
YO
4953 /* float_end_shift is how many chars that must be matched that
4954 follow this item. We calculate it ahead of time as once the
4955 lookbehind offset is added in we lose the ability to correctly
4956 calculate it.*/
4957 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4958 : (I32)longest_float_length;
1de06328
YO
4959 r->float_end_shift = ml - data.offset_float_min
4960 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4961 + data.lookbehind_float;
4962 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4963 r->float_max_offset = data.offset_float_max;
1182767e 4964 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4965 r->float_max_offset -= data.lookbehind_float;
4966
cf93c79d
IZ
4967 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4968 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4969 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4970 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4971 }
4972 else {
aca2d497 4973 remove_float:
c445ea15 4974 r->float_substr = r->float_utf8 = NULL;
c277df42 4975 SvREFCNT_dec(data.longest_float);
c5254dd6 4976 longest_float_length = 0;
a0d0e21e 4977 }
c277df42 4978
1de06328
YO
4979 /* Note that code very similar to this but for floating string
4980 is immediately above, changes may need to be made to both.
4981 Be careful.
4982 */
a0ed51b3 4983 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4984 if (longest_fixed_length
c277df42
IZ
4985 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4986 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4987 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4988 {
1182767e 4989 I32 t,ml;
cf93c79d 4990
1de06328
YO
4991 /* copy the information about the longest fixed
4992 from the reg_scan_data over to the program. */
33b8afdf
JH
4993 if (SvUTF8(data.longest_fixed)) {
4994 r->anchored_utf8 = data.longest_fixed;
c445ea15 4995 r->anchored_substr = NULL;
33b8afdf
JH
4996 } else {
4997 r->anchored_substr = data.longest_fixed;
c445ea15 4998 r->anchored_utf8 = NULL;
33b8afdf 4999 }
1de06328
YO
5000 /* fixed_end_shift is how many chars that must be matched that
5001 follow this item. We calculate it ahead of time as once the
5002 lookbehind offset is added in we lose the ability to correctly
5003 calculate it.*/
5004 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 5005 : (I32)longest_fixed_length;
1de06328
YO
5006 r->anchored_end_shift = ml - data.offset_fixed
5007 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5008 + data.lookbehind_fixed;
5009 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5010
cf93c79d
IZ
5011 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5012 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 5013 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5014 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5015 }
5016 else {
c445ea15 5017 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 5018 SvREFCNT_dec(data.longest_fixed);
c5254dd6 5019 longest_fixed_length = 0;
a0d0e21e 5020 }
f8fc2ecf
YO
5021 if (ri->regstclass
5022 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5023 ri->regstclass = NULL;
f4244008
KW
5024
5025 /* If the synthetic start class were to ever be used when EOS is set,
5026 * that bit would have to be cleared, as it is shared with another */
33b8afdf
JH
5027 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5028 && stclass_flag
653099ff 5029 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5030 && !cl_is_anything(data.start_class))
5031 {
2eccd3b2 5032 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5033
f8fc2ecf 5034 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5035 struct regnode_charclass_class);
5036 StructCopy(data.start_class,
f8fc2ecf 5037 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5038 struct regnode_charclass_class);
f8fc2ecf 5039 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5040 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5041 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5042 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5043 PerlIO_printf(Perl_debug_log,
a0288114 5044 "synthetic stclass \"%s\".\n",
3f7c398e 5045 SvPVX_const(sv));});
653099ff 5046 }
c277df42
IZ
5047
5048 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5049 if (longest_fixed_length > longest_float_length) {
1de06328 5050 r->check_end_shift = r->anchored_end_shift;
c277df42 5051 r->check_substr = r->anchored_substr;
33b8afdf 5052 r->check_utf8 = r->anchored_utf8;
c277df42 5053 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5054 if (r->extflags & RXf_ANCH_SINGLE)
5055 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5056 }
5057 else {
1de06328 5058 r->check_end_shift = r->float_end_shift;
c277df42 5059 r->check_substr = r->float_substr;
33b8afdf 5060 r->check_utf8 = r->float_utf8;
1de06328
YO
5061 r->check_offset_min = r->float_min_offset;
5062 r->check_offset_max = r->float_max_offset;
a0d0e21e 5063 }
30382c73
IZ
5064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5065 This should be changed ASAP! */
bbe252da
YO
5066 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5067 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5069 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5070 }
1de06328
YO
5071 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5072 if ( (STRLEN)minlen < longest_float_length )
5073 minlen= longest_float_length;
5074 if ( (STRLEN)minlen < longest_fixed_length )
5075 minlen= longest_fixed_length;
5076 */
a0ed51b3
LW
5077 }
5078 else {
c277df42
IZ
5079 /* Several toplevels. Best we can is to set minlen. */
5080 I32 fake;
653099ff 5081 struct regnode_charclass_class ch_class;
cb434fcc 5082 I32 last_close = 0;
c277df42 5083
5d458dd8 5084 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5085
f8fc2ecf 5086 scan = ri->program + 1;
830247a4 5087 cl_init(pRExC_state, &ch_class);
653099ff 5088 data.start_class = &ch_class;
cb434fcc 5089 data.last_closep = &last_close;
07be1b83 5090
de8c5301 5091
1de06328 5092 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5093 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5094
786e8c11 5095 CHECK_RESTUDY_GOTO;
07be1b83 5096
33b8afdf 5097 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5098 = r->float_substr = r->float_utf8 = NULL;
f4244008
KW
5099
5100 /* If the synthetic start class were to ever be used when EOS is set,
5101 * that bit would have to be cleared, as it is shared with another */
653099ff 5102 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5103 && !cl_is_anything(data.start_class))
5104 {
2eccd3b2 5105 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5106
f8fc2ecf 5107 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5108 struct regnode_charclass_class);
5109 StructCopy(data.start_class,
f8fc2ecf 5110 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5111 struct regnode_charclass_class);
f8fc2ecf 5112 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5113 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5114 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5115 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5116 PerlIO_printf(Perl_debug_log,
a0288114 5117 "synthetic stclass \"%s\".\n",
3f7c398e 5118 SvPVX_const(sv));});
653099ff 5119 }
a0d0e21e
LW
5120 }
5121
1de06328
YO
5122 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5123 the "real" pattern. */
cf9788e3
RGS
5124 DEBUG_OPTIMISE_r({
5125 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5126 (IV)minlen, (IV)r->minlen);
cf9788e3 5127 });
de8c5301 5128 r->minlenret = minlen;
1de06328
YO
5129 if (r->minlen < minlen)
5130 r->minlen = minlen;
5131
b81d288d 5132 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5133 r->extflags |= RXf_GPOS_SEEN;
830247a4 5134 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5135 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5136 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5137 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5138 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5139 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5140 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5141 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5142 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5143 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5144 if (RExC_paren_names)
85fbaab2 5145 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5146 else
5daac39c 5147 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5148
7bd1e614 5149#ifdef STUPID_PATTERN_CHECKS
5509d87a 5150 if (RX_PRELEN(rx) == 0)
640f820d 5151 r->extflags |= RXf_NULL;
5509d87a 5152 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5153 /* XXX: this should happen BEFORE we compile */
5154 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5155 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5156 r->extflags |= RXf_WHITE;
5509d87a 5157 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5158 r->extflags |= RXf_START_ONLY;
f1b875a0 5159#else
5509d87a 5160 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5161 /* XXX: this should happen BEFORE we compile */
5162 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5163 else {
5164 regnode *first = ri->program + 1;
39aa8307 5165 U8 fop = OP(first);
f6d9469c
DM
5166
5167 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 5168 r->extflags |= RXf_NULL;
f6d9469c 5169 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 5170 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
5171 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5172 && OP(regnext(first)) == END)
7bd1e614
YO
5173 r->extflags |= RXf_WHITE;
5174 }
f1b875a0 5175#endif
1f1031fe
YO
5176#ifdef DEBUGGING
5177 if (RExC_paren_names) {
af534a04 5178 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5179 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5180 } else
1f1031fe 5181#endif
cde0cee5 5182 ri->name_list_idx = 0;
1f1031fe 5183
40d049e4
YO
5184 if (RExC_recurse_count) {
5185 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5186 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5187 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5188 }
5189 }
f0ab9afb 5190 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5191 /* assume we don't need to swap parens around before we match */
5192
be8e71aa
YO
5193 DEBUG_DUMP_r({
5194 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5195 regdump(r);
5196 });
7122b237
YO
5197#ifdef RE_TRACK_PATTERN_OFFSETS
5198 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5199 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5200 U32 i;
5201 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5202 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5203 for (i = 1; i <= len; i++) {
7122b237 5204 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5205 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5206 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5207 }
5208 PerlIO_printf(Perl_debug_log, "\n");
5209 });
7122b237 5210#endif
288b8c02 5211 return rx;
a687059c
LW
5212}
5213
f9f4320a 5214#undef RE_ENGINE_PTR
3dab1dad 5215
93b32b6d 5216
81714fb9 5217SV*
192b9cd1
AB
5218Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5219 const U32 flags)
5220{
7918f24d
NC
5221 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5222
192b9cd1
AB
5223 PERL_UNUSED_ARG(value);
5224
f1b875a0 5225 if (flags & RXapif_FETCH) {
192b9cd1 5226 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5227 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5228 Perl_croak_no_modify(aTHX);
192b9cd1 5229 return NULL;
f1b875a0 5230 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5231 return reg_named_buff_exists(rx, key, flags)
5232 ? &PL_sv_yes
5233 : &PL_sv_no;
f1b875a0 5234 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5235 return reg_named_buff_all(rx, flags);
f1b875a0 5236 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5237 return reg_named_buff_scalar(rx, flags);
5238 } else {
5239 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5240 return NULL;
5241 }
5242}
5243
5244SV*
5245Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5246 const U32 flags)
5247{
7918f24d 5248 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5249 PERL_UNUSED_ARG(lastkey);
5250
f1b875a0 5251 if (flags & RXapif_FIRSTKEY)
192b9cd1 5252 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5253 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5254 return reg_named_buff_nextkey(rx, flags);
5255 else {
5256 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5257 return NULL;
5258 }
5259}
5260
5261SV*
288b8c02
NC
5262Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5263 const U32 flags)
81714fb9 5264{
44a2ac75
YO
5265 AV *retarray = NULL;
5266 SV *ret;
288b8c02 5267 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5268
5269 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5270
f1b875a0 5271 if (flags & RXapif_ALL)
44a2ac75 5272 retarray=newAV();
93b32b6d 5273
5daac39c
NC
5274 if (rx && RXp_PAREN_NAMES(rx)) {
5275 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5276 if (he_str) {
5277 IV i;
5278 SV* sv_dat=HeVAL(he_str);
5279 I32 *nums=(I32*)SvPVX(sv_dat);
5280 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5281 if ((I32)(rx->nparens) >= nums[i]
5282 && rx->offs[nums[i]].start != -1
5283 && rx->offs[nums[i]].end != -1)
93b32b6d 5284 {
49d7dfbc 5285 ret = newSVpvs("");
288b8c02 5286 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5287 if (!retarray)
5288 return ret;
5289 } else {
5290 ret = newSVsv(&PL_sv_undef);
5291 }
ec83ea38 5292 if (retarray)
93b32b6d 5293 av_push(retarray, ret);
81714fb9 5294 }
93b32b6d 5295 if (retarray)
ad64d0ec 5296 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5297 }
5298 }
5299 return NULL;
5300}
5301
5302bool
288b8c02 5303Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5304 const U32 flags)
5305{
288b8c02 5306 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5307
5308 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5309
5daac39c 5310 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5311 if (flags & RXapif_ALL) {
5daac39c 5312 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5313 } else {
288b8c02 5314 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5315 if (sv) {
5316 SvREFCNT_dec(sv);
192b9cd1
AB
5317 return TRUE;
5318 } else {
5319 return FALSE;
5320 }
5321 }
5322 } else {
5323 return FALSE;
5324 }
5325}
5326
5327SV*
288b8c02 5328Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5329{
288b8c02 5330 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5331
5332 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5333
5daac39c
NC
5334 if ( rx && RXp_PAREN_NAMES(rx) ) {
5335 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5336
288b8c02 5337 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5338 } else {
5339 return FALSE;
5340 }
192b9cd1
AB
5341}
5342
5343SV*
288b8c02 5344Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5345{
288b8c02 5346 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5347 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5348
5349 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5350
5daac39c
NC
5351 if (rx && RXp_PAREN_NAMES(rx)) {
5352 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5353 HE *temphe;
5354 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5355 IV i;
5356 IV parno = 0;
5357 SV* sv_dat = HeVAL(temphe);
5358 I32 *nums = (I32*)SvPVX(sv_dat);
5359 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5360 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5361 rx->offs[nums[i]].start != -1 &&
5362 rx->offs[nums[i]].end != -1)
5363 {
5364 parno = nums[i];
5365 break;
5366 }
5367 }
f1b875a0 5368 if (parno || flags & RXapif_ALL) {
a663657d 5369 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5370 }
81714fb9
YO
5371 }
5372 }
44a2ac75
YO
5373 return NULL;
5374}
5375
192b9cd1 5376SV*
288b8c02 5377Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5378{
5379 SV *ret;
5380 AV *av;
5381 I32 length;
288b8c02 5382 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5383
7918f24d
NC
5384 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5385
5daac39c 5386 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5387 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5388 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5389 } else if (flags & RXapif_ONE) {
288b8c02 5390 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5391 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5392 length = av_len(av);
ec83ea38 5393 SvREFCNT_dec(ret);
192b9cd1
AB
5394 return newSViv(length + 1);
5395 } else {
5396 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5397 return NULL;
5398 }
5399 }
5400 return &PL_sv_undef;
5401}
5402
5403SV*
288b8c02 5404Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5405{
288b8c02 5406 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5407 AV *av = newAV();
5408
7918f24d
NC
5409 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5410
5daac39c
NC
5411 if (rx && RXp_PAREN_NAMES(rx)) {
5412 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5413 HE *temphe;
5414 (void)hv_iterinit(hv);
5415 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5416 IV i;
5417 IV parno = 0;
5418 SV* sv_dat = HeVAL(temphe);
5419 I32 *nums = (I32*)SvPVX(sv_dat);
5420 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5421 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5422 rx->offs[nums[i]].start != -1 &&
5423 rx->offs[nums[i]].end != -1)
5424 {
5425 parno = nums[i];
5426 break;
5427 }
5428 }
f1b875a0 5429 if (parno || flags & RXapif_ALL) {
a663657d 5430 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5431 }
5432 }
5433 }
5434
ad64d0ec 5435 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5436}
5437
49d7dfbc 5438void
288b8c02
NC
5439Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5440 SV * const sv)
44a2ac75 5441{
288b8c02 5442 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5443 char *s = NULL;
a9d504c3 5444 I32 i = 0;
44a2ac75 5445 I32 s1, t1;
7918f24d
NC
5446
5447 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5448
cde0cee5
YO
5449 if (!rx->subbeg) {
5450 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5451 return;
cde0cee5
YO
5452 }
5453 else
f1b875a0 5454 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5455 /* $` */
f0ab9afb 5456 i = rx->offs[0].start;
cde0cee5 5457 s = rx->subbeg;
44a2ac75
YO
5458 }
5459 else
f1b875a0 5460 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5461 /* $' */
f0ab9afb
NC
5462 s = rx->subbeg + rx->offs[0].end;
5463 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5464 }
5465 else
5466 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5467 (s1 = rx->offs[paren].start) != -1 &&
5468 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5469 {
5470 /* $& $1 ... */
5471 i = t1 - s1;
5472 s = rx->subbeg + s1;
cde0cee5
YO
5473 } else {
5474 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5475 return;
cde0cee5
YO
5476 }
5477 assert(rx->sublen >= (s - rx->subbeg) + i );
5478 if (i >= 0) {
5479 const int oldtainted = PL_tainted;
5480 TAINT_NOT;
5481 sv_setpvn(sv, s, i);
5482 PL_tainted = oldtainted;
5483 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5484 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5485 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5486 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5487 {
5488 SvUTF8_on(sv);
5489 }
5490 else
5491 SvUTF8_off(sv);
5492 if (PL_tainting) {
07bc277f 5493 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5494 if (SvTYPE(sv) >= SVt_PVMG) {
5495 MAGIC* const mg = SvMAGIC(sv);
5496 MAGIC* mgt;
5497 PL_tainted = 1;
5498 SvMAGIC_set(sv, mg->mg_moremagic);
5499 SvTAINT(sv);
5500 if ((mgt = SvMAGIC(sv))) {
5501 mg->mg_moremagic = mgt;
5502 SvMAGIC_set(sv, mg);
44a2ac75 5503 }
cde0cee5
YO
5504 } else {
5505 PL_tainted = 1;
5506 SvTAINT(sv);
5507 }
5508 } else
5509 SvTAINTED_off(sv);
44a2ac75 5510 }
81714fb9 5511 } else {
44a2ac75 5512 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5513 return;
81714fb9
YO
5514 }
5515}
93b32b6d 5516
2fdbfb4d
AB
5517void
5518Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5519 SV const * const value)
5520{
7918f24d
NC
5521 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5522
2fdbfb4d
AB
5523 PERL_UNUSED_ARG(rx);
5524 PERL_UNUSED_ARG(paren);
5525 PERL_UNUSED_ARG(value);
5526
5527 if (!PL_localizing)
6ad8f254 5528 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5529}
5530
5531I32
288b8c02 5532Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5533 const I32 paren)
5534{
288b8c02 5535 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5536 I32 i;
5537 I32 s1, t1;
5538
7918f24d
NC
5539 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5540
2fdbfb4d
AB
5541 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5542 switch (paren) {
192b9cd1 5543 /* $` / ${^PREMATCH} */
f1b875a0 5544 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5545 if (rx->offs[0].start != -1) {
5546 i = rx->offs[0].start;
5547 if (i > 0) {
5548 s1 = 0;
5549 t1 = i;
5550 goto getlen;
5551 }
5552 }
5553 return 0;
192b9cd1 5554 /* $' / ${^POSTMATCH} */
f1b875a0 5555 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5556 if (rx->offs[0].end != -1) {
5557 i = rx->sublen - rx->offs[0].end;
5558 if (i > 0) {
5559 s1 = rx->offs[0].end;
5560 t1 = rx->sublen;
5561 goto getlen;
5562 }
5563 }
5564 return 0;
192b9cd1
AB
5565 /* $& / ${^MATCH}, $1, $2, ... */
5566 default:
2fdbfb4d
AB
5567 if (paren <= (I32)rx->nparens &&
5568 (s1 = rx->offs[paren].start) != -1 &&
5569 (t1 = rx->offs[paren].end) != -1)
5570 {
5571 i = t1 - s1;
5572 goto getlen;
5573 } else {
5574 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5575 report_uninit((const SV *)sv);
2fdbfb4d
AB
5576 return 0;
5577 }
5578 }
5579 getlen:
07bc277f 5580 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5581 const char * const s = rx->subbeg + s1;
5582 const U8 *ep;
5583 STRLEN el;
5584
5585 i = t1 - s1;
5586 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5587 i = el;
5588 }
5589 return i;
5590}
5591
fe578d7f 5592SV*
49d7dfbc 5593Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5594{
7918f24d 5595 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5596 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5597 if (0)
5598 return NULL;
5599 else
5600 return newSVpvs("Regexp");
fe578d7f 5601}
0a4db386 5602
894be9b7 5603/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5604 * If flags is REG_RSN_RETURN_NULL returns null.
5605 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5606 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5607 * to the parsed name as looked up in the RExC_paren_names hash.
5608 * If there is an error throws a vFAIL().. type exception.
894be9b7 5609 */
0a4db386
YO
5610
5611#define REG_RSN_RETURN_NULL 0
5612#define REG_RSN_RETURN_NAME 1
5613#define REG_RSN_RETURN_DATA 2
5614
894be9b7 5615STATIC SV*
7918f24d
NC
5616S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5617{
894be9b7 5618 char *name_start = RExC_parse;
1f1031fe 5619
7918f24d
NC
5620 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5621
1f1031fe
YO
5622 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5623 /* skip IDFIRST by using do...while */
5624 if (UTF)
5625 do {
5626 RExC_parse += UTF8SKIP(RExC_parse);
5627 } while (isALNUM_utf8((U8*)RExC_parse));
5628 else
5629 do {
5630 RExC_parse++;
5631 } while (isALNUM(*RExC_parse));
894be9b7 5632 }
1f1031fe 5633
0a4db386 5634 if ( flags ) {
59cd0e26
NC
5635 SV* sv_name
5636 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5637 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5638 if ( flags == REG_RSN_RETURN_NAME)
5639 return sv_name;
5640 else if (flags==REG_RSN_RETURN_DATA) {
5641 HE *he_str = NULL;
5642 SV *sv_dat = NULL;
5643 if ( ! sv_name ) /* should not happen*/
5644 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5645 if (RExC_paren_names)
5646 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5647 if ( he_str )
5648 sv_dat = HeVAL(he_str);
5649 if ( ! sv_dat )
5650 vFAIL("Reference to nonexistent named group");
5651 return sv_dat;
5652 }
5653 else {
5654 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5655 }
5656 /* NOT REACHED */
894be9b7 5657 }
0a4db386 5658 return NULL;
894be9b7
YO
5659}
5660
3dab1dad
YO
5661#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5662 int rem=(int)(RExC_end - RExC_parse); \
5663 int cut; \
5664 int num; \
5665 int iscut=0; \
5666 if (rem>10) { \
5667 rem=10; \
5668 iscut=1; \
5669 } \
5670 cut=10-rem; \
5671 if (RExC_lastparse!=RExC_parse) \
5672 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5673 rem, RExC_parse, \
5674 cut + 4, \
5675 iscut ? "..." : "<" \
5676 ); \
5677 else \
5678 PerlIO_printf(Perl_debug_log,"%16s",""); \
5679 \
5680 if (SIZE_ONLY) \
3b57cd43 5681 num = RExC_size + 1; \
3dab1dad
YO
5682 else \
5683 num=REG_NODE_NUM(RExC_emit); \
5684 if (RExC_lastnum!=num) \
0a4db386 5685 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5686 else \
0a4db386 5687 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5688 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5689 (int)((depth*2)), "", \
3dab1dad
YO
5690 (funcname) \
5691 ); \
5692 RExC_lastnum=num; \
5693 RExC_lastparse=RExC_parse; \
5694})
5695
07be1b83
YO
5696
5697
3dab1dad
YO
5698#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5699 DEBUG_PARSE_MSG((funcname)); \
5700 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5701})
6bda09f9
YO
5702#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5703 DEBUG_PARSE_MSG((funcname)); \
5704 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5705})
d764b54e
KW
5706
5707/* This section of code defines the inversion list object and its methods. The
5708 * interfaces are highly subject to change, so as much as possible is static to
5709 * this file. An inversion list is here implemented as a malloc'd C array with
5710 * some added info. More will be coming when functionality is added later.
5711 *
5712 * Some of the methods should always be private to the implementation, and some
5713 * should eventually be made public */
5714
5715#define INVLIST_INITIAL_LEN 10
5716#define INVLIST_ARRAY_KEY "array"
5717#define INVLIST_MAX_KEY "max"
5718#define INVLIST_LEN_KEY "len"
5719
5720PERL_STATIC_INLINE UV*
5721S_invlist_array(pTHX_ HV* const invlist)
5722{
5723 /* Returns the pointer to the inversion list's array. Every time the
5724 * length changes, this needs to be called in case malloc or realloc moved
5725 * it */
5726
5727 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5728
5729 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5730
5731 if (list_ptr == NULL) {
5732 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5733 INVLIST_ARRAY_KEY);
5734 }
5735
5736 return INT2PTR(UV *, SvUV(*list_ptr));
5737}
5738
5739PERL_STATIC_INLINE void
5740S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5741{
5742 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5743
5744 /* Sets the array stored in the inversion list to the memory beginning with
5745 * the parameter */
5746
5747 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5748 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5749 INVLIST_ARRAY_KEY);
5750 }
5751}
5752
5753PERL_STATIC_INLINE UV
5754S_invlist_len(pTHX_ HV* const invlist)
5755{
5756 /* Returns the current number of elements in the inversion list's array */
5757
5758 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5759
5760 PERL_ARGS_ASSERT_INVLIST_LEN;
5761
5762 if (len_ptr == NULL) {
5763 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5764 INVLIST_LEN_KEY);
5765 }
5766
5767 return SvUV(*len_ptr);
5768}
5769
5770PERL_STATIC_INLINE UV
5771S_invlist_max(pTHX_ HV* const invlist)
5772{
5773 /* Returns the maximum number of elements storable in the inversion list's
5774 * array, without having to realloc() */
5775
5776 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5777
5778 PERL_ARGS_ASSERT_INVLIST_MAX;
5779
5780 if (max_ptr == NULL) {
5781 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5782 INVLIST_MAX_KEY);
5783 }
5784
5785 return SvUV(*max_ptr);
5786}
5787
5788PERL_STATIC_INLINE void
5789S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5790{
5791 /* Sets the current number of elements stored in the inversion list */
5792
5793 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5794
5795 if (len != 0 && len > invlist_max(invlist)) {
5796 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5797 }
5798
5799 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5800 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5801 INVLIST_LEN_KEY);
5802 }
5803}
5804
5805PERL_STATIC_INLINE void
5806S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5807{
5808
5809 /* Sets the maximum number of elements storable in the inversion list
5810 * without having to realloc() */
5811
5812 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5813
5814 if (max < invlist_len(invlist)) {
5815 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5816 }
5817
5818 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5819 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5820 INVLIST_LEN_KEY);
5821 }
5822}
5823
8d69a883 5824#ifndef PERL_IN_XSUB_RE
d764b54e
KW
5825HV*
5826Perl__new_invlist(pTHX_ IV initial_size)
5827{
5828
5829 /* Return a pointer to a newly constructed inversion list, with enough
5830 * space to store 'initial_size' elements. If that number is negative, a
5831 * system default is used instead */
5832
5833 HV* invlist = newHV();
5834 UV* list;
5835
5836 if (initial_size < 0) {
5837 initial_size = INVLIST_INITIAL_LEN;
5838 }
5839
5840 /* Allocate the initial space */
5841 Newx(list, initial_size, UV);
5842 invlist_set_array(invlist, list);
5843
5844 /* set_len has to come before set_max, as the latter inspects the len */
5845 invlist_set_len(invlist, 0);
5846 invlist_set_max(invlist, initial_size);
5847
5848 return invlist;
5849}
8d69a883 5850#endif
d764b54e
KW
5851
5852PERL_STATIC_INLINE void
5853S_invlist_destroy(pTHX_ HV* const invlist)
5854{
5855 /* Inversion list destructor */
5856
5857 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5858
5859 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5860
5861 if (list_ptr != NULL) {
b9d2ea5b
GG
5862 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5863 Safefree(list);
d764b54e
KW
5864 }
5865}
5866
5867STATIC void
5868S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5869{
5870 /* Change the maximum size of an inversion list (up or down) */
5871
5872 UV* orig_array;
5873 UV* array;
5874 const UV old_max = invlist_max(invlist);
5875
5876 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5877
5878 if (old_max == new_max) { /* If a no-op */
5879 return;
5880 }
5881
5882 array = orig_array = invlist_array(invlist);
5883 Renew(array, new_max, UV);
5884
5885 /* If the size change moved the list in memory, set the new one */
5886 if (array != orig_array) {
5887 invlist_set_array(invlist, array);
5888 }
5889
5890 invlist_set_max(invlist, new_max);
5891
5892}
5893
5894PERL_STATIC_INLINE void
5895S_invlist_trim(pTHX_ HV* const invlist)
5896{
5897 PERL_ARGS_ASSERT_INVLIST_TRIM;
5898
5899 /* Change the length of the inversion list to how many entries it currently
5900 * has */
5901
5902 invlist_extend(invlist, invlist_len(invlist));
5903}
5904
5905/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5906 * etc */
5907
5908#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5909
8d69a883 5910#ifndef PERL_IN_XSUB_RE
d764b54e 5911void
c2b03b8c 5912Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
d764b54e
KW
5913{
5914 /* Subject to change or removal. Append the range from 'start' to 'end' at
5915 * the end of the inversion list. The range must be above any existing
5916 * ones. */
5917
5918 UV* array = invlist_array(invlist);
5919 UV max = invlist_max(invlist);
5920 UV len = invlist_len(invlist);
5921
5922 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5923
5924 if (len > 0) {
5925
5926 /* Here, the existing list is non-empty. The current max entry in the
5927 * list is generally the first value not in the set, except when the
5928 * set extends to the end of permissible values, in which case it is
5929 * the first entry in that final set, and so this call is an attempt to
5930 * append out-of-order */
5931
5932 UV final_element = len - 1;
5933 if (array[final_element] > start
5934 || ELEMENT_IN_INVLIST_SET(final_element))
5935 {
5936 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5937 }
5938
5939 /* Here, it is a legal append. If the new range begins with the first
5940 * value not in the set, it is extending the set, so the new first
5941 * value not in the set is one greater than the newly extended range.
5942 * */
5943 if (array[final_element] == start) {
5944 if (end != UV_MAX) {
5945 array[final_element] = end + 1;
5946 }
5947 else {
5948 /* But if the end is the maximum representable on the machine,
5949 * just let the range that this would extend have no end */
5950 invlist_set_len(invlist, len - 1);
5951 }
5952 return;
5953 }
5954 }
5955
5956 /* Here the new range doesn't extend any existing set. Add it */
5957
5958 len += 2; /* Includes an element each for the start and end of range */
5959
5960 /* If overflows the existing space, extend, which may cause the array to be
5961 * moved */
5962 if (max < len) {
5963 invlist_extend(invlist, len);
5964 array = invlist_array(invlist);
5965 }
5966
5967 invlist_set_len(invlist, len);
5968
5969 /* The next item on the list starts the range, the one after that is
5970 * one past the new range. */
5971 array[len - 2] = start;
5972 if (end != UV_MAX) {
5973 array[len - 1] = end + 1;
5974 }
5975 else {
5976 /* But if the end is the maximum representable on the machine, just let
5977 * the range have no end */
5978 invlist_set_len(invlist, len - 1);
5979 }
5980}
8d69a883 5981#endif
d764b54e 5982
3097108b 5983STATIC HV*
d764b54e
KW
5984S_invlist_union(pTHX_ HV* const a, HV* const b)
5985{
5986 /* Return a new inversion list which is the union of two inversion lists.
5987 * The basis for this comes from "Unicode Demystified" Chapter 13 by
5988 * Richard Gillam, published by Addison-Wesley, and explained at some
5989 * length there. The preface says to incorporate its examples into your
5990 * code at your own risk.
5991 *
5992 * The algorithm is like a merge sort.
5993 *
5994 * XXX A potential performance improvement is to keep track as we go along
5995 * if only one of the inputs contributes to the result, meaning the other
5996 * is a subset of that one. In that case, we can skip the final copy and
5997 * return the larger of the input lists */
5998
5999 UV* array_a = invlist_array(a); /* a's array */
6000 UV* array_b = invlist_array(b);
6001 UV len_a = invlist_len(a); /* length of a's array */
6002 UV len_b = invlist_len(b);
6003
6004 HV* u; /* the resulting union */
6005 UV* array_u;
6006 UV len_u;
6007
6008 UV i_a = 0; /* current index into a's array */
6009 UV i_b = 0;
6010 UV i_u = 0;
6011
6012 /* running count, as explained in the algorithm source book; items are
6013 * stopped accumulating and are output when the count changes to/from 0.
6014 * The count is incremented when we start a range that's in the set, and
6015 * decremented when we start a range that's not in the set. So its range
6016 * is 0 to 2. Only when the count is zero is something not in the set.
6017 */
6018 UV count = 0;
6019
6020 PERL_ARGS_ASSERT_INVLIST_UNION;
6021
6022 /* Size the union for the worst case: that the sets are completely
6023 * disjoint */
6024 u = _new_invlist(len_a + len_b);
6025 array_u = invlist_array(u);
6026
6027 /* Go through each list item by item, stopping when exhausted one of
6028 * them */
6029 while (i_a < len_a && i_b < len_b) {
6030 UV cp; /* The element to potentially add to the union's array */
6031 bool cp_in_set; /* is it in the the input list's set or not */
6032
6033 /* We need to take one or the other of the two inputs for the union.
6034 * Since we are merging two sorted lists, we take the smaller of the
6035 * next items. In case of a tie, we take the one that is in its set
6036 * first. If we took one not in the set first, it would decrement the
6037 * count, possibly to 0 which would cause it to be output as ending the
6038 * range, and the next time through we would take the same number, and
6039 * output it again as beginning the next range. By doing it the
6040 * opposite way, there is no possibility that the count will be
6041 * momentarily decremented to 0, and thus the two adjoining ranges will
6042 * be seamlessly merged. (In a tie and both are in the set or both not
6043 * in the set, it doesn't matter which we take first.) */
6044 if (array_a[i_a] < array_b[i_b]
6045 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6046 {
6047 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6048 cp= array_a[i_a++];
6049 }
6050 else {
6051 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6052 cp= array_b[i_b++];
6053 }
6054
6055 /* Here, have chosen which of the two inputs to look at. Only output
6056 * if the running count changes to/from 0, which marks the
6057 * beginning/end of a range in that's in the set */
6058 if (cp_in_set) {
6059 if (count == 0) {
6060 array_u[i_u++] = cp;
6061 }
6062 count++;
6063 }
6064 else {
6065 count--;
6066 if (count == 0) {
6067 array_u[i_u++] = cp;
6068 }
6069 }
6070 }
6071
6072 /* Here, we are finished going through at least one of the lists, which
6073 * means there is something remaining in at most one. We check if the list
6074 * that hasn't been exhausted is positioned such that we are in the middle
6075 * of a range in its set or not. (We are in the set if the next item in
6076 * the array marks the beginning of something not in the set) If in the
6077 * set, we decrement 'count'; if 0, there is potentially more to output.
6078 * There are four cases:
6079 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6080 * in the union is entirely from the non-exhausted set.
6081 * 2) Both were in their sets, count is 2. Nothing further should
6082 * be output, as everything that remains will be in the exhausted
6083 * list's set, hence in the union; decrementing to 1 but not 0 insures
6084 * that
6085 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6086 * Nothing further should be output because the union includes
6087 * everything from the exhausted set. Not decrementing insures that.
6088 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6089 * decrementing to 0 insures that we look at the remainder of the
6090 * non-exhausted set */
6091 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6092 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6093 {
6094 count--;
6095 }
6096
6097 /* The final length is what we've output so far, plus what else is about to
6098 * be output. (If 'count' is non-zero, then the input list we exhausted
6099 * has everything remaining up to the machine's limit in its set, and hence
6100 * in the union, so there will be no further output. */
6101 len_u = i_u;
6102 if (count == 0) {
6103 /* At most one of the subexpressions will be non-zero */
6104 len_u += (len_a - i_a) + (len_b - i_b);
6105 }
6106
6107 /* Set result to final length, which can change the pointer to array_u, so
6108 * re-find it */
6109 if (len_u != invlist_len(u)) {
6110 invlist_set_len(u, len_u);
6111 invlist_trim(u);
6112 array_u = invlist_array(u);
6113 }
6114
6115 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6116 * the other) ended with everything above it not in its set. That means
6117 * that the remaining part of the union is precisely the same as the
6118 * non-exhausted list, so can just copy it unchanged. (If both list were
6119 * exhausted at the same time, then the operations below will be both 0.)
6120 */
6121 if (count == 0) {
6122 IV copy_count; /* At most one will have a non-zero copy count */
6123 if ((copy_count = len_a - i_a) > 0) {
6124 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6125 }
6126 else if ((copy_count = len_b - i_b) > 0) {
6127 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6128 }
6129 }
6130
6131 return u;
6132}
6133
3097108b 6134STATIC HV*
d764b54e
KW
6135S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6136{
6137 /* Return the intersection of two inversion lists. The basis for this
6138 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6139 * by Addison-Wesley, and explained at some length there. The preface says
6140 * to incorporate its examples into your code at your own risk.
6141 *
6142 * The algorithm is like a merge sort, and is essentially the same as the
6143 * union above
6144 */
6145
6146 UV* array_a = invlist_array(a); /* a's array */
6147 UV* array_b = invlist_array(b);
6148 UV len_a = invlist_len(a); /* length of a's array */
6149 UV len_b = invlist_len(b);
6150
6151 HV* r; /* the resulting intersection */
6152 UV* array_r;
6153 UV len_r;
6154
6155 UV i_a = 0; /* current index into a's array */
6156 UV i_b = 0;
6157 UV i_r = 0;
6158
6159 /* running count, as explained in the algorithm source book; items are
6160 * stopped accumulating and are output when the count changes to/from 2.
6161 * The count is incremented when we start a range that's in the set, and
6162 * decremented when we start a range that's not in the set. So its range
6163 * is 0 to 2. Only when the count is 2 is something in the intersection.
6164 */
6165 UV count = 0;
6166
6167 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6168
6169 /* Size the intersection for the worst case: that the intersection ends up
6170 * fragmenting everything to be completely disjoint */
6171 r= _new_invlist(len_a + len_b);
6172 array_r = invlist_array(r);
6173
6174 /* Go through each list item by item, stopping when exhausted one of
6175 * them */
6176 while (i_a < len_a && i_b < len_b) {
6177 UV cp; /* The element to potentially add to the intersection's
6178 array */
6179 bool cp_in_set; /* Is it in the input list's set or not */
6180
6181 /* We need to take one or the other of the two inputs for the union.
6182 * Since we are merging two sorted lists, we take the smaller of the
6183 * next items. In case of a tie, we take the one that is not in its
6184 * set first (a difference from the union algorithm). If we took one
6185 * in the set first, it would increment the count, possibly to 2 which
6186 * would cause it to be output as starting a range in the intersection,
6187 * and the next time through we would take that same number, and output
6188 * it again as ending the set. By doing it the opposite of this, we
6189 * there is no possibility that the count will be momentarily
6190 * incremented to 2. (In a tie and both are in the set or both not in
6191 * the set, it doesn't matter which we take first.) */
6192 if (array_a[i_a] < array_b[i_b]
6193 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6194 {
6195 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6196 cp= array_a[i_a++];
6197 }
6198 else {
6199 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6200 cp= array_b[i_b++];
6201 }
6202
6203 /* Here, have chosen which of the two inputs to look at. Only output
6204 * if the running count changes to/from 2, which marks the
6205 * beginning/end of a range that's in the intersection */
6206 if (cp_in_set) {
6207 count++;
6208 if (count == 2) {
6209 array_r[i_r++] = cp;
6210 }
6211 }
6212 else {
6213 if (count == 2) {
6214 array_r[i_r++] = cp;
6215 }
6216 count--;
6217 }
6218 }
6219
6220 /* Here, we are finished going through at least one of the sets, which
6221 * means there is something remaining in at most one. See the comments in
6222 * the union code */
6223 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6224 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6225 {
6226 count--;
6227 }
6228
6229 /* The final length is what we've output so far plus what else is in the
6230 * intersection. Only one of the subexpressions below will be non-zero */
6231 len_r = i_r;
6232 if (count == 2) {
6233 len_r += (len_a - i_a) + (len_b - i_b);
6234 }
6235
6236 /* Set result to final length, which can change the pointer to array_r, so
6237 * re-find it */
6238 if (len_r != invlist_len(r)) {
6239 invlist_set_len(r, len_r);
6240 invlist_trim(r);
6241 array_r = invlist_array(r);
6242 }
6243
6244 /* Finish outputting any remaining */
6245 if (count == 2) { /* Only one of will have a non-zero copy count */
6246 IV copy_count;
6247 if ((copy_count = len_a - i_a) > 0) {
6248 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6249 }
6250 else if ((copy_count = len_b - i_b) > 0) {
6251 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6252 }
6253 }
6254
6255 return r;
6256}
6257
6258STATIC HV*
c52a3e71 6259S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
d764b54e
KW
6260{
6261 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6262 * set. A pointer to the inversion list is returned. This may actually be
c52a3e71
KW
6263 * a new list, in which case the passed in one has been destroyed. The
6264 * passed in inversion list can be NULL, in which case a new one is created
6265 * with just the one range in it */
d764b54e
KW
6266
6267 HV* range_invlist;
6268 HV* added_invlist;
c52a3e71 6269 UV len;
d764b54e 6270
c52a3e71
KW
6271 if (invlist == NULL) {
6272 invlist = _new_invlist(2);
6273 len = 0;
6274 }
6275 else {
6276 len = invlist_len(invlist);
6277 }
d764b54e
KW
6278
6279 /* If comes after the final entry, can just append it to the end */
6280 if (len == 0
6281 || start >= invlist_array(invlist)
6282 [invlist_len(invlist) - 1])
6283 {
6284 _append_range_to_invlist(invlist, start, end);
6285 return invlist;
6286 }
6287
6288 /* Here, can't just append things, create and return a new inversion list
6289 * which is the union of this range and the existing inversion list */
6290 range_invlist = _new_invlist(2);
6291 _append_range_to_invlist(range_invlist, start, end);
6292
6293 added_invlist = invlist_union(invlist, range_invlist);
6294
6295 /* The passed in list can be freed, as well as our temporary */
6296 invlist_destroy(range_invlist);
6297 if (invlist != added_invlist) {
6298 invlist_destroy(invlist);
6299 }
6300
6301 return added_invlist;
6302}
6303
c229b64c
KW
6304PERL_STATIC_INLINE HV*
6305S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6306 return add_range_to_invlist(invlist, cp, cp);
6307}
6308
d764b54e
KW
6309/* End of inversion list object */
6310
a687059c
LW
6311/*
6312 - reg - regular expression, i.e. main body or parenthesized thing
6313 *
6314 * Caller must absorb opening parenthesis.
6315 *
6316 * Combining parenthesis handling with the base level of regular expression
6317 * is a trifle forced, but the need to tie the tails of the branches to what
6318 * follows makes it hard to avoid.
6319 */
07be1b83
YO
6320#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6321#ifdef DEBUGGING
6322#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6323#else
6324#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6325#endif
3dab1dad 6326
76e3520e 6327STATIC regnode *
3dab1dad 6328S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 6329 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 6330{
27da23d5 6331 dVAR;
c277df42
IZ
6332 register regnode *ret; /* Will be the head of the group. */
6333 register regnode *br;
6334 register regnode *lastbr;
cbbf8932 6335 register regnode *ender = NULL;
a0d0e21e 6336 register I32 parno = 0;
cbbf8932 6337 I32 flags;
f7819f85 6338 U32 oregflags = RExC_flags;
6136c704
AL
6339 bool have_branch = 0;
6340 bool is_open = 0;
594d7033
YO
6341 I32 freeze_paren = 0;
6342 I32 after_freeze = 0;
9d1d55b5
JP
6343
6344 /* for (?g), (?gc), and (?o) warnings; warning
6345 about (?c) will warn about (?g) -- japhy */
6346
6136c704
AL
6347#define WASTED_O 0x01
6348#define WASTED_G 0x02
6349#define WASTED_C 0x04
6350#define WASTED_GC (0x02|0x04)
cbbf8932 6351 I32 wastedflags = 0x00;
9d1d55b5 6352
fac92740 6353 char * parse_start = RExC_parse; /* MJD */
a28509cc 6354 char * const oregcomp_parse = RExC_parse;
a0d0e21e 6355
3dab1dad 6356 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6357
6358 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
6359 DEBUG_PARSE("reg ");
6360
821b33a5 6361 *flagp = 0; /* Tentatively. */
a0d0e21e 6362
9d1d55b5 6363
a0d0e21e
LW
6364 /* Make an OPEN node, if parenthesized. */
6365 if (paren) {
e2e6a0f1
YO
6366 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6367 char *start_verb = RExC_parse;
6368 STRLEN verb_len = 0;
6369 char *start_arg = NULL;
6370 unsigned char op = 0;
6371 int argok = 1;
6372 int internal_argval = 0; /* internal_argval is only useful if !argok */
6373 while ( *RExC_parse && *RExC_parse != ')' ) {
6374 if ( *RExC_parse == ':' ) {
6375 start_arg = RExC_parse + 1;
6376 break;
6377 }
6378 RExC_parse++;
6379 }
6380 ++start_verb;
6381 verb_len = RExC_parse - start_verb;
6382 if ( start_arg ) {
6383 RExC_parse++;
6384 while ( *RExC_parse && *RExC_parse != ')' )
6385 RExC_parse++;
6386 if ( *RExC_parse != ')' )
6387 vFAIL("Unterminated verb pattern argument");
6388 if ( RExC_parse == start_arg )
6389 start_arg = NULL;
6390 } else {
6391 if ( *RExC_parse != ')' )
6392 vFAIL("Unterminated verb pattern");
6393 }
5d458dd8 6394
e2e6a0f1
YO
6395 switch ( *start_verb ) {
6396 case 'A': /* (*ACCEPT) */
568a785a 6397 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
6398 op = ACCEPT;
6399 internal_argval = RExC_nestroot;
6400 }
6401 break;
6402 case 'C': /* (*COMMIT) */
568a785a 6403 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 6404 op = COMMIT;
e2e6a0f1
YO
6405 break;
6406 case 'F': /* (*FAIL) */
568a785a 6407 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
6408 op = OPFAIL;
6409 argok = 0;
6410 }
6411 break;
5d458dd8
YO
6412 case ':': /* (*:NAME) */
6413 case 'M': /* (*MARK:NAME) */
568a785a 6414 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 6415 op = MARKPOINT;
5d458dd8
YO
6416 argok = -1;
6417 }
6418 break;
6419 case 'P': /* (*PRUNE) */
568a785a 6420 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 6421 op = PRUNE;
e2e6a0f1 6422 break;
5d458dd8 6423 case 'S': /* (*SKIP) */
568a785a 6424 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
6425 op = SKIP;
6426 break;
6427 case 'T': /* (*THEN) */
6428 /* [19:06] <TimToady> :: is then */
568a785a 6429 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
6430 op = CUTGROUP;
6431 RExC_seen |= REG_SEEN_CUTGROUP;
6432 }
e2e6a0f1
YO
6433 break;
6434 }
6435 if ( ! op ) {
6436 RExC_parse++;
6437 vFAIL3("Unknown verb pattern '%.*s'",
6438 verb_len, start_verb);
6439 }
6440 if ( argok ) {
6441 if ( start_arg && internal_argval ) {
6442 vFAIL3("Verb pattern '%.*s' may not have an argument",
6443 verb_len, start_verb);
6444 } else if ( argok < 0 && !start_arg ) {
6445 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6446 verb_len, start_verb);
6447 } else {
6448 ret = reganode(pRExC_state, op, internal_argval);
6449 if ( ! internal_argval && ! SIZE_ONLY ) {
6450 if (start_arg) {
6451 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6452 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6453 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
6454 ret->flags = 0;
6455 } else {
6456 ret->flags = 1;
6457 }
6458 }
6459 }
6460 if (!internal_argval)
6461 RExC_seen |= REG_SEEN_VERBARG;
6462 } else if ( start_arg ) {
6463 vFAIL3("Verb pattern '%.*s' may not have an argument",
6464 verb_len, start_verb);
6465 } else {
6466 ret = reg_node(pRExC_state, op);
6467 }
6468 nextchar(pRExC_state);
6469 return ret;
6470 } else
fac92740 6471 if (*RExC_parse == '?') { /* (?...) */
6136c704 6472 bool is_logical = 0;
a28509cc 6473 const char * const seqstart = RExC_parse;
fb85c044 6474 bool has_use_defaults = FALSE;
ca9dfc88 6475
830247a4
IZ
6476 RExC_parse++;
6477 paren = *RExC_parse++;
c277df42 6478 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 6479 switch (paren) {
894be9b7 6480
1f1031fe
YO
6481 case 'P': /* (?P...) variants for those used to PCRE/Python */
6482 paren = *RExC_parse++;
6483 if ( paren == '<') /* (?P<...>) named capture */
6484 goto named_capture;
6485 else if (paren == '>') { /* (?P>name) named recursion */
6486 goto named_recursion;
6487 }
6488 else if (paren == '=') { /* (?P=...) named backref */
6489 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6490 you change this make sure you change that */
6491 char* name_start = RExC_parse;
6492 U32 num = 0;
6493 SV *sv_dat = reg_scan_name(pRExC_state,
6494 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6495 if (RExC_parse == name_start || *RExC_parse != ')')
6496 vFAIL2("Sequence %.3s... not terminated",parse_start);
6497
6498 if (!SIZE_ONLY) {
6499 num = add_data( pRExC_state, 1, "S" );
6500 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6501 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
6502 }
6503 RExC_sawback = 1;
4444fd9f
KW
6504 ret = reganode(pRExC_state,
6505 ((! FOLD)
6506 ? NREF
2f7f8cb1
KW
6507 : (MORE_ASCII_RESTRICTED)
6508 ? NREFFA
6509 : (AT_LEAST_UNI_SEMANTICS)
6510 ? NREFFU
6511 : (LOC)
6512 ? NREFFL
6513 : NREFF),
4444fd9f 6514 num);
1f1031fe
YO
6515 *flagp |= HASWIDTH;
6516
6517 Set_Node_Offset(ret, parse_start+1);
6518 Set_Node_Cur_Length(ret); /* MJD */
6519
6520 nextchar(pRExC_state);
6521 return ret;
6522 }
57b84237
YO
6523 RExC_parse++;
6524 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6525 /*NOTREACHED*/
6526 case '<': /* (?<...) */
b81d288d 6527 if (*RExC_parse == '!')
c277df42 6528 paren = ',';
0a4db386 6529 else if (*RExC_parse != '=')
1f1031fe 6530 named_capture:
0a4db386 6531 { /* (?<...>) */
81714fb9 6532 char *name_start;
894be9b7 6533 SV *svname;
81714fb9
YO
6534 paren= '>';
6535 case '\'': /* (?'...') */
6536 name_start= RExC_parse;
0a4db386
YO
6537 svname = reg_scan_name(pRExC_state,
6538 SIZE_ONLY ? /* reverse test from the others */
6539 REG_RSN_RETURN_NAME :
6540 REG_RSN_RETURN_NULL);
57b84237
YO
6541 if (RExC_parse == name_start) {
6542 RExC_parse++;
6543 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6544 /*NOTREACHED*/
6545 }
81714fb9
YO
6546 if (*RExC_parse != paren)
6547 vFAIL2("Sequence (?%c... not terminated",
6548 paren=='>' ? '<' : paren);
6549 if (SIZE_ONLY) {
e62cc96a
YO
6550 HE *he_str;
6551 SV *sv_dat = NULL;
486ec47a 6552 if (!svname) /* shouldn't happen */
894be9b7
YO
6553 Perl_croak(aTHX_
6554 "panic: reg_scan_name returned NULL");
81714fb9
YO
6555 if (!RExC_paren_names) {
6556 RExC_paren_names= newHV();
ad64d0ec 6557 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
6558#ifdef DEBUGGING
6559 RExC_paren_name_list= newAV();
ad64d0ec 6560 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 6561#endif
81714fb9
YO
6562 }
6563 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 6564 if ( he_str )
81714fb9 6565 sv_dat = HeVAL(he_str);
e62cc96a 6566 if ( ! sv_dat ) {
81714fb9 6567 /* croak baby croak */
e62cc96a
YO
6568 Perl_croak(aTHX_
6569 "panic: paren_name hash element allocation failed");
6570 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
6571 /* (?|...) can mean we have dupes so scan to check
6572 its already been stored. Maybe a flag indicating
6573 we are inside such a construct would be useful,
6574 but the arrays are likely to be quite small, so
6575 for now we punt -- dmq */
6576 IV count = SvIV(sv_dat);
6577 I32 *pv = (I32*)SvPVX(sv_dat);
6578 IV i;
6579 for ( i = 0 ; i < count ; i++ ) {
6580 if ( pv[i] == RExC_npar ) {
6581 count = 0;
6582 break;
6583 }
6584 }
6585 if ( count ) {
6586 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6587 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6588 pv[count] = RExC_npar;
3a92e6ae 6589 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 6590 }
81714fb9
YO
6591 } else {
6592 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6593 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6594 SvIOK_on(sv_dat);
3ec35e0f 6595 SvIV_set(sv_dat, 1);
e62cc96a 6596 }
1f1031fe
YO
6597#ifdef DEBUGGING
6598 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6599 SvREFCNT_dec(svname);
6600#endif
e62cc96a 6601
81714fb9
YO
6602 /*sv_dump(sv_dat);*/
6603 }
6604 nextchar(pRExC_state);
6605 paren = 1;
6606 goto capturing_parens;
6607 }
6608 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 6609 RExC_in_lookbehind++;
830247a4 6610 RExC_parse++;
fac92740 6611 case '=': /* (?=...) */
89c6a13e 6612 RExC_seen_zerolen++;
5c3fa2e7 6613 break;
fac92740 6614 case '!': /* (?!...) */
830247a4 6615 RExC_seen_zerolen++;
e2e6a0f1
YO
6616 if (*RExC_parse == ')') {
6617 ret=reg_node(pRExC_state, OPFAIL);
6618 nextchar(pRExC_state);
6619 return ret;
6620 }
594d7033
YO
6621 break;
6622 case '|': /* (?|...) */
6623 /* branch reset, behave like a (?:...) except that
6624 buffers in alternations share the same numbers */
6625 paren = ':';
6626 after_freeze = freeze_paren = RExC_npar;
6627 break;
fac92740
MJD
6628 case ':': /* (?:...) */
6629 case '>': /* (?>...) */
a0d0e21e 6630 break;
fac92740
MJD
6631 case '$': /* (?$...) */
6632 case '@': /* (?@...) */
8615cb43 6633 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 6634 break;
fac92740 6635 case '#': /* (?#...) */
830247a4
IZ
6636 while (*RExC_parse && *RExC_parse != ')')
6637 RExC_parse++;
6638 if (*RExC_parse != ')')
c277df42 6639 FAIL("Sequence (?#... not terminated");
830247a4 6640 nextchar(pRExC_state);
a0d0e21e
LW
6641 *flagp = TRYAGAIN;
6642 return NULL;
894be9b7
YO
6643 case '0' : /* (?0) */
6644 case 'R' : /* (?R) */
6645 if (*RExC_parse != ')')
6bda09f9 6646 FAIL("Sequence (?R) not terminated");
1a147d38 6647 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 6648 *flagp |= POSTPONED;
7f69552c
YO
6649 nextchar(pRExC_state);
6650 return ret;
6651 /*notreached*/
894be9b7
YO
6652 { /* named and numeric backreferences */
6653 I32 num;
894be9b7
YO
6654 case '&': /* (?&NAME) */
6655 parse_start = RExC_parse - 1;
1f1031fe 6656 named_recursion:
894be9b7 6657 {
0a4db386
YO
6658 SV *sv_dat = reg_scan_name(pRExC_state,
6659 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6660 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6661 }
6662 goto gen_recurse_regop;
6663 /* NOT REACHED */
542fa716
YO
6664 case '+':
6665 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6666 RExC_parse++;
6667 vFAIL("Illegal pattern");
6668 }
6669 goto parse_recursion;
6670 /* NOT REACHED*/
6671 case '-': /* (?-1) */
6672 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6673 RExC_parse--; /* rewind to let it be handled later */
6674 goto parse_flags;
6675 }
6676 /*FALLTHROUGH */
6bda09f9
YO
6677 case '1': case '2': case '3': case '4': /* (?1) */
6678 case '5': case '6': case '7': case '8': case '9':
6679 RExC_parse--;
542fa716 6680 parse_recursion:
894be9b7
YO
6681 num = atoi(RExC_parse);
6682 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6683 if (*RExC_parse == '-')
6684 RExC_parse++;
6bda09f9
YO
6685 while (isDIGIT(*RExC_parse))
6686 RExC_parse++;
6687 if (*RExC_parse!=')')
6688 vFAIL("Expecting close bracket");
894be9b7
YO
6689
6690 gen_recurse_regop:
542fa716
YO
6691 if ( paren == '-' ) {
6692 /*
6693 Diagram of capture buffer numbering.
6694 Top line is the normal capture buffer numbers
3b753521 6695 Bottom line is the negative indexing as from
542fa716
YO
6696 the X (the (?-2))
6697
6698 + 1 2 3 4 5 X 6 7
6699 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6700 - 5 4 3 2 1 X x x
6701
6702 */
6703 num = RExC_npar + num;
6704 if (num < 1) {
6705 RExC_parse++;
6706 vFAIL("Reference to nonexistent group");
6707 }
6708 } else if ( paren == '+' ) {
6709 num = RExC_npar + num - 1;
6710 }
6711
1a147d38 6712 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6713 if (!SIZE_ONLY) {
6714 if (num > (I32)RExC_rx->nparens) {
6715 RExC_parse++;
6716 vFAIL("Reference to nonexistent group");
6717 }
40d049e4 6718 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6719 RExC_emit++;
226de585 6720 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6721 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6722 } else {
6bda09f9 6723 RExC_size++;
6bda09f9 6724 }
0a4db386 6725 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6726 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6727 Set_Node_Offset(ret, parse_start); /* MJD */
6728
a3b492c3 6729 *flagp |= POSTPONED;
6bda09f9
YO
6730 nextchar(pRExC_state);
6731 return ret;
894be9b7
YO
6732 } /* named and numeric backreferences */
6733 /* NOT REACHED */
6734
fac92740 6735 case '?': /* (??...) */
6136c704 6736 is_logical = 1;
57b84237
YO
6737 if (*RExC_parse != '{') {
6738 RExC_parse++;
6739 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6740 /*NOTREACHED*/
6741 }
a3b492c3 6742 *flagp |= POSTPONED;
830247a4 6743 paren = *RExC_parse++;
0f5d15d6 6744 /* FALL THROUGH */
fac92740 6745 case '{': /* (?{...}) */
c277df42 6746 {
2eccd3b2
NC
6747 I32 count = 1;
6748 U32 n = 0;
c277df42 6749 char c;
830247a4 6750 char *s = RExC_parse;
c277df42 6751
830247a4
IZ
6752 RExC_seen_zerolen++;
6753 RExC_seen |= REG_SEEN_EVAL;
6754 while (count && (c = *RExC_parse)) {
6136c704
AL
6755 if (c == '\\') {
6756 if (RExC_parse[1])
6757 RExC_parse++;
6758 }
b81d288d 6759 else if (c == '{')
c277df42 6760 count++;
b81d288d 6761 else if (c == '}')
c277df42 6762 count--;
830247a4 6763 RExC_parse++;
c277df42 6764 }
6136c704 6765 if (*RExC_parse != ')') {
b81d288d 6766 RExC_parse = s;
b45f050a
JF
6767 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6768 }
c277df42 6769 if (!SIZE_ONLY) {
f3548bdc 6770 PAD *pad;
6136c704
AL
6771 OP_4tree *sop, *rop;
6772 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6773
569233ed
SB
6774 ENTER;
6775 Perl_save_re_context(aTHX);
d59a8b3e 6776 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6777 sop->op_private |= OPpREFCOUNTED;
6778 /* re_dup will OpREFCNT_inc */
6779 OpREFCNT_set(sop, 1);
569233ed 6780 LEAVE;
c277df42 6781
830247a4 6782 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6783 RExC_rxi->data->data[n] = (void*)rop;
6784 RExC_rxi->data->data[n+1] = (void*)sop;
6785 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6786 SvREFCNT_dec(sv);
a0ed51b3 6787 }
e24b16f9 6788 else { /* First pass */
830247a4 6789 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6790 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6791 /* No compiled RE interpolated, has runtime
6792 components ===> unsafe. */
6793 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6794 if (PL_tainting && PL_tainted)
cc6b7395 6795 FAIL("Eval-group in insecure regular expression");
54df2634 6796#if PERL_VERSION > 8
923e4eb5 6797 if (IN_PERL_COMPILETIME)
b5c19bd7 6798 PL_cv_has_eval = 1;
54df2634 6799#endif
c277df42 6800 }
b5c19bd7 6801
830247a4 6802 nextchar(pRExC_state);
6136c704 6803 if (is_logical) {
830247a4 6804 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6805 if (!SIZE_ONLY)
6806 ret->flags = 2;
3dab1dad 6807 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6808 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6809 return ret;
6810 }
ccb2c380
MP
6811 ret = reganode(pRExC_state, EVAL, n);
6812 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6813 Set_Node_Offset(ret, parse_start);
6814 return ret;
c277df42 6815 }
fac92740 6816 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6817 {
0a4db386 6818 int is_define= 0;
fac92740 6819 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6820 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6821 || RExC_parse[1] == '<'
830247a4 6822 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6823 I32 flag;
6824
830247a4 6825 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6826 if (!SIZE_ONLY)
6827 ret->flags = 1;
3dab1dad 6828 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6829 goto insert_if;
b81d288d 6830 }
a0ed51b3 6831 }
0a4db386
YO
6832 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6833 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6834 {
6835 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6836 char *name_start= RExC_parse++;
2eccd3b2 6837 U32 num = 0;
0a4db386
YO
6838 SV *sv_dat=reg_scan_name(pRExC_state,
6839 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6840 if (RExC_parse == name_start || *RExC_parse != ch)
6841 vFAIL2("Sequence (?(%c... not terminated",
6842 (ch == '>' ? '<' : ch));
6843 RExC_parse++;
6844 if (!SIZE_ONLY) {
6845 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6846 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6847 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6848 }
6849 ret = reganode(pRExC_state,NGROUPP,num);
6850 goto insert_if_check_paren;
6851 }
6852 else if (RExC_parse[0] == 'D' &&
6853 RExC_parse[1] == 'E' &&
6854 RExC_parse[2] == 'F' &&
6855 RExC_parse[3] == 'I' &&
6856 RExC_parse[4] == 'N' &&
6857 RExC_parse[5] == 'E')
6858 {
6859 ret = reganode(pRExC_state,DEFINEP,0);
6860 RExC_parse +=6 ;
6861 is_define = 1;
6862 goto insert_if_check_paren;
6863 }
6864 else if (RExC_parse[0] == 'R') {
6865 RExC_parse++;
6866 parno = 0;
6867 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6868 parno = atoi(RExC_parse++);
6869 while (isDIGIT(*RExC_parse))
6870 RExC_parse++;
6871 } else if (RExC_parse[0] == '&') {
6872 SV *sv_dat;
6873 RExC_parse++;
6874 sv_dat = reg_scan_name(pRExC_state,
6875 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6876 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6877 }
1a147d38 6878 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6879 goto insert_if_check_paren;
6880 }
830247a4 6881 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6882 /* (?(1)...) */
6136c704 6883 char c;
830247a4 6884 parno = atoi(RExC_parse++);
c277df42 6885
830247a4
IZ
6886 while (isDIGIT(*RExC_parse))
6887 RExC_parse++;
fac92740 6888 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6889
0a4db386 6890 insert_if_check_paren:
830247a4 6891 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6892 vFAIL("Switch condition not recognized");
c277df42 6893 insert_if:
3dab1dad
YO
6894 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6895 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6896 if (br == NULL)
830247a4 6897 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6898 else
3dab1dad 6899 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6900 c = *nextchar(pRExC_state);
d1b80229
IZ
6901 if (flags&HASWIDTH)
6902 *flagp |= HASWIDTH;
c277df42 6903 if (c == '|') {
0a4db386
YO
6904 if (is_define)
6905 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6906 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6907 regbranch(pRExC_state, &flags, 1,depth+1);
6908 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6909 if (flags&HASWIDTH)
6910 *flagp |= HASWIDTH;
830247a4 6911 c = *nextchar(pRExC_state);
a0ed51b3
LW
6912 }
6913 else
c277df42
IZ
6914 lastbr = NULL;
6915 if (c != ')')
8615cb43 6916 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6917 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6918 REGTAIL(pRExC_state, br, ender);
c277df42 6919 if (lastbr) {
3dab1dad
YO
6920 REGTAIL(pRExC_state, lastbr, ender);
6921 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6922 }
6923 else
3dab1dad 6924 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6925 RExC_size++; /* XXX WHY do we need this?!!
6926 For large programs it seems to be required
6927 but I can't figure out why. -- dmq*/
c277df42 6928 return ret;
a0ed51b3
LW
6929 }
6930 else {
830247a4 6931 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6932 }
6933 }
1b1626e4 6934 case 0:
830247a4 6935 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6936 vFAIL("Sequence (? incomplete");
1b1626e4 6937 break;
85508812
KW
6938 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6939 that follow */
fb85c044
KW
6940 has_use_defaults = TRUE;
6941 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
6942 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6943 ? REGEX_UNICODE_CHARSET
6944 : REGEX_DEPENDS_CHARSET);
fb85c044 6945 goto parse_flags;
a0d0e21e 6946 default:
cde0cee5
YO
6947 --RExC_parse;
6948 parse_flags: /* (?i) */
6949 {
6950 U32 posflags = 0, negflags = 0;
6951 U32 *flagsp = &posflags;
9de15fec 6952 bool has_charset_modifier = 0;
a62b1201 6953 regex_charset cs = REGEX_DEPENDS_CHARSET;
cde0cee5
YO
6954
6955 while (*RExC_parse) {
6956 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6957 /* (?g), (?gc) and (?o) are useless here
6958 and must be globally applied -- japhy */
cde0cee5
YO
6959 switch (*RExC_parse) {
6960 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6961 case LOCALE_PAT_MOD:
6962 if (has_charset_modifier || flagsp == &negflags) {
6963 goto fail_modifiers;
6964 }
a62b1201 6965 cs = REGEX_LOCALE_CHARSET;
9de15fec
KW
6966 has_charset_modifier = 1;
6967 break;
6968 case UNICODE_PAT_MOD:
6969 if (has_charset_modifier || flagsp == &negflags) {
6970 goto fail_modifiers;
6971 }
a62b1201 6972 cs = REGEX_UNICODE_CHARSET;
9de15fec
KW
6973 has_charset_modifier = 1;
6974 break;
cfaf538b
KW
6975 case ASCII_RESTRICT_PAT_MOD:
6976 if (has_charset_modifier || flagsp == &negflags) {
6977 goto fail_modifiers;
6978 }
2f7f8cb1
KW
6979 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
6980 /* Doubled modifier implies more restricted */
6981 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
6982 RExC_parse++;
6983 }
6984 else {
6985 cs = REGEX_ASCII_RESTRICTED_CHARSET;
6986 }
cfaf538b
KW
6987 has_charset_modifier = 1;
6988 break;
50e91148 6989 case DEPENDS_PAT_MOD:
9de15fec
KW
6990 if (has_use_defaults
6991 || has_charset_modifier
6992 || flagsp == &negflags)
6993 {
6994 goto fail_modifiers;
6995 }
7b98bc43
KW
6996
6997 /* The dual charset means unicode semantics if the
6998 * pattern (or target, not known until runtime) are
e40e74fe
KW
6999 * utf8, or something in the pattern indicates unicode
7000 * semantics */
7001 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
7002 ? REGEX_UNICODE_CHARSET
7003 : REGEX_DEPENDS_CHARSET;
9de15fec
KW
7004 has_charset_modifier = 1;
7005 break;
f7819f85
A
7006 case ONCE_PAT_MOD: /* 'o' */
7007 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 7008 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 7009 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
7010 if (! (wastedflags & wflagbit) ) {
7011 wastedflags |= wflagbit;
7012 vWARN5(
7013 RExC_parse + 1,
7014 "Useless (%s%c) - %suse /%c modifier",
7015 flagsp == &negflags ? "?-" : "?",
7016 *RExC_parse,
7017 flagsp == &negflags ? "don't " : "",
7018 *RExC_parse
7019 );
7020 }
7021 }
cde0cee5
YO
7022 break;
7023
f7819f85 7024 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 7025 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
7026 if (! (wastedflags & WASTED_C) ) {
7027 wastedflags |= WASTED_GC;
9d1d55b5
JP
7028 vWARN3(
7029 RExC_parse + 1,
7030 "Useless (%sc) - %suse /gc modifier",
7031 flagsp == &negflags ? "?-" : "?",
7032 flagsp == &negflags ? "don't " : ""
7033 );
7034 }
7035 }
cde0cee5 7036 break;
f7819f85 7037 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 7038 if (flagsp == &negflags) {
668c081a
NC
7039 if (SIZE_ONLY)
7040 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
7041 } else {
7042 *flagsp |= RXf_PMf_KEEPCOPY;
7043 }
7044 break;
7045 case '-':
3b753521 7046 /* A flag is a default iff it is following a minus, so
fb85c044
KW
7047 * if there is a minus, it means will be trying to
7048 * re-specify a default which is an error */
7049 if (has_use_defaults || flagsp == &negflags) {
9de15fec 7050 fail_modifiers:
57b84237
YO
7051 RExC_parse++;
7052 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7053 /*NOTREACHED*/
7054 }
cde0cee5
YO
7055 flagsp = &negflags;
7056 wastedflags = 0; /* reset so (?g-c) warns twice */
7057 break;
7058 case ':':
7059 paren = ':';
7060 /*FALLTHROUGH*/
7061 case ')':
7062 RExC_flags |= posflags;
7063 RExC_flags &= ~negflags;
a62b1201 7064 set_regex_charset(&RExC_flags, cs);
f7819f85
A
7065 if (paren != ':') {
7066 oregflags |= posflags;
7067 oregflags &= ~negflags;
a62b1201 7068 set_regex_charset(&oregflags, cs);
f7819f85 7069 }
cde0cee5
YO
7070 nextchar(pRExC_state);
7071 if (paren != ':') {
7072 *flagp = TRYAGAIN;
7073 return NULL;
7074 } else {
7075 ret = NULL;
7076 goto parse_rest;
7077 }
7078 /*NOTREACHED*/
7079 default:
cde0cee5
YO
7080 RExC_parse++;
7081 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7082 /*NOTREACHED*/
7083 }
830247a4 7084 ++RExC_parse;
48c036b1 7085 }
cde0cee5 7086 }} /* one for the default block, one for the switch */
a0d0e21e 7087 }
fac92740 7088 else { /* (...) */
81714fb9 7089 capturing_parens:
830247a4
IZ
7090 parno = RExC_npar;
7091 RExC_npar++;
e2e6a0f1 7092
830247a4 7093 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
7094 if (!SIZE_ONLY ){
7095 if (!RExC_nestroot)
7096 RExC_nestroot = parno;
c009da3d
YO
7097 if (RExC_seen & REG_SEEN_RECURSE
7098 && !RExC_open_parens[parno-1])
7099 {
e2e6a0f1 7100 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
7101 "Setting open paren #%"IVdf" to %d\n",
7102 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
7103 RExC_open_parens[parno-1]= ret;
7104 }
6bda09f9 7105 }
fac92740
MJD
7106 Set_Node_Length(ret, 1); /* MJD */
7107 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 7108 is_open = 1;
a0d0e21e 7109 }
a0ed51b3 7110 }
fac92740 7111 else /* ! paren */
a0d0e21e 7112 ret = NULL;
cde0cee5
YO
7113
7114 parse_rest:
a0d0e21e 7115 /* Pick up the branches, linking them together. */
fac92740 7116 parse_start = RExC_parse; /* MJD */
3dab1dad 7117 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e 7118
fac92740 7119 /* branch_len = (paren != 0); */
2af232bd 7120
a0d0e21e
LW
7121 if (br == NULL)
7122 return(NULL);
830247a4
IZ
7123 if (*RExC_parse == '|') {
7124 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 7125 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 7126 }
fac92740 7127 else { /* MJD */
6bda09f9 7128 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
7129 Set_Node_Length(br, paren != 0);
7130 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7131 }
c277df42
IZ
7132 have_branch = 1;
7133 if (SIZE_ONLY)
830247a4 7134 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
7135 }
7136 else if (paren == ':') {
c277df42
IZ
7137 *flagp |= flags&SIMPLE;
7138 }
6136c704 7139 if (is_open) { /* Starts with OPEN. */
3dab1dad 7140 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
7141 }
7142 else if (paren != '?') /* Not Conditional */
a0d0e21e 7143 ret = br;
8ae10a67 7144 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 7145 lastbr = br;
830247a4
IZ
7146 while (*RExC_parse == '|') {
7147 if (!SIZE_ONLY && RExC_extralen) {
7148 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 7149 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
7150 }
7151 if (SIZE_ONLY)
830247a4
IZ
7152 RExC_extralen += 2; /* Account for LONGJMP. */
7153 nextchar(pRExC_state);
594d7033
YO
7154 if (freeze_paren) {
7155 if (RExC_npar > after_freeze)
7156 after_freeze = RExC_npar;
7157 RExC_npar = freeze_paren;
7158 }
3dab1dad 7159 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 7160
a687059c 7161 if (br == NULL)
a0d0e21e 7162 return(NULL);
3dab1dad 7163 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 7164 lastbr = br;
8ae10a67 7165 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
7166 }
7167
c277df42
IZ
7168 if (have_branch || paren != ':') {
7169 /* Make a closing node, and hook it on the end. */
7170 switch (paren) {
7171 case ':':
830247a4 7172 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
7173 break;
7174 case 1:
830247a4 7175 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
7176 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7177 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7178 "Setting close paren #%"IVdf" to %d\n",
7179 (IV)parno, REG_NODE_NUM(ender)));
7180 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
7181 if (RExC_nestroot == parno)
7182 RExC_nestroot = 0;
40d049e4 7183 }
fac92740
MJD
7184 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7185 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
7186 break;
7187 case '<':
c277df42
IZ
7188 case ',':
7189 case '=':
7190 case '!':
c277df42 7191 *flagp &= ~HASWIDTH;
821b33a5
IZ
7192 /* FALL THROUGH */
7193 case '>':
830247a4 7194 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
7195 break;
7196 case 0:
830247a4 7197 ender = reg_node(pRExC_state, END);
40d049e4
YO
7198 if (!SIZE_ONLY) {
7199 assert(!RExC_opend); /* there can only be one! */
7200 RExC_opend = ender;
7201 }
c277df42
IZ
7202 break;
7203 }
eaf3ca90 7204 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 7205
9674d46a 7206 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
7207 if (depth==1)
7208 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7209
c277df42 7210 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
7211 for (br = ret; br; br = regnext(br)) {
7212 const U8 op = PL_regkind[OP(br)];
7213 if (op == BRANCH) {
07be1b83 7214 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
7215 }
7216 else if (op == BRANCHJ) {
07be1b83 7217 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 7218 }
c277df42
IZ
7219 }
7220 }
a0d0e21e 7221 }
c277df42
IZ
7222
7223 {
e1ec3a88
AL
7224 const char *p;
7225 static const char parens[] = "=!<,>";
c277df42
IZ
7226
7227 if (paren && (p = strchr(parens, paren))) {
eb160463 7228 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
7229 int flag = (p - parens) > 1;
7230
7231 if (paren == '>')
7232 node = SUSPEND, flag = 0;
6bda09f9 7233 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
7234 Set_Node_Cur_Length(ret);
7235 Set_Node_Offset(ret, parse_start + 1);
c277df42 7236 ret->flags = flag;
07be1b83 7237 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 7238 }
a0d0e21e
LW
7239 }
7240
7241 /* Check for proper termination. */
ce3e6498 7242 if (paren) {
e2509266 7243 RExC_flags = oregflags;
830247a4
IZ
7244 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7245 RExC_parse = oregcomp_parse;
380a0633 7246 vFAIL("Unmatched (");
ce3e6498 7247 }
a0ed51b3 7248 }
830247a4
IZ
7249 else if (!paren && RExC_parse < RExC_end) {
7250 if (*RExC_parse == ')') {
7251 RExC_parse++;
380a0633 7252 vFAIL("Unmatched )");
a0ed51b3
LW
7253 }
7254 else
b45f050a 7255 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
7256 /* NOTREACHED */
7257 }
b57e4118
KW
7258
7259 if (RExC_in_lookbehind) {
7260 RExC_in_lookbehind--;
7261 }
fd4be6f0 7262 if (after_freeze > RExC_npar)
594d7033 7263 RExC_npar = after_freeze;
a0d0e21e 7264 return(ret);
a687059c
LW
7265}
7266
7267/*
7268 - regbranch - one alternative of an | operator
7269 *
7270 * Implements the concatenation operator.
7271 */
76e3520e 7272STATIC regnode *
3dab1dad 7273S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 7274{
97aff369 7275 dVAR;
c277df42
IZ
7276 register regnode *ret;
7277 register regnode *chain = NULL;
7278 register regnode *latest;
7279 I32 flags = 0, c = 0;
3dab1dad 7280 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7281
7282 PERL_ARGS_ASSERT_REGBRANCH;
7283
3dab1dad 7284 DEBUG_PARSE("brnc");
02daf0ab 7285
b81d288d 7286 if (first)
c277df42
IZ
7287 ret = NULL;
7288 else {
b81d288d 7289 if (!SIZE_ONLY && RExC_extralen)
830247a4 7290 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 7291 else {
830247a4 7292 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
7293 Set_Node_Length(ret, 1);
7294 }
c277df42
IZ
7295 }
7296
b81d288d 7297 if (!first && SIZE_ONLY)
830247a4 7298 RExC_extralen += 1; /* BRANCHJ */
b81d288d 7299
c277df42 7300 *flagp = WORST; /* Tentatively. */
a0d0e21e 7301
830247a4
IZ
7302 RExC_parse--;
7303 nextchar(pRExC_state);
7304 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 7305 flags &= ~TRYAGAIN;
3dab1dad 7306 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7307 if (latest == NULL) {
7308 if (flags & TRYAGAIN)
7309 continue;
7310 return(NULL);
a0ed51b3
LW
7311 }
7312 else if (ret == NULL)
c277df42 7313 ret = latest;
8ae10a67 7314 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 7315 if (chain == NULL) /* First piece. */
a0d0e21e
LW
7316 *flagp |= flags&SPSTART;
7317 else {
830247a4 7318 RExC_naughty++;
3dab1dad 7319 REGTAIL(pRExC_state, chain, latest);
a687059c 7320 }
a0d0e21e 7321 chain = latest;
c277df42
IZ
7322 c++;
7323 }
7324 if (chain == NULL) { /* Loop ran zero times. */
830247a4 7325 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
7326 if (ret == NULL)
7327 ret = chain;
7328 }
7329 if (c == 1) {
7330 *flagp |= flags&SIMPLE;
a0d0e21e 7331 }
a687059c 7332
d4c19fe8 7333 return ret;
a687059c
LW
7334}
7335
7336/*
7337 - regpiece - something followed by possible [*+?]
7338 *
7339 * Note that the branching code sequences used for ? and the general cases
7340 * of * and + are somewhat optimized: they use the same NOTHING node as
7341 * both the endmarker for their branch list and the body of the last branch.
7342 * It might seem that this node could be dispensed with entirely, but the
7343 * endmarker role is not redundant.
7344 */
76e3520e 7345STATIC regnode *
3dab1dad 7346S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7347{
97aff369 7348 dVAR;
c277df42 7349 register regnode *ret;
a0d0e21e
LW
7350 register char op;
7351 register char *next;
7352 I32 flags;
1df70142 7353 const char * const origparse = RExC_parse;
a0d0e21e 7354 I32 min;
c277df42 7355 I32 max = REG_INFTY;
fac92740 7356 char *parse_start;
10edeb5d 7357 const char *maxpos = NULL;
3dab1dad 7358 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7359
7360 PERL_ARGS_ASSERT_REGPIECE;
7361
3dab1dad 7362 DEBUG_PARSE("piec");
a0d0e21e 7363
3dab1dad 7364 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
7365 if (ret == NULL) {
7366 if (flags & TRYAGAIN)
7367 *flagp |= TRYAGAIN;
7368 return(NULL);
7369 }
7370
830247a4 7371 op = *RExC_parse;
a0d0e21e 7372
830247a4 7373 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 7374 maxpos = NULL;
fac92740 7375 parse_start = RExC_parse; /* MJD */
830247a4 7376 next = RExC_parse + 1;
a0d0e21e
LW
7377 while (isDIGIT(*next) || *next == ',') {
7378 if (*next == ',') {
7379 if (maxpos)
7380 break;
7381 else
7382 maxpos = next;
a687059c 7383 }
a0d0e21e
LW
7384 next++;
7385 }
7386 if (*next == '}') { /* got one */
7387 if (!maxpos)
7388 maxpos = next;
830247a4
IZ
7389 RExC_parse++;
7390 min = atoi(RExC_parse);
a0d0e21e
LW
7391 if (*maxpos == ',')
7392 maxpos++;
7393 else
830247a4 7394 maxpos = RExC_parse;
a0d0e21e
LW
7395 max = atoi(maxpos);
7396 if (!max && *maxpos != '0')
c277df42
IZ
7397 max = REG_INFTY; /* meaning "infinity" */
7398 else if (max >= REG_INFTY)
8615cb43 7399 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
7400 RExC_parse = next;
7401 nextchar(pRExC_state);
a0d0e21e
LW
7402
7403 do_curly:
7404 if ((flags&SIMPLE)) {
830247a4 7405 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 7406 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
7407 Set_Node_Offset(ret, parse_start+1); /* MJD */
7408 Set_Node_Cur_Length(ret);
a0d0e21e
LW
7409 }
7410 else {
3dab1dad 7411 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
7412
7413 w->flags = 0;
3dab1dad 7414 REGTAIL(pRExC_state, ret, w);
830247a4 7415 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
7416 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7417 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
7418 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7419 }
6bda09f9 7420 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
7421 /* MJD hk */
7422 Set_Node_Offset(ret, parse_start+1);
2af232bd 7423 Set_Node_Length(ret,
fac92740 7424 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 7425
830247a4 7426 if (!SIZE_ONLY && RExC_extralen)
c277df42 7427 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 7428 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 7429 if (SIZE_ONLY)
830247a4
IZ
7430 RExC_whilem_seen++, RExC_extralen += 3;
7431 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 7432 }
c277df42 7433 ret->flags = 0;
a0d0e21e
LW
7434
7435 if (min > 0)
821b33a5
IZ
7436 *flagp = WORST;
7437 if (max > 0)
7438 *flagp |= HASWIDTH;
8fa23287 7439 if (max < min)
8615cb43 7440 vFAIL("Can't do {n,m} with n > m");
c277df42 7441 if (!SIZE_ONLY) {
eb160463
GS
7442 ARG1_SET(ret, (U16)min);
7443 ARG2_SET(ret, (U16)max);
a687059c 7444 }
a687059c 7445
a0d0e21e 7446 goto nest_check;
a687059c 7447 }
a0d0e21e 7448 }
a687059c 7449
a0d0e21e
LW
7450 if (!ISMULT1(op)) {
7451 *flagp = flags;
a687059c 7452 return(ret);
a0d0e21e 7453 }
bb20fd44 7454
c277df42 7455#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
7456
7457 /* if this is reinstated, don't forget to put this back into perldiag:
7458
7459 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7460
7461 (F) The part of the regexp subject to either the * or + quantifier
7462 could match an empty string. The {#} shows in the regular
7463 expression about where the problem was discovered.
7464
7465 */
7466
bb20fd44 7467 if (!(flags&HASWIDTH) && op != '?')
b45f050a 7468 vFAIL("Regexp *+ operand could be empty");
b81d288d 7469#endif
bb20fd44 7470
fac92740 7471 parse_start = RExC_parse;
830247a4 7472 nextchar(pRExC_state);
a0d0e21e 7473
821b33a5 7474 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
7475
7476 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 7477 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 7478 ret->flags = 0;
830247a4 7479 RExC_naughty += 4;
a0d0e21e
LW
7480 }
7481 else if (op == '*') {
7482 min = 0;
7483 goto do_curly;
a0ed51b3
LW
7484 }
7485 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 7486 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 7487 ret->flags = 0;
830247a4 7488 RExC_naughty += 3;
a0d0e21e
LW
7489 }
7490 else if (op == '+') {
7491 min = 1;
7492 goto do_curly;
a0ed51b3
LW
7493 }
7494 else if (op == '?') {
a0d0e21e
LW
7495 min = 0; max = 1;
7496 goto do_curly;
7497 }
7498 nest_check:
668c081a
NC
7499 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7500 ckWARN3reg(RExC_parse,
7501 "%.*s matches null string many times",
7502 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7503 origparse);
a0d0e21e
LW
7504 }
7505
b9b4dddf 7506 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 7507 nextchar(pRExC_state);
6bda09f9 7508 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 7509 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 7510 }
b9b4dddf
YO
7511#ifndef REG_ALLOW_MINMOD_SUSPEND
7512 else
7513#endif
7514 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7515 regnode *ender;
7516 nextchar(pRExC_state);
7517 ender = reg_node(pRExC_state, SUCCEED);
7518 REGTAIL(pRExC_state, ret, ender);
7519 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7520 ret->flags = 0;
7521 ender = reg_node(pRExC_state, TAIL);
7522 REGTAIL(pRExC_state, ret, ender);
7523 /*ret= ender;*/
7524 }
7525
7526 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 7527 RExC_parse++;
b45f050a
JF
7528 vFAIL("Nested quantifiers");
7529 }
a0d0e21e
LW
7530
7531 return(ret);
a687059c
LW
7532}
7533
fc8cd66c
YO
7534
7535/* reg_namedseq(pRExC_state,UVp)
7536
7537 This is expected to be called by a parser routine that has
afefe6bf 7538 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
7539 expected to point at the first char following the N at the time
7540 of the call.
ff3f963a
KW
7541
7542 The \N may be inside (indicated by valuep not being NULL) or outside a
7543 character class.
7544
7545 \N may begin either a named sequence, or if outside a character class, mean
7546 to match a non-newline. For non single-quoted regexes, the tokenizer has
7547 attempted to decide which, and in the case of a named sequence converted it
7548 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7549 where c1... are the characters in the sequence. For single-quoted regexes,
7550 the tokenizer passes the \N sequence through unchanged; this code will not
7551 attempt to determine this nor expand those. The net effect is that if the
7552 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7553 signals that this \N occurrence means to match a non-newline.
7554
7555 Only the \N{U+...} form should occur in a character class, for the same
7556 reason that '.' inside a character class means to just match a period: it
7557 just doesn't make sense.
fc8cd66c
YO
7558
7559 If valuep is non-null then it is assumed that we are parsing inside
7560 of a charclass definition and the first codepoint in the resolved
7561 string is returned via *valuep and the routine will return NULL.
7562 In this mode if a multichar string is returned from the charnames
ff3f963a 7563 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
7564 sequence will be examined. If the string returned is zero length
7565 then the value of *valuep is undefined and NON-NULL will
7566 be returned to indicate failure. (This will NOT be a valid pointer
7567 to a regnode.)
7568
ff3f963a
KW
7569 If valuep is null then it is assumed that we are parsing normal text and a
7570 new EXACT node is inserted into the program containing the resolved string,
7571 and a pointer to the new node is returned. But if the string is zero length
7572 a NOTHING node is emitted instead.
afefe6bf 7573
fc8cd66c 7574 On success RExC_parse is set to the char following the endbrace.
ff3f963a 7575 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
7576 */
7577STATIC regnode *
afefe6bf 7578S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 7579{
c3c41406 7580 char * endbrace; /* '}' following the name */
fc8cd66c 7581 regnode *ret = NULL;
ff3f963a
KW
7582#ifdef DEBUGGING
7583 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7584#endif
c3c41406 7585 char* p;
ff3f963a
KW
7586
7587 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7588
7589 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
7590
7591 GET_RE_DEBUG_FLAGS;
c3c41406
KW
7592
7593 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7594 * modifier. The other meaning does not */
7595 p = (RExC_flags & RXf_PMf_EXTENDED)
7596 ? regwhite( pRExC_state, RExC_parse )
7597 : RExC_parse;
7918f24d 7598
ff3f963a 7599 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
7600 * [^\n]. The former is assumed when it can't be the latter. */
7601 if (*p != '{' || regcurly(p)) {
7602 RExC_parse = p;
ff3f963a 7603 if (valuep) {
afefe6bf 7604 /* no bare \N in a charclass */
ff3f963a
KW
7605 vFAIL("\\N in a character class must be a named character: \\N{...}");
7606 }
afefe6bf
RGS
7607 nextchar(pRExC_state);
7608 ret = reg_node(pRExC_state, REG_ANY);
7609 *flagp |= HASWIDTH|SIMPLE;
7610 RExC_naughty++;
7611 RExC_parse--;
7612 Set_Node_Length(ret, 1); /* MJD */
7613 return ret;
fc8cd66c 7614 }
a4893424 7615
c3c41406
KW
7616 /* Here, we have decided it should be a named sequence */
7617
7618 /* The test above made sure that the next real character is a '{', but
7619 * under the /x modifier, it could be separated by space (or a comment and
7620 * \n) and this is not allowed (for consistency with \x{...} and the
7621 * tokenizer handling of \N{NAME}). */
7622 if (*RExC_parse != '{') {
7623 vFAIL("Missing braces on \\N{}");
7624 }
7625
ff3f963a 7626 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
7627
7628 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7629 || ! (endbrace == RExC_parse /* nothing between the {} */
7630 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7631 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7632 {
7633 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7634 vFAIL("\\N{NAME} must be resolved by the lexer");
7635 }
7636
ff3f963a
KW
7637 if (endbrace == RExC_parse) { /* empty: \N{} */
7638 if (! valuep) {
7639 RExC_parse = endbrace + 1;
7640 return reg_node(pRExC_state,NOTHING);
a4893424 7641 }
fc8cd66c 7642
ff3f963a
KW
7643 if (SIZE_ONLY) {
7644 ckWARNreg(RExC_parse,
7645 "Ignoring zero length \\N{} in character class"
7646 );
7647 RExC_parse = endbrace + 1;
7648 }
7649 *valuep = 0;
7650 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7651 }
ff3f963a 7652
62fed28b 7653 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
7654 RExC_parse += 2; /* Skip past the 'U+' */
7655
7656 if (valuep) { /* In a bracketed char class */
7657 /* We only pay attention to the first char of
7658 multichar strings being returned. I kinda wonder
7659 if this makes sense as it does change the behaviour
7660 from earlier versions, OTOH that behaviour was broken
7661 as well. XXX Solution is to recharacterize as
7662 [rest-of-class]|multi1|multi2... */
7663
7664 STRLEN length_of_hex;
7665 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7666 | PERL_SCAN_DISALLOW_PREFIX
7667 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7668
37820adc
KW
7669 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7670 if (endchar < endbrace) {
ff3f963a
KW
7671 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7672 }
ff3f963a
KW
7673
7674 length_of_hex = (STRLEN)(endchar - RExC_parse);
7675 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7676
7677 /* The tokenizer should have guaranteed validity, but it's possible to
7678 * bypass it by using single quoting, so check */
c3c41406
KW
7679 if (length_of_hex == 0
7680 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7681 {
7682 RExC_parse += length_of_hex; /* Includes all the valid */
7683 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7684 ? UTF8SKIP(RExC_parse)
7685 : 1;
7686 /* Guard against malformed utf8 */
7687 if (RExC_parse >= endchar) RExC_parse = endchar;
7688 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7689 }
7690
7691 RExC_parse = endbrace + 1;
7692 if (endchar == endbrace) return NULL;
7693
7694 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7695 }
ff3f963a
KW
7696 else { /* Not a char class */
7697 char *s; /* String to put in generated EXACT node */
fda99bee 7698 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7699 char *endchar; /* Points to '.' or '}' ending cur char in the input
7700 stream */
2f7f8cb1
KW
7701 ret = reg_node(pRExC_state,
7702 (U8) ((! FOLD) ? EXACT
7703 : (LOC)
7704 ? EXACTFL
7705 : (MORE_ASCII_RESTRICTED)
7706 ? EXACTFA
7707 : (AT_LEAST_UNI_SEMANTICS)
7708 ? EXACTFU
7709 : EXACTF));
ff3f963a
KW
7710 s= STRING(ret);
7711
7712 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7713 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7714 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7715 * limit are dropped. The limit could be relaxed should it become
7716 * desirable by reparsing this as (?:\N{NAME}), so could generate
7717 * multiple EXACT nodes, as is done for just regular input. But this
7718 * is primarily a named character, and not intended to be a huge long
7719 * string, so 255 bytes should be good enough */
7720 while (1) {
c3c41406 7721 STRLEN length_of_hex;
ff3f963a
KW
7722 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7723 | PERL_SCAN_DISALLOW_PREFIX
7724 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7725 UV cp; /* Ord of current character */
2f7f8cb1 7726 bool use_this_char_fold = FOLD;
ff3f963a
KW
7727
7728 /* Code points are separated by dots. If none, there is only one
7729 * code point, and is terminated by the brace */
37820adc 7730 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7731
7732 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7733 length_of_hex = (STRLEN)(endchar - RExC_parse);
7734 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7735 if ( length_of_hex == 0
7736 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7737 {
c3c41406
KW
7738 RExC_parse += length_of_hex; /* Includes all the valid */
7739 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7740 ? UTF8SKIP(RExC_parse)
7741 : 1;
7742 /* Guard against malformed utf8 */
7743 if (RExC_parse >= endchar) RExC_parse = endchar;
7744 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7745 }
7746
e074b0e5 7747 /* XXX ? Change to ANYOF node
2f7f8cb1 7748 if (FOLD
e074b0e5 7749 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
2f7f8cb1
KW
7750 && is_TRICKYFOLD_cp(cp))
7751 {
7752 }
e074b0e5 7753 */
2f7f8cb1
KW
7754
7755 /* Under /aa, we can't mix ASCII with non- in a fold. If we are
7756 * folding, and the source isn't ASCII, look through all the
7757 * characters it folds to. If any one of them is ASCII, forbid
7758 * this fold. (cp is uni, so the 127 below is correct even for
17580e7a
KW
7759 * EBCDIC). Similarly under locale rules, we don't mix under 256
7760 * with above 255. XXX It really doesn't make sense to have \N{}
7761 * which means a Unicode rules under locale. I (khw) think this
7762 * should be warned about, but the counter argument is that people
7763 * who have programmed around Perl's earlier lack of specifying the
7764 * rules and used \N{} to force Unicode things in a local
7765 * environment shouldn't get suddenly a warning */
7766 if (use_this_char_fold) {
7767 if (LOC && cp < 256) { /* Fold not known until run-time */
7768 use_this_char_fold = FALSE;
7769 }
7770 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7771 || (cp > 255 && LOC))
7772 {
2f7f8cb1
KW
7773 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7774 U8* s = tmpbuf;
7775 U8* e;
7776 STRLEN foldlen;
7777
7778 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7779 e = s + foldlen;
7780
7781 while (s < e) {
17580e7a
KW
7782 if (isASCII(*s)
7783 || (LOC && (UTF8_IS_INVARIANT(*s)
7784 || UTF8_IS_DOWNGRADEABLE_START(*s))))
7785 {
2f7f8cb1
KW
7786 use_this_char_fold = FALSE;
7787 break;
7788 }
7789 s += UTF8SKIP(s);
7790 }
17580e7a 7791 }
2f7f8cb1
KW
7792 }
7793
7794 if (! use_this_char_fold) { /* Not folding, just append to the
7795 string */
ff3f963a
KW
7796 STRLEN unilen;
7797
7798 /* Quit before adding this character if would exceed limit */
7799 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7800
ff3f963a
KW
7801 unilen = reguni(pRExC_state, cp, s);
7802 if (unilen > 0) {
7803 s += unilen;
7804 len += unilen;
7805 }
7806 } else { /* Folding, output the folded equivalent */
7807 STRLEN foldlen,numlen;
7808 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7809 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7810
7811 /* Quit before exceeding size limit */
7812 if (len + foldlen > U8_MAX) break;
7813
7814 for (foldbuf = tmpbuf;
7815 foldlen;
7816 foldlen -= numlen)
7817 {
7818 cp = utf8_to_uvchr(foldbuf, &numlen);
7819 if (numlen > 0) {
7820 const STRLEN unilen = reguni(pRExC_state, cp, s);
7821 s += unilen;
7822 len += unilen;
7823 /* In EBCDIC the numlen and unilen can differ. */
7824 foldbuf += numlen;
7825 if (numlen >= foldlen)
7826 break;
7827 }
7828 else
7829 break; /* "Can't happen." */
7830 }
7831 }
7832
7833 /* Point to the beginning of the next character in the sequence. */
7834 RExC_parse = endchar + 1;
7835
7836 /* Quit if no more characters */
7837 if (RExC_parse >= endbrace) break;
7838 }
7839
7840
7841 if (SIZE_ONLY) {
7842 if (RExC_parse < endbrace) {
7843 ckWARNreg(RExC_parse - 1,
7844 "Using just the first characters returned by \\N{}");
7845 }
7846
7847 RExC_size += STR_SZ(len);
7848 } else {
7849 STR_LEN(ret) = len;
7850 RExC_emit += STR_SZ(len);
7851 }
7852
7853 RExC_parse = endbrace + 1;
7854
7855 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7856 with malformed in t/re/pat_advanced.t */
7857 RExC_parse --;
7858 Set_Node_Cur_Length(ret); /* MJD */
7859 nextchar(pRExC_state);
7860 }
7861
7862 return ret;
fc8cd66c
YO
7863}
7864
7865
9e08bc66
TS
7866/*
7867 * reg_recode
7868 *
7869 * It returns the code point in utf8 for the value in *encp.
7870 * value: a code value in the source encoding
7871 * encp: a pointer to an Encode object
7872 *
7873 * If the result from Encode is not a single character,
7874 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7875 */
7876STATIC UV
7877S_reg_recode(pTHX_ const char value, SV **encp)
7878{
7879 STRLEN numlen = 1;
59cd0e26 7880 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7881 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7882 const STRLEN newlen = SvCUR(sv);
7883 UV uv = UNICODE_REPLACEMENT;
7884
7918f24d
NC
7885 PERL_ARGS_ASSERT_REG_RECODE;
7886
9e08bc66
TS
7887 if (newlen)
7888 uv = SvUTF8(sv)
7889 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7890 : *(U8*)s;
7891
7892 if (!newlen || numlen != newlen) {
7893 uv = UNICODE_REPLACEMENT;
c86f7df5 7894 *encp = NULL;
9e08bc66
TS
7895 }
7896 return uv;
7897}
7898
fc8cd66c 7899
a687059c
LW
7900/*
7901 - regatom - the lowest level
ee9b8eae
YO
7902
7903 Try to identify anything special at the start of the pattern. If there
7904 is, then handle it as required. This may involve generating a single regop,
7905 such as for an assertion; or it may involve recursing, such as to
7906 handle a () structure.
7907
7908 If the string doesn't start with something special then we gobble up
7909 as much literal text as we can.
7910
7911 Once we have been able to handle whatever type of thing started the
7912 sequence, we return.
7913
7914 Note: we have to be careful with escapes, as they can be both literal
7915 and special, and in the case of \10 and friends can either, depending
486ec47a 7916 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
7917 escape sequences, with the one for handling literal escapes requiring
7918 a dummy entry for all of the special escapes that are actually handled
7919 by the other.
7920*/
7921
76e3520e 7922STATIC regnode *
3dab1dad 7923S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7924{
97aff369 7925 dVAR;
cbbf8932 7926 register regnode *ret = NULL;
a0d0e21e 7927 I32 flags;
45948336 7928 char *parse_start = RExC_parse;
980866de 7929 U8 op;
3dab1dad
YO
7930 GET_RE_DEBUG_FLAGS_DECL;
7931 DEBUG_PARSE("atom");
a0d0e21e
LW
7932 *flagp = WORST; /* Tentatively. */
7933
7918f24d 7934 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7935
a0d0e21e 7936tryagain:
f9a79580 7937 switch ((U8)*RExC_parse) {
a0d0e21e 7938 case '^':
830247a4
IZ
7939 RExC_seen_zerolen++;
7940 nextchar(pRExC_state);
bbe252da 7941 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7942 ret = reg_node(pRExC_state, MBOL);
bbe252da 7943 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7944 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7945 else
830247a4 7946 ret = reg_node(pRExC_state, BOL);
fac92740 7947 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7948 break;
7949 case '$':
830247a4 7950 nextchar(pRExC_state);
b81d288d 7951 if (*RExC_parse)
830247a4 7952 RExC_seen_zerolen++;
bbe252da 7953 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7954 ret = reg_node(pRExC_state, MEOL);
bbe252da 7955 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7956 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7957 else
830247a4 7958 ret = reg_node(pRExC_state, EOL);
fac92740 7959 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7960 break;
7961 case '.':
830247a4 7962 nextchar(pRExC_state);
bbe252da 7963 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7964 ret = reg_node(pRExC_state, SANY);
7965 else
7966 ret = reg_node(pRExC_state, REG_ANY);
7967 *flagp |= HASWIDTH|SIMPLE;
830247a4 7968 RExC_naughty++;
fac92740 7969 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7970 break;
7971 case '[':
b45f050a 7972 {
3dab1dad
YO
7973 char * const oregcomp_parse = ++RExC_parse;
7974 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7975 if (*RExC_parse != ']') {
7976 RExC_parse = oregcomp_parse;
b45f050a
JF
7977 vFAIL("Unmatched [");
7978 }
830247a4 7979 nextchar(pRExC_state);
a0d0e21e 7980 *flagp |= HASWIDTH|SIMPLE;
fac92740 7981 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7982 break;
b45f050a 7983 }
a0d0e21e 7984 case '(':
830247a4 7985 nextchar(pRExC_state);
3dab1dad 7986 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7987 if (ret == NULL) {
bf93d4cc 7988 if (flags & TRYAGAIN) {
830247a4 7989 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7990 /* Make parent create an empty node if needed. */
7991 *flagp |= TRYAGAIN;
7992 return(NULL);
7993 }
a0d0e21e 7994 goto tryagain;
bf93d4cc 7995 }
a0d0e21e
LW
7996 return(NULL);
7997 }
a3b492c3 7998 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7999 break;
8000 case '|':
8001 case ')':
8002 if (flags & TRYAGAIN) {
8003 *flagp |= TRYAGAIN;
8004 return NULL;
8005 }
b45f050a 8006 vFAIL("Internal urp");
a0d0e21e
LW
8007 /* Supposed to be caught earlier. */
8008 break;
85afd4ae 8009 case '{':
830247a4
IZ
8010 if (!regcurly(RExC_parse)) {
8011 RExC_parse++;
85afd4ae
CS
8012 goto defchar;
8013 }
8014 /* FALL THROUGH */
a0d0e21e
LW
8015 case '?':
8016 case '+':
8017 case '*':
830247a4 8018 RExC_parse++;
b45f050a 8019 vFAIL("Quantifier follows nothing");
a0d0e21e 8020 break;
ced7f090
KW
8021 case LATIN_SMALL_LETTER_SHARP_S:
8022 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8023 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8024#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8025#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
8026 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8027#endif
a0a388a1 8028 do_foldchar:
56d400ed 8029 if (!LOC && FOLD) {
e64b1bd1 8030 U32 len,cp;
7cf3a6a3 8031 len=0; /* silence a spurious compiler warning */
56d400ed 8032 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
8033 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8034 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8035 ret = reganode(pRExC_state, FOLDCHAR, cp);
8036 Set_Node_Length(ret, 1); /* MJD */
8037 nextchar(pRExC_state); /* kill whitespace under /x */
8038 return ret;
8039 }
8040 }
8041 goto outer_default;
a0d0e21e 8042 case '\\':
ee9b8eae
YO
8043 /* Special Escapes
8044
8045 This switch handles escape sequences that resolve to some kind
8046 of special regop and not to literal text. Escape sequnces that
8047 resolve to literal text are handled below in the switch marked
8048 "Literal Escapes".
8049
8050 Every entry in this switch *must* have a corresponding entry
8051 in the literal escape switch. However, the opposite is not
8052 required, as the default for this switch is to jump to the
8053 literal text handling code.
8054 */
a0a388a1 8055 switch ((U8)*++RExC_parse) {
ced7f090
KW
8056 case LATIN_SMALL_LETTER_SHARP_S:
8057 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8058 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 8059 goto do_foldchar;
ee9b8eae 8060 /* Special Escapes */
a0d0e21e 8061 case 'A':
830247a4
IZ
8062 RExC_seen_zerolen++;
8063 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 8064 *flagp |= SIMPLE;
ee9b8eae 8065 goto finish_meta_pat;
a0d0e21e 8066 case 'G':
830247a4
IZ
8067 ret = reg_node(pRExC_state, GPOS);
8068 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 8069 *flagp |= SIMPLE;
ee9b8eae
YO
8070 goto finish_meta_pat;
8071 case 'K':
8072 RExC_seen_zerolen++;
8073 ret = reg_node(pRExC_state, KEEPS);
8074 *flagp |= SIMPLE;
37923168
RGS
8075 /* XXX:dmq : disabling in-place substitution seems to
8076 * be necessary here to avoid cases of memory corruption, as
8077 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8078 */
8079 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 8080 goto finish_meta_pat;
a0d0e21e 8081 case 'Z':
830247a4 8082 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 8083 *flagp |= SIMPLE;
a1917ab9 8084 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 8085 goto finish_meta_pat;
b85d18e9 8086 case 'z':
830247a4 8087 ret = reg_node(pRExC_state, EOS);
b85d18e9 8088 *flagp |= SIMPLE;
830247a4 8089 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 8090 goto finish_meta_pat;
4a2d328f 8091 case 'C':
f33976b4
DB
8092 ret = reg_node(pRExC_state, CANY);
8093 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 8094 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8095 goto finish_meta_pat;
a0ed51b3 8096 case 'X':
830247a4 8097 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 8098 *flagp |= HASWIDTH;
ee9b8eae 8099 goto finish_meta_pat;
a0d0e21e 8100 case 'w':
980866de
KW
8101 switch (get_regex_charset(RExC_flags)) {
8102 case REGEX_LOCALE_CHARSET:
8103 op = ALNUML;
8104 break;
8105 case REGEX_UNICODE_CHARSET:
8106 op = ALNUMU;
8107 break;
cfaf538b 8108 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8109 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8110 op = ALNUMA;
8111 break;
980866de
KW
8112 case REGEX_DEPENDS_CHARSET:
8113 op = ALNUM;
8114 break;
8115 default:
8116 goto bad_charset;
a12cf05f 8117 }
980866de 8118 ret = reg_node(pRExC_state, op);
a0d0e21e 8119 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8120 goto finish_meta_pat;
a0d0e21e 8121 case 'W':
980866de
KW
8122 switch (get_regex_charset(RExC_flags)) {
8123 case REGEX_LOCALE_CHARSET:
8124 op = NALNUML;
8125 break;
8126 case REGEX_UNICODE_CHARSET:
8127 op = NALNUMU;
8128 break;
cfaf538b 8129 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8130 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8131 op = NALNUMA;
8132 break;
980866de
KW
8133 case REGEX_DEPENDS_CHARSET:
8134 op = NALNUM;
8135 break;
8136 default:
8137 goto bad_charset;
a12cf05f 8138 }
980866de 8139 ret = reg_node(pRExC_state, op);
a0d0e21e 8140 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8141 goto finish_meta_pat;
a0d0e21e 8142 case 'b':
830247a4
IZ
8143 RExC_seen_zerolen++;
8144 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8145 switch (get_regex_charset(RExC_flags)) {
8146 case REGEX_LOCALE_CHARSET:
8147 op = BOUNDL;
8148 break;
8149 case REGEX_UNICODE_CHARSET:
8150 op = BOUNDU;
8151 break;
cfaf538b 8152 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8153 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8154 op = BOUNDA;
8155 break;
63ac0dad
KW
8156 case REGEX_DEPENDS_CHARSET:
8157 op = BOUND;
8158 break;
8159 default:
8160 goto bad_charset;
a12cf05f 8161 }
63ac0dad 8162 ret = reg_node(pRExC_state, op);
b988e673 8163 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8164 *flagp |= SIMPLE;
5024bc2d
KW
8165 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8166 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8167 }
ee9b8eae 8168 goto finish_meta_pat;
a0d0e21e 8169 case 'B':
830247a4
IZ
8170 RExC_seen_zerolen++;
8171 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
8172 switch (get_regex_charset(RExC_flags)) {
8173 case REGEX_LOCALE_CHARSET:
8174 op = NBOUNDL;
8175 break;
8176 case REGEX_UNICODE_CHARSET:
8177 op = NBOUNDU;
8178 break;
cfaf538b 8179 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8180 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8181 op = NBOUNDA;
8182 break;
63ac0dad
KW
8183 case REGEX_DEPENDS_CHARSET:
8184 op = NBOUND;
8185 break;
8186 default:
8187 goto bad_charset;
a12cf05f 8188 }
63ac0dad 8189 ret = reg_node(pRExC_state, op);
b988e673 8190 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 8191 *flagp |= SIMPLE;
5024bc2d
KW
8192 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8193 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8194 }
ee9b8eae 8195 goto finish_meta_pat;
a0d0e21e 8196 case 's':
980866de
KW
8197 switch (get_regex_charset(RExC_flags)) {
8198 case REGEX_LOCALE_CHARSET:
8199 op = SPACEL;
8200 break;
8201 case REGEX_UNICODE_CHARSET:
8202 op = SPACEU;
8203 break;
cfaf538b 8204 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8205 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8206 op = SPACEA;
8207 break;
980866de
KW
8208 case REGEX_DEPENDS_CHARSET:
8209 op = SPACE;
8210 break;
8211 default:
8212 goto bad_charset;
a12cf05f 8213 }
980866de 8214 ret = reg_node(pRExC_state, op);
a0d0e21e 8215 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8216 goto finish_meta_pat;
a0d0e21e 8217 case 'S':
980866de
KW
8218 switch (get_regex_charset(RExC_flags)) {
8219 case REGEX_LOCALE_CHARSET:
8220 op = NSPACEL;
8221 break;
8222 case REGEX_UNICODE_CHARSET:
8223 op = NSPACEU;
8224 break;
cfaf538b 8225 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8226 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8227 op = NSPACEA;
8228 break;
980866de
KW
8229 case REGEX_DEPENDS_CHARSET:
8230 op = NSPACE;
8231 break;
8232 default:
8233 goto bad_charset;
a12cf05f 8234 }
980866de 8235 ret = reg_node(pRExC_state, op);
a0d0e21e 8236 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8237 goto finish_meta_pat;
a0d0e21e 8238 case 'd':
56ae17b4
KW
8239 switch (get_regex_charset(RExC_flags)) {
8240 case REGEX_LOCALE_CHARSET:
8241 op = DIGITL;
8242 break;
cfaf538b 8243 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8244 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8245 op = DIGITA;
8246 break;
56ae17b4
KW
8247 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8248 case REGEX_UNICODE_CHARSET:
8249 op = DIGIT;
8250 break;
8251 default:
8252 goto bad_charset;
6ab9ea91 8253 }
56ae17b4 8254 ret = reg_node(pRExC_state, op);
a0d0e21e 8255 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8256 goto finish_meta_pat;
a0d0e21e 8257 case 'D':
56ae17b4
KW
8258 switch (get_regex_charset(RExC_flags)) {
8259 case REGEX_LOCALE_CHARSET:
8260 op = NDIGITL;
8261 break;
cfaf538b 8262 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 8263 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
8264 op = NDIGITA;
8265 break;
56ae17b4
KW
8266 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8267 case REGEX_UNICODE_CHARSET:
8268 op = NDIGIT;
8269 break;
8270 default:
8271 goto bad_charset;
6ab9ea91 8272 }
56ae17b4 8273 ret = reg_node(pRExC_state, op);
a0d0e21e 8274 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8275 goto finish_meta_pat;
e1d1eefb
YO
8276 case 'R':
8277 ret = reg_node(pRExC_state, LNBREAK);
8278 *flagp |= HASWIDTH|SIMPLE;
8279 goto finish_meta_pat;
8280 case 'h':
8281 ret = reg_node(pRExC_state, HORIZWS);
8282 *flagp |= HASWIDTH|SIMPLE;
8283 goto finish_meta_pat;
8284 case 'H':
8285 ret = reg_node(pRExC_state, NHORIZWS);
8286 *flagp |= HASWIDTH|SIMPLE;
8287 goto finish_meta_pat;
ee9b8eae 8288 case 'v':
e1d1eefb
YO
8289 ret = reg_node(pRExC_state, VERTWS);
8290 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
8291 goto finish_meta_pat;
8292 case 'V':
e1d1eefb
YO
8293 ret = reg_node(pRExC_state, NVERTWS);
8294 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 8295 finish_meta_pat:
830247a4 8296 nextchar(pRExC_state);
fac92740 8297 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 8298 break;
a14b48bc
LW
8299 case 'p':
8300 case 'P':
3568d838 8301 {
3dab1dad 8302 char* const oldregxend = RExC_end;
d008bc60 8303#ifdef DEBUGGING
ccb2c380 8304 char* parse_start = RExC_parse - 2;
d008bc60 8305#endif
a14b48bc 8306
830247a4 8307 if (RExC_parse[1] == '{') {
3568d838 8308 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
8309 RExC_end = strchr(RExC_parse, '}');
8310 if (!RExC_end) {
3dab1dad 8311 const U8 c = (U8)*RExC_parse;
830247a4
IZ
8312 RExC_parse += 2;
8313 RExC_end = oldregxend;
0da60cf5 8314 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 8315 }
830247a4 8316 RExC_end++;
a14b48bc 8317 }
af6f566e 8318 else {
830247a4 8319 RExC_end = RExC_parse + 2;
af6f566e
HS
8320 if (RExC_end > oldregxend)
8321 RExC_end = oldregxend;
8322 }
830247a4 8323 RExC_parse--;
a14b48bc 8324
3dab1dad 8325 ret = regclass(pRExC_state,depth+1);
a14b48bc 8326
830247a4
IZ
8327 RExC_end = oldregxend;
8328 RExC_parse--;
ccb2c380
MP
8329
8330 Set_Node_Offset(ret, parse_start + 2);
8331 Set_Node_Cur_Length(ret);
830247a4 8332 nextchar(pRExC_state);
a14b48bc
LW
8333 *flagp |= HASWIDTH|SIMPLE;
8334 }
8335 break;
fc8cd66c 8336 case 'N':
afefe6bf 8337 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
8338 multicharacter. join_exact() will join them up later on.
8339 Also this makes sure that things like /\N{BLAH}+/ and
8340 \N{BLAH} being multi char Just Happen. dmq*/
8341 ++RExC_parse;
afefe6bf 8342 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 8343 break;
0a4db386 8344 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 8345 parse_named_seq:
81714fb9
YO
8346 {
8347 char ch= RExC_parse[1];
1f1031fe
YO
8348 if (ch != '<' && ch != '\'' && ch != '{') {
8349 RExC_parse++;
8350 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 8351 } else {
1f1031fe
YO
8352 /* this pretty much dupes the code for (?P=...) in reg(), if
8353 you change this make sure you change that */
81714fb9 8354 char* name_start = (RExC_parse += 2);
2eccd3b2 8355 U32 num = 0;
0a4db386
YO
8356 SV *sv_dat = reg_scan_name(pRExC_state,
8357 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 8358 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 8359 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
8360 vFAIL2("Sequence %.3s... not terminated",parse_start);
8361
8362 if (!SIZE_ONLY) {
8363 num = add_data( pRExC_state, 1, "S" );
8364 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 8365 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
8366 }
8367
81714fb9
YO
8368 RExC_sawback = 1;
8369 ret = reganode(pRExC_state,
4444fd9f
KW
8370 ((! FOLD)
8371 ? NREF
2f7f8cb1
KW
8372 : (MORE_ASCII_RESTRICTED)
8373 ? NREFFA
8374 : (AT_LEAST_UNI_SEMANTICS)
8375 ? NREFFU
8376 : (LOC)
8377 ? NREFFL
8378 : NREFF),
4444fd9f 8379 num);
81714fb9 8380 *flagp |= HASWIDTH;
1f1031fe 8381
81714fb9
YO
8382 /* override incorrect value set in reganode MJD */
8383 Set_Node_Offset(ret, parse_start+1);
8384 Set_Node_Cur_Length(ret); /* MJD */
8385 nextchar(pRExC_state);
1f1031fe 8386
81714fb9
YO
8387 }
8388 break;
1f1031fe 8389 }
2bf803e2 8390 case 'g':
a0d0e21e
LW
8391 case '1': case '2': case '3': case '4':
8392 case '5': case '6': case '7': case '8': case '9':
8393 {
c74340f9 8394 I32 num;
2bf803e2
YO
8395 bool isg = *RExC_parse == 'g';
8396 bool isrel = 0;
8397 bool hasbrace = 0;
8398 if (isg) {
c74340f9 8399 RExC_parse++;
2bf803e2
YO
8400 if (*RExC_parse == '{') {
8401 RExC_parse++;
8402 hasbrace = 1;
8403 }
8404 if (*RExC_parse == '-') {
8405 RExC_parse++;
8406 isrel = 1;
8407 }
1f1031fe
YO
8408 if (hasbrace && !isDIGIT(*RExC_parse)) {
8409 if (isrel) RExC_parse--;
8410 RExC_parse -= 2;
8411 goto parse_named_seq;
8412 } }
c74340f9 8413 num = atoi(RExC_parse);
b72d83b2
RGS
8414 if (isg && num == 0)
8415 vFAIL("Reference to invalid group 0");
c74340f9 8416 if (isrel) {
5624f11d 8417 num = RExC_npar - num;
c74340f9
YO
8418 if (num < 1)
8419 vFAIL("Reference to nonexistent or unclosed group");
8420 }
2bf803e2 8421 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
8422 goto defchar;
8423 else {
3dab1dad 8424 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
8425 while (isDIGIT(*RExC_parse))
8426 RExC_parse++;
1f1031fe
YO
8427 if (parse_start == RExC_parse - 1)
8428 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
8429 if (hasbrace) {
8430 if (*RExC_parse != '}')
8431 vFAIL("Unterminated \\g{...} pattern");
8432 RExC_parse++;
8433 }
c74340f9
YO
8434 if (!SIZE_ONLY) {
8435 if (num > (I32)RExC_rx->nparens)
8436 vFAIL("Reference to nonexistent group");
c74340f9 8437 }
830247a4 8438 RExC_sawback = 1;
eb160463 8439 ret = reganode(pRExC_state,
4444fd9f
KW
8440 ((! FOLD)
8441 ? REF
2f7f8cb1
KW
8442 : (MORE_ASCII_RESTRICTED)
8443 ? REFFA
8444 : (AT_LEAST_UNI_SEMANTICS)
8445 ? REFFU
8446 : (LOC)
8447 ? REFFL
8448 : REFF),
4444fd9f 8449 num);
a0d0e21e 8450 *flagp |= HASWIDTH;
2af232bd 8451
fac92740 8452 /* override incorrect value set in reganode MJD */
2af232bd 8453 Set_Node_Offset(ret, parse_start+1);
fac92740 8454 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
8455 RExC_parse--;
8456 nextchar(pRExC_state);
a0d0e21e
LW
8457 }
8458 }
8459 break;
8460 case '\0':
830247a4 8461 if (RExC_parse >= RExC_end)
b45f050a 8462 FAIL("Trailing \\");
a0d0e21e
LW
8463 /* FALL THROUGH */
8464 default:
a0288114 8465 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 8466 back into the quick-grab loop below */
45948336 8467 parse_start--;
a0d0e21e
LW
8468 goto defchar;
8469 }
8470 break;
4633a7c4
LW
8471
8472 case '#':
bbe252da 8473 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 8474 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
8475 goto tryagain;
8476 }
8477 /* FALL THROUGH */
8478
f9a79580
RGS
8479 default:
8480 outer_default:{
ba210ebe 8481 register STRLEN len;
58ae7d3f 8482 register UV ender;
a0d0e21e 8483 register char *p;
3dab1dad 8484 char *s;
80aecb99 8485 STRLEN foldlen;
89ebb4a3 8486 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6e326e84 8487 regnode * orig_emit;
f06dbbb7
JH
8488
8489 parse_start = RExC_parse - 1;
a0d0e21e 8490
830247a4 8491 RExC_parse++;
a0d0e21e
LW
8492
8493 defchar:
58ae7d3f 8494 ender = 0;
6e326e84
KW
8495 orig_emit = RExC_emit; /* Save the original output node position in
8496 case we need to output a different node
8497 type */
eb160463 8498 ret = reg_node(pRExC_state,
2c2b7f86
KW
8499 (U8) ((! FOLD) ? EXACT
8500 : (LOC)
8501 ? EXACTFL
2f7f8cb1
KW
8502 : (MORE_ASCII_RESTRICTED)
8503 ? EXACTFA
8504 : (AT_LEAST_UNI_SEMANTICS)
8505 ? EXACTFU
8506 : EXACTF)
2c2b7f86 8507 );
cd439c50 8508 s = STRING(ret);
830247a4
IZ
8509 for (len = 0, p = RExC_parse - 1;
8510 len < 127 && p < RExC_end;
a0d0e21e
LW
8511 len++)
8512 {
3dab1dad 8513 char * const oldp = p;
5b5a24f7 8514
bbe252da 8515 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 8516 p = regwhite( pRExC_state, p );
f9a79580 8517 switch ((U8)*p) {
ced7f090
KW
8518 case LATIN_SMALL_LETTER_SHARP_S:
8519 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8520 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 8521 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 8522 goto normal_default;
a0d0e21e
LW
8523 case '^':
8524 case '$':
8525 case '.':
8526 case '[':
8527 case '(':
8528 case ')':
8529 case '|':
8530 goto loopdone;
8531 case '\\':
ee9b8eae
YO
8532 /* Literal Escapes Switch
8533
8534 This switch is meant to handle escape sequences that
8535 resolve to a literal character.
8536
8537 Every escape sequence that represents something
8538 else, like an assertion or a char class, is handled
8539 in the switch marked 'Special Escapes' above in this
8540 routine, but also has an entry here as anything that
8541 isn't explicitly mentioned here will be treated as
8542 an unescaped equivalent literal.
8543 */
8544
a0a388a1 8545 switch ((U8)*++p) {
ee9b8eae 8546 /* These are all the special escapes. */
ced7f090
KW
8547 case LATIN_SMALL_LETTER_SHARP_S:
8548 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8549 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
8550 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8551 goto normal_default;
ee9b8eae
YO
8552 case 'A': /* Start assertion */
8553 case 'b': case 'B': /* Word-boundary assertion*/
8554 case 'C': /* Single char !DANGEROUS! */
8555 case 'd': case 'D': /* digit class */
8556 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 8557 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
8558 case 'k': case 'K': /* named backref, keep marker */
8559 case 'N': /* named char sequence */
38a44b82 8560 case 'p': case 'P': /* Unicode property */
e1d1eefb 8561 case 'R': /* LNBREAK */
ee9b8eae 8562 case 's': case 'S': /* space class */
e1d1eefb 8563 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
8564 case 'w': case 'W': /* word class */
8565 case 'X': /* eXtended Unicode "combining character sequence" */
8566 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
8567 --p;
8568 goto loopdone;
ee9b8eae
YO
8569
8570 /* Anything after here is an escape that resolves to a
8571 literal. (Except digits, which may or may not)
8572 */
a0d0e21e
LW
8573 case 'n':
8574 ender = '\n';
8575 p++;
a687059c 8576 break;
a0d0e21e
LW
8577 case 'r':
8578 ender = '\r';
8579 p++;
a687059c 8580 break;
a0d0e21e
LW
8581 case 't':
8582 ender = '\t';
8583 p++;
a687059c 8584 break;
a0d0e21e
LW
8585 case 'f':
8586 ender = '\f';
8587 p++;
a687059c 8588 break;
a0d0e21e 8589 case 'e':
c7f1f016 8590 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 8591 p++;
a687059c 8592 break;
a0d0e21e 8593 case 'a':
c7f1f016 8594 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 8595 p++;
a687059c 8596 break;
f0a2b745
KW
8597 case 'o':
8598 {
8599 STRLEN brace_len = len;
00c0cb6d 8600 UV result;
454155d9
KW
8601 const char* error_msg;
8602
8603 bool valid = grok_bslash_o(p,
8604 &result,
8605 &brace_len,
8606 &error_msg,
8607 1);
8608 p += brace_len;
8609 if (! valid) {
8610 RExC_parse = p; /* going to die anyway; point
8611 to exact spot of failure */
f0a2b745
KW
8612 vFAIL(error_msg);
8613 }
00c0cb6d
DG
8614 else
8615 {
8616 ender = result;
8617 }
f0a2b745
KW
8618 if (PL_encoding && ender < 0x100) {
8619 goto recode_encoding;
8620 }
8621 if (ender > 0xff) {
62fed28b 8622 REQUIRE_UTF8;
f0a2b745
KW
8623 }
8624 break;
8625 }
a0d0e21e 8626 case 'x':
a0ed51b3 8627 if (*++p == '{') {
1df70142 8628 char* const e = strchr(p, '}');
b81d288d 8629
b45f050a 8630 if (!e) {
830247a4 8631 RExC_parse = p + 1;
b45f050a
JF
8632 vFAIL("Missing right brace on \\x{}");
8633 }
de5f0749 8634 else {
a4c04bdc
NC
8635 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8636 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 8637 STRLEN numlen = e - p - 1;
53305cf1 8638 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 8639 if (ender > 0xff)
62fed28b 8640 REQUIRE_UTF8;
a0ed51b3
LW
8641 p = e + 1;
8642 }
a0ed51b3
LW
8643 }
8644 else {
a4c04bdc 8645 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 8646 STRLEN numlen = 2;
53305cf1 8647 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
8648 p += numlen;
8649 }
9e08bc66
TS
8650 if (PL_encoding && ender < 0x100)
8651 goto recode_encoding;
a687059c 8652 break;
a0d0e21e
LW
8653 case 'c':
8654 p++;
17a3df4c 8655 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
a687059c 8656 break;
a0d0e21e
LW
8657 case '0': case '1': case '2': case '3':case '4':
8658 case '5': case '6': case '7': case '8':case '9':
8659 if (*p == '0' ||
ca67da41 8660 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
8661 {
8662 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 8663 STRLEN numlen = 3;
53305cf1 8664 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 8665 if (ender > 0xff) {
62fed28b 8666 REQUIRE_UTF8;
609122bd 8667 }
a0d0e21e
LW
8668 p += numlen;
8669 }
8670 else {
8671 --p;
8672 goto loopdone;
a687059c 8673 }
9e08bc66
TS
8674 if (PL_encoding && ender < 0x100)
8675 goto recode_encoding;
8676 break;
8677 recode_encoding:
8678 {
8679 SV* enc = PL_encoding;
8680 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
8681 if (!enc && SIZE_ONLY)
8682 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 8683 REQUIRE_UTF8;
9e08bc66 8684 }
a687059c 8685 break;
a0d0e21e 8686 case '\0':
830247a4 8687 if (p >= RExC_end)
b45f050a 8688 FAIL("Trailing \\");
a687059c 8689 /* FALL THROUGH */
a0d0e21e 8690 default:
216bfc0a
KW
8691 if (!SIZE_ONLY&& isALPHA(*p)) {
8692 /* Include any { following the alpha to emphasize
8693 * that it could be part of an escape at some point
8694 * in the future */
8695 int len = (*(p + 1) == '{') ? 2 : 1;
8696 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8697 }
a0ed51b3 8698 goto normal_default;
a0d0e21e
LW
8699 }
8700 break;
a687059c 8701 default:
a0ed51b3 8702 normal_default:
fd400ab9 8703 if (UTF8_IS_START(*p) && UTF) {
1df70142 8704 STRLEN numlen;
5e12f4fb 8705 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 8706 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
8707 p += numlen;
8708 }
8709 else
5b67c30a 8710 ender = (U8) *p++;
a0d0e21e 8711 break;
7e2509c1
KW
8712 } /* End of switch on the literal */
8713
6e326e84
KW
8714 /* Certain characters are problematic because their folded
8715 * length is so different from their original length that it
8716 * isn't handleable by the optimizer. They are therefore not
8717 * placed in an EXACTish node; and are here handled specially.
8718 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8719 * putting it in a special node keeps regexec from having to
8720 * deal with a non-utf8 multi-char fold */
2f7f8cb1 8721 if (FOLD
e074b0e5 8722 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
2f7f8cb1
KW
8723 && is_TRICKYFOLD_cp(ender))
8724 {
6e326e84
KW
8725 /* If is in middle of outputting characters into an
8726 * EXACTish node, go output what we have so far, and
8727 * position the parse so that this will be called again
8728 * immediately */
8729 if (len) {
8730 p = RExC_parse + len - 1;
8731 goto loopdone;
8732 }
8733 else {
8734
8735 /* Here we are ready to output our tricky fold
8736 * character. What's done is to pretend it's in a
8737 * [bracketed] class, and let the code that deals with
8738 * those handle it, as that code has all the
8739 * intelligence necessary. First save the current
8740 * parse state, get rid of the already allocated EXACT
8741 * node that the ANYOFV node will replace, and point
8742 * the parse to a buffer which we fill with the
8743 * character we want the regclass code to think is
8744 * being parsed */
8745 char* const oldregxend = RExC_end;
8746 char tmpbuf[2];
8747 RExC_emit = orig_emit;
8748 RExC_parse = tmpbuf;
8749 if (UTF) {
8750 tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8751 tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8752 RExC_end = RExC_parse + 2;
8753 }
8754 else {
cd64649c 8755 tmpbuf[0] = (char) ender;
6e326e84
KW
8756 RExC_end = RExC_parse + 1;
8757 }
8758
8759 ret = regclass(pRExC_state,depth+1);
8760
8761 /* Here, have parsed the buffer. Reset the parse to
8762 * the actual input, and return */
8763 RExC_end = oldregxend;
8764 RExC_parse = p - 1;
8765
8766 Set_Node_Offset(ret, RExC_parse);
8767 Set_Node_Cur_Length(ret);
8768 nextchar(pRExC_state);
8769 *flagp |= HASWIDTH|SIMPLE;
8770 return ret;
8771 }
8772 }
8773
bcdf7404
YO
8774 if ( RExC_flags & RXf_PMf_EXTENDED)
8775 p = regwhite( pRExC_state, p );
60a8b682 8776 if (UTF && FOLD) {
17580e7a
KW
8777 /* Prime the casefolded buffer. Locale rules, which apply
8778 * only to code points < 256, aren't known until execution,
8779 * so for them, just output the original character using
8780 * utf8 */
8781 if (LOC && ender < 256) {
8782 if (UNI_IS_INVARIANT(ender)) {
8783 *tmpbuf = (U8) ender;
8784 foldlen = 1;
8785 } else {
8786 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8787 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8788 foldlen = 2;
8789 }
8790 }
8791 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8792 */
2f7f8cb1 8793 ender = toLOWER(ender);
cd64649c 8794 *tmpbuf = (U8) ender;
2f7f8cb1
KW
8795 foldlen = 1;
8796 }
17580e7a
KW
8797 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8798
8799 /* Locale and /aa require more selectivity about the
8800 * fold, so are handled below. Otherwise, here, just
8801 * use the fold */
2f7f8cb1
KW
8802 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8803 }
8804 else {
17580e7a
KW
8805 /* Under locale rules or /aa we are not to mix,
8806 * respectively, ords < 256 or ASCII with non-. So
8807 * reject folds that mix them, using only the
8808 * non-folded code point. So do the fold to a
8809 * temporary, and inspect each character in it. */
2f7f8cb1
KW
8810 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8811 U8* s = trialbuf;
8812 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8813 U8* e = s + foldlen;
8814 bool fold_ok = TRUE;
8815
8816 while (s < e) {
17580e7a
KW
8817 if (isASCII(*s)
8818 || (LOC && (UTF8_IS_INVARIANT(*s)
8819 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8820 {
2f7f8cb1
KW
8821 fold_ok = FALSE;
8822 break;
8823 }
8824 s += UTF8SKIP(s);
8825 }
8826 if (fold_ok) {
8827 Copy(trialbuf, tmpbuf, foldlen, U8);
8828 ender = tmpender;
8829 }
8830 else {
8831 uvuni_to_utf8(tmpbuf, ender);
8832 foldlen = UNISKIP(ender);
8833 }
8834 }
60a8b682 8835 }
bcdf7404 8836 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
8837 if (len)
8838 p = oldp;
16ea2a2e 8839 else if (UTF) {
80aecb99 8840 if (FOLD) {
60a8b682 8841 /* Emit all the Unicode characters. */
1df70142 8842 STRLEN numlen;
80aecb99
JH
8843 for (foldbuf = tmpbuf;
8844 foldlen;
8845 foldlen -= numlen) {
8846 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8847 if (numlen > 0) {
71207a34 8848 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8849 s += unilen;
8850 len += unilen;
8851 /* In EBCDIC the numlen
8852 * and unilen can differ. */
9dc45d57 8853 foldbuf += numlen;
47654450
JH
8854 if (numlen >= foldlen)
8855 break;
9dc45d57
JH
8856 }
8857 else
8858 break; /* "Can't happen." */
80aecb99
JH
8859 }
8860 }
8861 else {
71207a34 8862 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8863 if (unilen > 0) {
0ebc6274
JH
8864 s += unilen;
8865 len += unilen;
9dc45d57 8866 }
80aecb99 8867 }
a0ed51b3 8868 }
a0d0e21e
LW
8869 else {
8870 len++;
eb160463 8871 REGC((char)ender, s++);
a0d0e21e
LW
8872 }
8873 break;
a687059c 8874 }
16ea2a2e 8875 if (UTF) {
80aecb99 8876 if (FOLD) {
60a8b682 8877 /* Emit all the Unicode characters. */
1df70142 8878 STRLEN numlen;
80aecb99
JH
8879 for (foldbuf = tmpbuf;
8880 foldlen;
8881 foldlen -= numlen) {
8882 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 8883 if (numlen > 0) {
71207a34 8884 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
8885 len += unilen;
8886 s += unilen;
8887 /* In EBCDIC the numlen
8888 * and unilen can differ. */
9dc45d57 8889 foldbuf += numlen;
47654450
JH
8890 if (numlen >= foldlen)
8891 break;
9dc45d57
JH
8892 }
8893 else
8894 break;
80aecb99
JH
8895 }
8896 }
8897 else {
71207a34 8898 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 8899 if (unilen > 0) {
0ebc6274
JH
8900 s += unilen;
8901 len += unilen;
9dc45d57 8902 }
80aecb99
JH
8903 }
8904 len--;
a0ed51b3
LW
8905 }
8906 else
eb160463 8907 REGC((char)ender, s++);
a0d0e21e 8908 }
7e2509c1
KW
8909 loopdone: /* Jumped to when encounters something that shouldn't be in
8910 the node */
830247a4 8911 RExC_parse = p - 1;
fac92740 8912 Set_Node_Cur_Length(ret); /* MJD */
830247a4 8913 nextchar(pRExC_state);
793db0cb
JH
8914 {
8915 /* len is STRLEN which is unsigned, need to copy to signed */
8916 IV iv = len;
8917 if (iv < 0)
8918 vFAIL("Internal disaster");
8919 }
a0d0e21e
LW
8920 if (len > 0)
8921 *flagp |= HASWIDTH;
090f7165 8922 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 8923 *flagp |= SIMPLE;
3dab1dad 8924
cd439c50 8925 if (SIZE_ONLY)
830247a4 8926 RExC_size += STR_SZ(len);
3dab1dad
YO
8927 else {
8928 STR_LEN(ret) = len;
830247a4 8929 RExC_emit += STR_SZ(len);
07be1b83 8930 }
3dab1dad 8931 }
a0d0e21e
LW
8932 break;
8933 }
a687059c 8934
a0d0e21e 8935 return(ret);
980866de
KW
8936
8937/* Jumped to when an unrecognized character set is encountered */
8938bad_charset:
8939 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8940 return(NULL);
a687059c
LW
8941}
8942
873ef191 8943STATIC char *
bcdf7404 8944S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 8945{
bcdf7404 8946 const char *e = RExC_end;
7918f24d
NC
8947
8948 PERL_ARGS_ASSERT_REGWHITE;
8949
5b5a24f7
CS
8950 while (p < e) {
8951 if (isSPACE(*p))
8952 ++p;
8953 else if (*p == '#') {
bcdf7404 8954 bool ended = 0;
5b5a24f7 8955 do {
bcdf7404
YO
8956 if (*p++ == '\n') {
8957 ended = 1;
8958 break;
8959 }
8960 } while (p < e);
8961 if (!ended)
8962 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8963 }
8964 else
8965 break;
8966 }
8967 return p;
8968}
8969
b8c5462f
JH
8970/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8971 Character classes ([:foo:]) can also be negated ([:^foo:]).
8972 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8973 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8974 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8975
8976#define POSIXCC_DONE(c) ((c) == ':')
8977#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8978#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8979
b8c5462f 8980STATIC I32
830247a4 8981S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8982{
97aff369 8983 dVAR;
936ed897 8984 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8985
7918f24d
NC
8986 PERL_ARGS_ASSERT_REGPPOSIXCC;
8987
830247a4 8988 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8989 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8990 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8991 const char c = UCHARAT(RExC_parse);
097eb12c 8992 char* const s = RExC_parse++;
b81d288d 8993
9a86a77b 8994 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8995 RExC_parse++;
8996 if (RExC_parse == RExC_end)
620e46c5 8997 /* Grandfather lone [:, [=, [. */
830247a4 8998 RExC_parse = s;
620e46c5 8999 else {
3dab1dad 9000 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
9001 assert(*t == c);
9002
9a86a77b 9003 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 9004 const char *posixcc = s + 1;
830247a4 9005 RExC_parse++; /* skip over the ending ] */
3dab1dad 9006
b8c5462f 9007 if (*s == ':') {
1df70142
AL
9008 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9009 const I32 skip = t - posixcc;
80916619
NC
9010
9011 /* Initially switch on the length of the name. */
9012 switch (skip) {
9013 case 4:
3dab1dad
YO
9014 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9015 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 9016 break;
80916619
NC
9017 case 5:
9018 /* Names all of length 5. */
9019 /* alnum alpha ascii blank cntrl digit graph lower
9020 print punct space upper */
9021 /* Offset 4 gives the best switch position. */
9022 switch (posixcc[4]) {
9023 case 'a':
3dab1dad
YO
9024 if (memEQ(posixcc, "alph", 4)) /* alpha */
9025 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
9026 break;
9027 case 'e':
3dab1dad
YO
9028 if (memEQ(posixcc, "spac", 4)) /* space */
9029 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
9030 break;
9031 case 'h':
3dab1dad
YO
9032 if (memEQ(posixcc, "grap", 4)) /* graph */
9033 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
9034 break;
9035 case 'i':
3dab1dad
YO
9036 if (memEQ(posixcc, "asci", 4)) /* ascii */
9037 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
9038 break;
9039 case 'k':
3dab1dad
YO
9040 if (memEQ(posixcc, "blan", 4)) /* blank */
9041 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
9042 break;
9043 case 'l':
3dab1dad
YO
9044 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9045 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
9046 break;
9047 case 'm':
3dab1dad
YO
9048 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9049 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
9050 break;
9051 case 'r':
3dab1dad
YO
9052 if (memEQ(posixcc, "lowe", 4)) /* lower */
9053 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9054 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9055 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
9056 break;
9057 case 't':
3dab1dad
YO
9058 if (memEQ(posixcc, "digi", 4)) /* digit */
9059 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9060 else if (memEQ(posixcc, "prin", 4)) /* print */
9061 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9062 else if (memEQ(posixcc, "punc", 4)) /* punct */
9063 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 9064 break;
b8c5462f
JH
9065 }
9066 break;
80916619 9067 case 6:
3dab1dad
YO
9068 if (memEQ(posixcc, "xdigit", 6))
9069 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
9070 break;
9071 }
80916619
NC
9072
9073 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
9074 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9075 t - s - 1, s + 1);
80916619
NC
9076 assert (posixcc[skip] == ':');
9077 assert (posixcc[skip+1] == ']');
b45f050a 9078 } else if (!SIZE_ONLY) {
b8c5462f 9079 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 9080
830247a4 9081 /* adjust RExC_parse so the warning shows after
b45f050a 9082 the class closes */
9a86a77b 9083 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 9084 RExC_parse++;
b45f050a
JF
9085 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9086 }
b8c5462f
JH
9087 } else {
9088 /* Maternal grandfather:
9089 * "[:" ending in ":" but not in ":]" */
830247a4 9090 RExC_parse = s;
767d463e 9091 }
620e46c5
JH
9092 }
9093 }
9094
b8c5462f
JH
9095 return namedclass;
9096}
9097
9098STATIC void
830247a4 9099S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 9100{
97aff369 9101 dVAR;
7918f24d
NC
9102
9103 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9104
3dab1dad 9105 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
9106 const char *s = RExC_parse;
9107 const char c = *s++;
b8c5462f 9108
3dab1dad 9109 while (isALNUM(*s))
b8c5462f
JH
9110 s++;
9111 if (*s && c == *s && s[1] == ']') {
668c081a
NC
9112 ckWARN3reg(s+2,
9113 "POSIX syntax [%c %c] belongs inside character classes",
9114 c, c);
b45f050a
JF
9115
9116 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 9117 if (POSIXCC_NOTYET(c)) {
830247a4 9118 /* adjust RExC_parse so the error shows after
b45f050a 9119 the class closes */
9a86a77b 9120 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 9121 NOOP;
b45f050a
JF
9122 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9123 }
b8c5462f
JH
9124 }
9125 }
620e46c5
JH
9126}
9127
003331de
KW
9128/* No locale test, and always Unicode semantics */
9129#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9130ANYOF_##NAME: \
9131 for (value = 0; value < 256; value++) \
9132 if (TEST) \
5bfec14d 9133 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9134 yesno = '+'; \
9135 what = WORD; \
9136 break; \
9137case ANYOF_N##NAME: \
9138 for (value = 0; value < 256; value++) \
9139 if (!TEST) \
5bfec14d 9140 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9141 yesno = '!'; \
9142 what = WORD; \
e1d1eefb 9143 break
89836f1f 9144
a12cf05f
KW
9145/* Like the above, but there are differences if we are in uni-8-bit or not, so
9146 * there are two tests passed in, to use depending on that. There aren't any
9147 * cases where the label is different from the name, so no need for that
9148 * parameter */
f952827c 9149#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
003331de
KW
9150ANYOF_##NAME: \
9151 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9152 else if (UNI_SEMANTICS) { \
9153 for (value = 0; value < 256; value++) { \
f952827c 9154 if (TEST_8(value)) stored += \
5bfec14d 9155 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9156 } \
9157 } \
9158 else { \
9159 for (value = 0; value < 128; value++) { \
f952827c 9160 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
4c9daa0a 9161 set_regclass_bit(pRExC_state, ret, \
5bfec14d 9162 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9163 } \
9164 } \
9165 yesno = '+'; \
9166 what = WORD; \
9167 break; \
9168case ANYOF_N##NAME: \
9169 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9170 else if (UNI_SEMANTICS) { \
9171 for (value = 0; value < 256; value++) { \
f952827c 9172 if (! TEST_8(value)) stored += \
5bfec14d 9173 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
003331de
KW
9174 } \
9175 } \
9176 else { \
9177 for (value = 0; value < 128; value++) { \
4c9daa0a 9178 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
5bfec14d 9179 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
003331de 9180 } \
2f7f8cb1 9181 if (AT_LEAST_ASCII_RESTRICTED) { \
cfaf538b 9182 for (value = 128; value < 256; value++) { \
4c9daa0a 9183 stored += set_regclass_bit( \
5bfec14d 9184 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
cfaf538b 9185 } \
137165a6 9186 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
cfaf538b
KW
9187 } \
9188 else { \
9189 /* For a non-ut8 target string with DEPENDS semantics, all above \
9190 * ASCII Latin1 code points match the complement of any of the \
9191 * classes. But in utf8, they have their Unicode semantics, so \
9192 * can't just set them in the bitmap, or else regexec.c will think \
9193 * they matched when they shouldn't. */ \
137165a6 9194 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
cfaf538b 9195 } \
003331de
KW
9196 } \
9197 yesno = '!'; \
9198 what = WORD; \
a12cf05f
KW
9199 break
9200
2283d326 9201STATIC U8
2c6aa593 9202S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
9203{
9204
9205 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9206 * Locale folding is done at run-time, so this function should not be
9207 * called for nodes that are for locales.
9208 *
d50a4f90 9209 * This function sets the bit corresponding to the fold of the input
2283d326
KW
9210 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9211 * 'F' is 'f'.
9212 *
d50a4f90
KW
9213 * It also knows about the characters that are in the bitmap that have
9214 * folds that are matchable only outside it, and sets the appropriate lists
9215 * and flags.
9216 *
9217 * It returns the number of bits that actually changed from 0 to 1 */
2283d326
KW
9218
9219 U8 stored = 0;
2283d326
KW
9220 U8 fold;
9221
4c9daa0a
KW
9222 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9223
cfaf538b 9224 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2f7f8cb1 9225 : PL_fold[value];
2283d326
KW
9226
9227 /* It assumes the bit for 'value' has already been set */
9228 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9229 ANYOF_BITMAP_SET(node, fold);
9230 stored++;
9231 }
d50a4f90
KW
9232 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9233 /* Certain Latin1 characters have matches outside the bitmap. To get
9234 * here, 'value' is one of those characters. None of these matches is
9235 * valid for ASCII characters under /aa, which have been excluded by
9236 * the 'if' above. The matches fall into three categories:
9237 * 1) They are singly folded-to or -from an above 255 character, as
9238 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9239 * WITH DIAERESIS;
9240 * 2) They are part of a multi-char fold with another character in the
9241 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9242 * 3) They are part of a multi-char fold with a character not in the
9243 * bitmap, such as various ligatures.
9244 * We aren't dealing fully with multi-char folds, except we do deal
9245 * with the pattern containing a character that has a multi-char fold
9246 * (not so much the inverse).
9247 * For types 1) and 3), the matches only happen when the target string
9248 * is utf8; that's not true for 2), and we set a flag for it.
9249 *
9250 * The code below adds to the passed in inversion list the single fold
9251 * closures for 'value'. The values are hard-coded here so that an
9252 * innocent-looking character class, like /[ks]/i won't have to go out
9253 * to disk to find the possible matches. XXX It would be better to
9254 * generate these via regen, in case a new version of the Unicode
9255 * standard adds new mappings, though that is not really likely. */
9256 switch (value) {
9257 case 'k':
9258 case 'K':
9259 /* KELVIN SIGN */
9260 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9261 break;
9262 case 's':
9263 case 'S':
9264 /* LATIN SMALL LETTER LONG S */
9265 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9266 break;
9267 case MICRO_SIGN:
9268 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9269 GREEK_SMALL_LETTER_MU);
9270 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9271 GREEK_CAPITAL_LETTER_MU);
9272 break;
9273 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9274 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9275 /* ANGSTROM SIGN */
9276 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9277 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9278 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9279 PL_fold_latin1[value]);
9280 }
9281 break;
9282 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9283 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9284 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9285 break;
9286 case LATIN_SMALL_LETTER_SHARP_S:
9287
9288 /* Under /d and /u, this can match the two chars "ss" */
9289 if (! MORE_ASCII_RESTRICTED) {
9290 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9291
9292 /* And under /u, it can match even if the target is not
9293 * utf8 */
9294 if (UNI_SEMANTICS) {
9295 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9296 }
9297 }
9298 break;
9299 case 'F': case 'f':
9300 case 'I': case 'i':
9301 case 'L': case 'l':
9302 case 'T': case 't':
9303 /* These all are targets of multi-character folds, which can
9304 * occur with only non-Latin1 characters in the fold, so they
9305 * can match if the target string isn't UTF-8 */
9306 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9307 break;
9308 case 'A': case 'a':
9309 case 'H': case 'h':
9310 case 'J': case 'j':
9311 case 'N': case 'n':
9312 case 'W': case 'w':
9313 case 'Y': case 'y':
9314 /* These all are targets of multi-character folds, which occur
9315 * only with a non-Latin1 character as part of the fold, so
9316 * they can't match unless the target string is in UTF-8, so no
9317 * action here is necessary */
9318 break;
9319 default:
9320 /* Use deprecated warning to increase the chances of this
9321 * being output */
9322 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9323 break;
9324 }
9325 }
9326 else if (DEPENDS_SEMANTICS
f56b6394 9327 && ! isASCII(value)
d50a4f90
KW
9328 && PL_fold_latin1[value] != value)
9329 {
9330 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9331 * folds only when the target string is in UTF-8. We add the fold
9332 * here to the list of things to match outside the bitmap, which
9333 * won't be looked at unless it is UTF8 (or else if something else
9334 * says to look even if not utf8, but those things better not happen
9335 * under DEPENDS semantics. */
9336 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
2283d326
KW
9337 }
9338
9339 return stored;
9340}
9341
9342
9343PERL_STATIC_INLINE U8
2c6aa593 9344S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
9345{
9346 /* This inline function sets a bit in the bitmap if not already set, and if
9347 * appropriate, its fold, returning the number of bits that actually
9348 * changed from 0 to 1 */
9349
9350 U8 stored;
9351
4c9daa0a
KW
9352 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9353
2283d326
KW
9354 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9355 return 0;
9356 }
9357
9358 ANYOF_BITMAP_SET(node, value);
9359 stored = 1;
9360
9361 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
2c6aa593 9362 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
2283d326
KW
9363 }
9364
9365 return stored;
9366}
9367
c8453963
KW
9368STATIC void
9369S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9370{
9371 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9372 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9373 * the multi-character folds of characters in the node */
9374 SV *sv;
9375
9376 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9377
9378 if (! *alternate_ptr) {
9379 *alternate_ptr = newAV();
9380 }
9381 sv = newSVpvn_utf8((char*)string, len, TRUE);
9382 av_push(*alternate_ptr, sv);
9383 return;
9384}
9385
7f6f358c
YO
9386/*
9387 parse a class specification and produce either an ANYOF node that
ddad5e0b 9388 matches the pattern or perhaps will be optimized into an EXACTish node
679d1424
KW
9389 instead. The node contains a bit map for the first 256 characters, with the
9390 corresponding bit set if that character is in the list. For characters
9391 above 255, a range list is used */
89836f1f 9392
76e3520e 9393STATIC regnode *
3dab1dad 9394S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 9395{
97aff369 9396 dVAR;
9a86a77b 9397 register UV nextvalue;
3568d838 9398 register IV prevvalue = OOB_UNICODE;
ffc61ed2 9399 register IV range = 0;
e1d1eefb 9400 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 9401 register regnode *ret;
ba210ebe 9402 STRLEN numlen;
ffc61ed2 9403 IV namedclass;
cbbf8932 9404 char *rangebegin = NULL;
936ed897 9405 bool need_class = 0;
c445ea15 9406 SV *listsv = NULL;
137165a6
KW
9407 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9408 than just initialized. */
ffc61ed2 9409 UV n;
53742956
KW
9410
9411 /* code points this node matches that can't be stored in the bitmap */
56ca34ca 9412 HV* nonbitmap = NULL;
53742956
KW
9413
9414 /* The items that are to match that aren't stored in the bitmap, but are a
9415 * result of things that are stored there. This is the fold closure of
9416 * such a character, either because it has DEPENDS semantics and shouldn't
9417 * be matched unless the target string is utf8, or is a code point that is
9418 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9419 * above 255. This all is solely for performance reasons. By having this
9420 * code know the outside-the-bitmap folds that the bitmapped characters are
9421 * involved with, we don't have to go out to disk to find the list of
9422 * matches, unless the character class includes code points that aren't
9423 * storable in the bit map. That means that a character class with an 's'
9424 * in it, for example, doesn't need to go out to disk to find everything
9425 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9426 * empty unless there is something whose fold we don't know about, and will
9427 * have to go out to the disk to find. */
5bfec14d 9428 HV* l1_fold_invlist = NULL;
53742956
KW
9429
9430 /* List of multi-character folds that are matched by this node */
cbbf8932 9431 AV* unicode_alternate = NULL;
1b2d223b
JH
9432#ifdef EBCDIC
9433 UV literal_endpoint = 0;
9434#endif
ffc130aa 9435 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 9436
3dab1dad 9437 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 9438 case we need to change the emitted regop to an EXACT. */
07be1b83 9439 const char * orig_parse = RExC_parse;
72f13be8 9440 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9441
9442 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
9443#ifndef DEBUGGING
9444 PERL_UNUSED_ARG(depth);
9445#endif
72f13be8 9446
3dab1dad 9447 DEBUG_PARSE("clas");
7f6f358c
YO
9448
9449 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
9450 ret = reganode(pRExC_state, ANYOF, 0);
9451
56ca34ca
KW
9452
9453 if (!SIZE_ONLY) {
ffc61ed2 9454 ANYOF_FLAGS(ret) = 0;
56ca34ca 9455 }
ffc61ed2 9456
9a86a77b 9457 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
9458 RExC_naughty++;
9459 RExC_parse++;
9460 if (!SIZE_ONLY)
9461 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9462 }
a0d0e21e 9463
73060fc4 9464 if (SIZE_ONLY) {
830247a4 9465 RExC_size += ANYOF_SKIP;
3a15e693
KW
9466#ifdef ANYOF_ADD_LOC_SKIP
9467 if (LOC) {
9468 RExC_size += ANYOF_ADD_LOC_SKIP;
9469 }
9470#endif
73060fc4
JH
9471 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9472 }
936ed897 9473 else {
830247a4 9474 RExC_emit += ANYOF_SKIP;
3a15e693 9475 if (LOC) {
936ed897 9476 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693
KW
9477#ifdef ANYOF_ADD_LOC_SKIP
9478 RExC_emit += ANYOF_ADD_LOC_SKIP;
9479#endif
9480 }
ffc61ed2 9481 ANYOF_BITMAP_ZERO(ret);
396482e1 9482 listsv = newSVpvs("# comment\n");
137165a6 9483 initial_listsv_len = SvCUR(listsv);
a0d0e21e 9484 }
b8c5462f 9485
9a86a77b
JH
9486 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9487
b938889d 9488 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 9489 checkposixcc(pRExC_state);
b8c5462f 9490
f064b6ad
HS
9491 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9492 if (UCHARAT(RExC_parse) == ']')
9493 goto charclassloop;
ffc61ed2 9494
fc8cd66c 9495parseit:
9a86a77b 9496 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
9497
9498 charclassloop:
9499
9500 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9501
73b437c8 9502 if (!range)
830247a4 9503 rangebegin = RExC_parse;
ffc61ed2 9504 if (UTF) {
5e12f4fb 9505 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 9506 RExC_end - RExC_parse,
9f7f3913 9507 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9508 RExC_parse += numlen;
9509 }
9510 else
9511 value = UCHARAT(RExC_parse++);
7f6f358c 9512
9a86a77b
JH
9513 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9514 if (value == '[' && POSIXCC(nextvalue))
830247a4 9515 namedclass = regpposixcc(pRExC_state, value);
620e46c5 9516 else if (value == '\\') {
ffc61ed2 9517 if (UTF) {
5e12f4fb 9518 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 9519 RExC_end - RExC_parse,
9f7f3913 9520 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
9521 RExC_parse += numlen;
9522 }
9523 else
9524 value = UCHARAT(RExC_parse++);
470c3474 9525 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 9526 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
9527 * be a problem later if we want switch on Unicode.
9528 * A similar issue a little bit later when switching on
9529 * namedclass. --jhi */
ffc61ed2 9530 switch ((I32)value) {
b8c5462f
JH
9531 case 'w': namedclass = ANYOF_ALNUM; break;
9532 case 'W': namedclass = ANYOF_NALNUM; break;
9533 case 's': namedclass = ANYOF_SPACE; break;
9534 case 'S': namedclass = ANYOF_NSPACE; break;
9535 case 'd': namedclass = ANYOF_DIGIT; break;
9536 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
9537 case 'v': namedclass = ANYOF_VERTWS; break;
9538 case 'V': namedclass = ANYOF_NVERTWS; break;
9539 case 'h': namedclass = ANYOF_HORIZWS; break;
9540 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
9541 case 'N': /* Handle \N{NAME} in class */
9542 {
9543 /* We only pay attention to the first char of
9544 multichar strings being returned. I kinda wonder
9545 if this makes sense as it does change the behaviour
9546 from earlier versions, OTOH that behaviour was broken
9547 as well. */
9548 UV v; /* value is register so we cant & it /grrr */
afefe6bf 9549 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
9550 goto parseit;
9551 }
9552 value= v;
9553 }
9554 break;
ffc61ed2
JH
9555 case 'p':
9556 case 'P':
3dab1dad
YO
9557 {
9558 char *e;
af6f566e 9559 if (RExC_parse >= RExC_end)
2a4859cd 9560 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 9561 if (*RExC_parse == '{') {
1df70142 9562 const U8 c = (U8)value;
ffc61ed2
JH
9563 e = strchr(RExC_parse++, '}');
9564 if (!e)
0da60cf5 9565 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
9566 while (isSPACE(UCHARAT(RExC_parse)))
9567 RExC_parse++;
9568 if (e == RExC_parse)
0da60cf5 9569 vFAIL2("Empty \\%c{}", c);
ffc61ed2 9570 n = e - RExC_parse;
ab13f0c7
JH
9571 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9572 n--;
ffc61ed2
JH
9573 }
9574 else {
9575 e = RExC_parse;
9576 n = 1;
9577 }
ee410026 9578 if (!SIZE_ONLY) {
ab13f0c7
JH
9579 if (UCHARAT(RExC_parse) == '^') {
9580 RExC_parse++;
9581 n--;
9582 value = value == 'p' ? 'P' : 'p'; /* toggle */
9583 while (isSPACE(UCHARAT(RExC_parse))) {
9584 RExC_parse++;
9585 n--;
9586 }
9587 }
2f833f52
KW
9588
9589 /* Add the property name to the list. If /i matching, give
9590 * a different name which consists of the normal name
9591 * sandwiched between two underscores and '_i'. The design
9592 * is discussed in the commit message for this. */
9593 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9594 (value=='p' ? '+' : '!'),
9595 (FOLD) ? "__" : "",
9596 (int)n,
9597 RExC_parse,
9598 (FOLD) ? "_i" : ""
9599 );
ffc61ed2
JH
9600 }
9601 RExC_parse = e + 1;
08fc12dd
KW
9602
9603 /* The \p could match something in the Latin1 range, hence
9604 * something that isn't utf8 */
db8c82dd 9605 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
f81125e2 9606 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
9607
9608 /* \p means they want Unicode semantics */
9609 RExC_uni_semantics = 1;
3dab1dad 9610 }
f81125e2 9611 break;
b8c5462f
JH
9612 case 'n': value = '\n'; break;
9613 case 'r': value = '\r'; break;
9614 case 't': value = '\t'; break;
9615 case 'f': value = '\f'; break;
9616 case 'b': value = '\b'; break;
c7f1f016
NIS
9617 case 'e': value = ASCII_TO_NATIVE('\033');break;
9618 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
9619 case 'o':
9620 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
9621 {
9622 const char* error_msg;
9623 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
9624 &value,
9625 &numlen,
454155d9
KW
9626 &error_msg,
9627 SIZE_ONLY);
9628 RExC_parse += numlen;
9629 if (! valid) {
9630 vFAIL(error_msg);
9631 }
f0a2b745 9632 }
f0a2b745
KW
9633 if (PL_encoding && value < 0x100) {
9634 goto recode_encoding;
9635 }
9636 break;
b8c5462f 9637 case 'x':
ffc61ed2 9638 if (*RExC_parse == '{') {
a4c04bdc
NC
9639 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9640 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 9641 char * const e = strchr(RExC_parse++, '}');
b81d288d 9642 if (!e)
ffc61ed2 9643 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
9644
9645 numlen = e - RExC_parse;
9646 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9647 RExC_parse = e + 1;
9648 }
9649 else {
a4c04bdc 9650 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
9651 numlen = 2;
9652 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
9653 RExC_parse += numlen;
9654 }
9e08bc66
TS
9655 if (PL_encoding && value < 0x100)
9656 goto recode_encoding;
b8c5462f
JH
9657 break;
9658 case 'c':
17a3df4c 9659 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
b8c5462f
JH
9660 break;
9661 case '0': case '1': case '2': case '3': case '4':
c99e91e9 9662 case '5': case '6': case '7':
9e08bc66 9663 {
c99e91e9
KW
9664 /* Take 1-3 octal digits */
9665 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
9666 numlen = 3;
9667 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9668 RExC_parse += numlen;
9669 if (PL_encoding && value < 0x100)
9670 goto recode_encoding;
9671 break;
9672 }
9673 recode_encoding:
9674 {
9675 SV* enc = PL_encoding;
9676 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
9677 if (!enc && SIZE_ONLY)
9678 ckWARNreg(RExC_parse,
9679 "Invalid escape in the specified encoding");
9e08bc66
TS
9680 break;
9681 }
1028017a 9682 default:
c99e91e9
KW
9683 /* Allow \_ to not give an error */
9684 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
9685 ckWARN2reg(RExC_parse,
9686 "Unrecognized escape \\%c in character class passed through",
9687 (int)value);
c99e91e9 9688 }
1028017a 9689 break;
b8c5462f 9690 }
ffc61ed2 9691 } /* end of \blah */
1b2d223b
JH
9692#ifdef EBCDIC
9693 else
9694 literal_endpoint++;
9695#endif
ffc61ed2
JH
9696
9697 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9698
2c63ecad
KW
9699 /* What matches in a locale is not known until runtime, so need to
9700 * (one time per class) allocate extra space to pass to regexec.
9701 * The space will contain a bit for each named class that is to be
9702 * matched against. This isn't needed for \p{} and pseudo-classes,
9703 * as they are not affected by locale, and hence are dealt with
9704 * separately */
9705 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9706 need_class = 1;
9707 if (SIZE_ONLY) {
3a15e693 9708#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 9709 RExC_size += ANYOF_CLASS_ADD_SKIP;
3a15e693 9710#endif
2c63ecad
KW
9711 }
9712 else {
3a15e693 9713#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 9714 RExC_emit += ANYOF_CLASS_ADD_SKIP;
3a15e693 9715#endif
2c63ecad
KW
9716 ANYOF_CLASS_ZERO(ret);
9717 }
9051cfd9 9718 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 9719 }
ffc61ed2 9720
d5788240 9721 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
1d791ab2
KW
9722 * literal, as is the character that began the false range, i.e.
9723 * the 'a' in the examples */
ffc61ed2 9724 if (range) {
73b437c8 9725 if (!SIZE_ONLY) {
668c081a
NC
9726 const int w =
9727 RExC_parse >= rangebegin ?
9728 RExC_parse - rangebegin : 0;
9729 ckWARN4reg(RExC_parse,
b45f050a 9730 "False [] range \"%*.*s\"",
097eb12c 9731 w, w, rangebegin);
668c081a 9732
1d791ab2 9733 stored +=
5bfec14d 9734 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
3568d838 9735 if (prevvalue < 256) {
2283d326 9736 stored +=
5bfec14d 9737 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
ffc61ed2
JH
9738 }
9739 else {
1d791ab2 9740 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
ffc61ed2 9741 }
b8c5462f 9742 }
ffc61ed2
JH
9743
9744 range = 0; /* this was not a true range */
73b437c8 9745 }
ffc61ed2 9746
89836f1f
YO
9747
9748
73b437c8 9749 if (!SIZE_ONLY) {
c49a72a9
NC
9750 const char *what = NULL;
9751 char yesno = 0;
9752
e2962f66
JH
9753 /* Possible truncation here but in some 64-bit environments
9754 * the compiler gets heartburn about switch on 64-bit values.
9755 * A similar issue a little earlier when switching on value.
98f323fa 9756 * --jhi */
e2962f66 9757 switch ((I32)namedclass) {
da7fcca4 9758
f952827c
KW
9759 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9760 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9761 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9762 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9763 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9764 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9765 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9766 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9767 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9768 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
a12cf05f 9769 /* \s, \w match all unicode if utf8. */
f952827c
KW
9770 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9771 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
f952827c 9772 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
e1d1eefb
YO
9773 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9774 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
9775 case ANYOF_ASCII:
9776 if (LOC)
936ed897 9777 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 9778 else {
1ba5c669 9779 for (value = 0; value < 128; value++)
2283d326 9780 stored +=
5bfec14d 9781 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
73b437c8 9782 }
c49a72a9 9783 yesno = '+';
ce1c68b2
KW
9784 what = NULL; /* Doesn't match outside ascii, so
9785 don't want to add +utf8:: */
73b437c8
JH
9786 break;
9787 case ANYOF_NASCII:
9788 if (LOC)
936ed897 9789 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 9790 else {
1ba5c669 9791 for (value = 128; value < 256; value++)
2283d326 9792 stored +=
5bfec14d 9793 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
73b437c8 9794 }
cfaf538b 9795 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
c49a72a9
NC
9796 yesno = '!';
9797 what = "ASCII";
89836f1f 9798 break;
ffc61ed2
JH
9799 case ANYOF_DIGIT:
9800 if (LOC)
9801 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9802 else {
9803 /* consecutive digits assumed */
9804 for (value = '0'; value <= '9'; value++)
2283d326 9805 stored +=
5bfec14d 9806 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9807 }
c49a72a9 9808 yesno = '+';
779d7b58 9809 what = "Digit";
ffc61ed2
JH
9810 break;
9811 case ANYOF_NDIGIT:
9812 if (LOC)
9813 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9814 else {
9815 /* consecutive digits assumed */
9816 for (value = 0; value < '0'; value++)
2283d326 9817 stored +=
5bfec14d 9818 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9819 for (value = '9' + 1; value < 256; value++)
2283d326 9820 stored +=
5bfec14d 9821 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
ffc61ed2 9822 }
c49a72a9 9823 yesno = '!';
779d7b58 9824 what = "Digit";
2f7f8cb1 9825 if (AT_LEAST_ASCII_RESTRICTED ) {
cfaf538b
KW
9826 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9827 }
89836f1f 9828 break;
f81125e2
JP
9829 case ANYOF_MAX:
9830 /* this is to handle \p and \P */
9831 break;
73b437c8 9832 default:
b45f050a 9833 vFAIL("Invalid [::] class");
73b437c8 9834 break;
b8c5462f 9835 }
2f7f8cb1 9836 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
c49a72a9
NC
9837 /* Strings such as "+utf8::isWord\n" */
9838 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810 9839 }
ce1c68b2 9840
73b437c8 9841 continue;
a0d0e21e 9842 }
ffc61ed2
JH
9843 } /* end of namedclass \blah */
9844
a0d0e21e 9845 if (range) {
eb160463 9846 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
9847 const int w = RExC_parse - rangebegin;
9848 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 9849 range = 0; /* not a valid range */
73b437c8 9850 }
a0d0e21e
LW
9851 }
9852 else {
3568d838 9853 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
9854 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9855 RExC_parse[1] != ']') {
9856 RExC_parse++;
ffc61ed2
JH
9857
9858 /* a bad range like \w-, [:word:]- ? */
9859 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 9860 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 9861 const int w =
afd78fd5
JH
9862 RExC_parse >= rangebegin ?
9863 RExC_parse - rangebegin : 0;
830247a4 9864 vWARN4(RExC_parse,
b45f050a 9865 "False [] range \"%*.*s\"",
097eb12c 9866 w, w, rangebegin);
afd78fd5 9867 }
73b437c8 9868 if (!SIZE_ONLY)
2283d326 9869 stored +=
5bfec14d 9870 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
73b437c8 9871 } else
ffc61ed2
JH
9872 range = 1; /* yeah, it's a range! */
9873 continue; /* but do it the next time */
a0d0e21e 9874 }
a687059c 9875 }
ffc61ed2 9876
046c4055
KW
9877 /* non-Latin1 code point implies unicode semantics. Must be set in
9878 * pass1 so is there for the whole of pass 2 */
56ca34ca
KW
9879 if (value > 255) {
9880 RExC_uni_semantics = 1;
9881 }
9882
93733859 9883 /* now is the next time */
ae5c130c 9884 if (!SIZE_ONLY) {
3568d838 9885 if (prevvalue < 256) {
1df70142 9886 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 9887 IV i;
3568d838 9888#ifdef EBCDIC
1b2d223b
JH
9889 /* In EBCDIC [\x89-\x91] should include
9890 * the \x8e but [i-j] should not. */
9891 if (literal_endpoint == 2 &&
9892 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9893 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 9894 {
3568d838
JH
9895 if (isLOWER(prevvalue)) {
9896 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9897 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9898 stored +=
5bfec14d 9899 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 9900 }
ffc61ed2 9901 } else {
3568d838 9902 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 9903 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 9904 stored +=
5bfec14d 9905 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 9906 }
ffc61ed2 9907 }
8ada0baa 9908 }
ffc61ed2 9909 else
8ada0baa 9910#endif
07be1b83 9911 for (i = prevvalue; i <= ceilvalue; i++) {
5bfec14d 9912 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
07be1b83 9913 }
3568d838 9914 }
56ca34ca
KW
9915 if (value > 255) {
9916 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
9917 const UV natvalue = NATIVE_TO_UNI(value);
56ca34ca 9918 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
56ca34ca 9919 }
1b2d223b
JH
9920#ifdef EBCDIC
9921 literal_endpoint = 0;
9922#endif
8ada0baa 9923 }
ffc61ed2
JH
9924
9925 range = 0; /* this range (if it was one) is done now */
a0d0e21e 9926 }
ffc61ed2 9927
ffc61ed2 9928
7f6f358c
YO
9929
9930 if (SIZE_ONLY)
9931 return ret;
9932 /****** !SIZE_ONLY AFTER HERE *********/
9933
0c6e4288
KW
9934 /* If folding and there are code points above 255, we calculate all
9935 * characters that could fold to or from the ones already on the list */
9936 if (FOLD && nonbitmap) {
56ca34ca
KW
9937 UV i;
9938
93e5bb1c
KW
9939 HV* fold_intersection;
9940 UV* fold_list;
9941
9942 /* This is a list of all the characters that participate in folds
9943 * (except marks, etc in multi-char folds */
9944 if (! PL_utf8_foldable) {
9945 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9946 PL_utf8_foldable = _swash_to_invlist(swash);
9947 }
56ca34ca 9948
93e5bb1c
KW
9949 /* This is a hash that for a particular fold gives all characters
9950 * that are involved in it */
9951 if (! PL_utf8_foldclosures) {
9952
9953 /* If we were unable to find any folds, then we likely won't be
9954 * able to find the closures. So just create an empty list.
9955 * Folding will effectively be restricted to the non-Unicode rules
9956 * hard-coded into Perl. (This case happens legitimately during
9957 * compilation of Perl itself before the Unicode tables are
9958 * generated) */
9959 if (invlist_len(PL_utf8_foldable) == 0) {
9960 PL_utf8_foldclosures = _new_invlist(0);
9961 } else {
9962 /* If the folds haven't been read in, call a fold function
9963 * to force that */
9964 if (! PL_utf8_tofold) {
9965 U8 dummy[UTF8_MAXBYTES+1];
9966 STRLEN dummy_len;
9967 to_utf8_fold((U8*) "A", dummy, &dummy_len);
56ca34ca 9968 }
93e5bb1c 9969 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
56ca34ca 9970 }
93e5bb1c
KW
9971 }
9972
9973 /* Only the characters in this class that participate in folds need
9974 * be checked. Get the intersection of this class and all the
9975 * possible characters that are foldable. This can quickly narrow
9976 * down a large class */
9977 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
9978
9979 /* Now look at the foldable characters in this class individually */
9980 fold_list = invlist_array(fold_intersection);
9981 for (i = 0; i < invlist_len(fold_intersection); i++) {
9982 UV j;
9983
9984 /* The next entry is the beginning of the range that is in the
9985 * class */
9986 UV start = fold_list[i++];
9987
56ca34ca 9988
93e5bb1c
KW
9989 /* The next entry is the beginning of the next range, which
9990 * isn't in the class, so the end of the current range is one
9991 * less than that */
9992 UV end = fold_list[i] - 1;
9993
9994 /* Look at every character in the range */
9995 for (j = start; j <= end; j++) {
9996
9997 /* Get its fold */
9998 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9999 STRLEN foldlen;
10000 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10001
10002 if (foldlen > (STRLEN)UNISKIP(f)) {
10003
10004 /* Any multicharacter foldings (disallowed in
10005 * lookbehind patterns) require the following
10006 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10007 * E folds into "pq" and F folds into "rst", all other
10008 * characters fold to single characters. We save away
10009 * these multicharacter foldings, to be later saved as
10010 * part of the additional "s" data. */
10011 if (! RExC_in_lookbehind) {
10012 U8* loc = foldbuf;
10013 U8* e = foldbuf + foldlen;
10014
10015 /* If any of the folded characters of this are in
10016 * the Latin1 range, tell the regex engine that
10017 * this can match a non-utf8 target string. The
10018 * only multi-byte fold whose source is in the
10019 * Latin1 range (U+00DF) applies only when the
10020 * target string is utf8, or under unicode rules */
10021 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10022 while (loc < e) {
10023
10024 /* Can't mix ascii with non- under /aa */
10025 if (MORE_ASCII_RESTRICTED
10026 && (isASCII(*loc) != isASCII(j)))
10027 {
10028 goto end_multi_fold;
10029 }
10030 if (UTF8_IS_INVARIANT(*loc)
10031 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10032 {
10033 /* Can't mix above and below 256 under
10034 * LOC */
10035 if (LOC) {
2f7f8cb1
KW
10036 goto end_multi_fold;
10037 }
93e5bb1c
KW
10038 ANYOF_FLAGS(ret)
10039 |= ANYOF_NONBITMAP_NON_UTF8;
10040 break;
8e3094e5 10041 }
93e5bb1c 10042 loc += UTF8SKIP(loc);
8e3094e5 10043 }
56ca34ca 10044 }
17580e7a 10045
93e5bb1c
KW
10046 add_alternate(&unicode_alternate, foldbuf, foldlen);
10047 end_multi_fold: ;
10048 }
10049 }
10050 else {
10051 /* Single character fold. Add everything in its fold
10052 * closure to the list that this node should match */
10053 SV** listp;
10054
10055 /* The fold closures data structure is a hash with the
10056 * keys being every character that is folded to, like
10057 * 'k', and the values each an array of everything that
10058 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10059 if ((listp = hv_fetch(PL_utf8_foldclosures,
10060 (char *) foldbuf, foldlen, FALSE)))
10061 {
10062 AV* list = (AV*) *listp;
10063 IV k;
10064 for (k = 0; k <= av_len(list); k++) {
10065 SV** c_p = av_fetch(list, k, FALSE);
10066 UV c;
10067 if (c_p == NULL) {
10068 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10069 }
10070 c = SvUV(*c_p);
10071
10072 /* /aa doesn't allow folds between ASCII and
10073 * non-; /l doesn't allow them between above
10074 * and below 256 */
10075 if ((MORE_ASCII_RESTRICTED
10076 && (isASCII(c) != isASCII(j)))
10077 || (LOC && ((c < 256) != (j < 256))))
10078 {
10079 continue;
10080 }
56ca34ca 10081
93e5bb1c
KW
10082 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10083 stored += set_regclass_bit(pRExC_state,
10084 ret,
10085 (U8) c,
10086 &l1_fold_invlist, &unicode_alternate);
10087 }
10088 /* It may be that the code point is already
10089 * in this range or already in the bitmap,
10090 * in which case we need do nothing */
10091 else if ((c < start || c > end)
10092 && (c > 255
10093 || ! ANYOF_BITMAP_TEST(ret, c)))
10094 {
10095 nonbitmap = add_cp_to_invlist(nonbitmap, c);
56ca34ca
KW
10096 }
10097 }
10098 }
10099 }
10100 }
93e5bb1c
KW
10101 }
10102 invlist_destroy(fold_intersection);
56ca34ca
KW
10103 }
10104
53742956
KW
10105 /* Combine the two lists into one. */
10106 if (l1_fold_invlist) {
10107 if (nonbitmap) {
10108 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10109 }
10110 else {
10111 nonbitmap = l1_fold_invlist;
10112 }
10113 }
10114
fb9bfbf7
KW
10115 /* Here, we have calculated what code points should be in the character
10116 * class. Now we can see about various optimizations. Fold calculation
10117 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10118 * include K, which under /i would match k. */
10119
f56b6394
KW
10120 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10121 * set the FOLD flag yet, so this this does optimize those. It doesn't
40c78556
KW
10122 * optimize locale. Doing so perhaps could be done as long as there is
10123 * nothing like \w in it; some thought also would have to be given to the
10124 * interaction with above 0x100 chars */
137165a6
KW
10125 if (! LOC
10126 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10127 && ! unicode_alternate
10128 && ! nonbitmap
10129 && SvCUR(listsv) == initial_listsv_len)
10130 {
40c78556
KW
10131 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10132 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10133 stored = 256 - stored;
10134
d5788240
KW
10135 /* The inversion means that everything above 255 is matched; and at the
10136 * same time we clear the invert flag */
137165a6 10137 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
40c78556
KW
10138 }
10139
0222889f
KW
10140 /* Folding in the bitmap is taken care of above, but not for locale (for
10141 * which we have to wait to see what folding is in effect at runtime), and
10142 * for things not in the bitmap. Set run-time fold flag for these */
53742956 10143 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
0222889f 10144 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
f56b6394
KW
10145 }
10146
2786be71
KW
10147 /* A single character class can be "optimized" into an EXACTish node.
10148 * Note that since we don't currently count how many characters there are
10149 * outside the bitmap, we are XXX missing optimization possibilities for
10150 * them. This optimization can't happen unless this is a truly single
10151 * character class, which means that it can't be an inversion into a
10152 * many-character class, and there must be no possibility of there being
10153 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
10154 * \w, etc, so have to make a special test that they aren't present
10155 *
10156 * Similarly A 2-character class of the very special form like [bB] can be
10157 * optimized into an EXACTFish node, but only for non-locales, and for
10158 * characters which only have the two folds; so things like 'fF' and 'Ii'
10159 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10160 * FI'. */
137165a6 10161 if (! nonbitmap
53742956 10162 && ! unicode_alternate
137165a6
KW
10163 && SvCUR(listsv) == initial_listsv_len
10164 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
10165 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10166 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10167 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10168 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10169 /* If the latest code point has a fold whose
10170 * bit is set, it must be the only other one */
2dcac756 10171 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 10172 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
10173 {
10174 /* Note that the information needed to decide to do this optimization
10175 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
10176 * used EXACTish node takes less space than the calculated ANYOF node,
10177 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
10178 * than actually used, so this optimization doesn't gain us any space.
10179 * But an EXACT node is faster than an ANYOF node, and can be combined
10180 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
10181 * gains. The speed of executing an EXACTF is similar to an ANYOF
10182 * node, so the optimization advantage comes from the ability to join
10183 * it to adjacent EXACT nodes */
2786be71 10184
07be1b83 10185 const char * cur_parse= RExC_parse;
6da63e10 10186 U8 op;
07be1b83
YO
10187 RExC_emit = (regnode *)orig_emit;
10188 RExC_parse = (char *)orig_parse;
2786be71 10189
6da63e10
KW
10190 if (stored == 1) {
10191
10192 /* A locale node with one point can be folded; all the other cases
10193 * with folding will have two points, since we calculate them above
10194 */
39065660 10195 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
10196 op = EXACTFL;
10197 }
10198 else {
10199 op = EXACT;
10200 }
10201 } /* else 2 chars in the bit map: the folds of each other */
cfaf538b 10202 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
6da63e10
KW
10203
10204 /* To join adjacent nodes, they must be the exact EXACTish type.
10205 * Try to use the most likely type, by using EXACTFU if the regex
10206 * calls for them, or is required because the character is
10207 * non-ASCII */
10208 op = EXACTFU;
10209 }
10210 else { /* Otherwise, more likely to be EXACTF type */
10211 op = EXACTF;
10212 }
10213
10214 ret = reg_node(pRExC_state, op);
07be1b83 10215 RExC_parse = (char *)cur_parse;
2786be71
KW
10216 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10217 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10218 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10219 STR_LEN(ret)= 2;
10220 RExC_emit += STR_SZ(2);
10221 }
10222 else {
10223 *STRING(ret)= (char)value;
10224 STR_LEN(ret)= 1;
10225 RExC_emit += STR_SZ(1);
10226 }
ef8d46e8 10227 SvREFCNT_dec(listsv);
7f6f358c
YO
10228 return ret;
10229 }
ffc61ed2 10230
9e791dfe
KW
10231 if (nonbitmap) {
10232 UV* nonbitmap_array = invlist_array(nonbitmap);
10233 UV nonbitmap_len = invlist_len(nonbitmap);
10234 UV i;
10235
10236 /* Here have the full list of items to match that aren't in the
10237 * bitmap. Convert to the structure that the rest of the code is
10238 * expecting. XXX That rest of the code should convert to this
10239 * structure */
10240 for (i = 0; i < nonbitmap_len; i++) {
10241
10242 /* The next entry is the beginning of the range that is in the
10243 * class */
10244 UV start = nonbitmap_array[i++];
27340457 10245 UV end;
9e791dfe
KW
10246
10247 /* The next entry is the beginning of the next range, which isn't
10248 * in the class, so the end of the current range is one less than
27340457
KW
10249 * that. But if there is no next range, it means that the range
10250 * begun by 'start' extends to infinity, which for this platform
10251 * ends at UV_MAX */
10252 if (i == nonbitmap_len) {
10253 end = UV_MAX;
10254 }
10255 else {
10256 end = nonbitmap_array[i] - 1;
10257 }
10258
9e791dfe
KW
10259 if (start == end) {
10260 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10261 }
10262 else {
10263 /* The \t sets the whole range */
10264 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10265 /* XXX EBCDIC */
10266 start, end);
10267 }
10268 }
10269 invlist_destroy(nonbitmap);
10270 }
10271
137165a6
KW
10272 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10273 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10274 SvREFCNT_dec(listsv);
10275 SvREFCNT_dec(unicode_alternate);
10276 }
10277 else {
10278
097eb12c 10279 AV * const av = newAV();
ffc61ed2 10280 SV *rv;
9e55ce06 10281 /* The 0th element stores the character class description
6a0407ee 10282 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
10283 * to initialize the appropriate swash (which gets stored in
10284 * the 1st element), and also useful for dumping the regnode.
10285 * The 2nd element stores the multicharacter foldings,
6a0407ee 10286 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
10287 av_store(av, 0, listsv);
10288 av_store(av, 1, NULL);
ad64d0ec 10289 av_store(av, 2, MUTABLE_SV(unicode_alternate));
c93d5d8b
KW
10290 if (unicode_alternate) { /* This node is variable length */
10291 OP(ret) = ANYOFV;
10292 }
ad64d0ec 10293 rv = newRV_noinc(MUTABLE_SV(av));
19860706 10294 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 10295 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 10296 ARG_SET(ret, n);
a0ed51b3 10297 }
a0ed51b3
LW
10298 return ret;
10299}
89836f1f
YO
10300#undef _C_C_T_
10301
a0ed51b3 10302
bcdf7404
YO
10303/* reg_skipcomment()
10304
10305 Absorbs an /x style # comments from the input stream.
10306 Returns true if there is more text remaining in the stream.
10307 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10308 terminates the pattern without including a newline.
10309
10310 Note its the callers responsibility to ensure that we are
10311 actually in /x mode
10312
10313*/
10314
10315STATIC bool
10316S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10317{
10318 bool ended = 0;
7918f24d
NC
10319
10320 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10321
bcdf7404
YO
10322 while (RExC_parse < RExC_end)
10323 if (*RExC_parse++ == '\n') {
10324 ended = 1;
10325 break;
10326 }
10327 if (!ended) {
10328 /* we ran off the end of the pattern without ending
10329 the comment, so we have to add an \n when wrapping */
10330 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10331 return 0;
10332 } else
10333 return 1;
10334}
10335
10336/* nextchar()
10337
3b753521 10338 Advances the parse position, and optionally absorbs
bcdf7404
YO
10339 "whitespace" from the inputstream.
10340
10341 Without /x "whitespace" means (?#...) style comments only,
10342 with /x this means (?#...) and # comments and whitespace proper.
10343
10344 Returns the RExC_parse point from BEFORE the scan occurs.
10345
10346 This is the /x friendly way of saying RExC_parse++.
10347*/
10348
76e3520e 10349STATIC char*
830247a4 10350S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 10351{
097eb12c 10352 char* const retval = RExC_parse++;
a0d0e21e 10353
7918f24d
NC
10354 PERL_ARGS_ASSERT_NEXTCHAR;
10355
4633a7c4 10356 for (;;) {
830247a4
IZ
10357 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10358 RExC_parse[2] == '#') {
e994fd66
AE
10359 while (*RExC_parse != ')') {
10360 if (RExC_parse == RExC_end)
10361 FAIL("Sequence (?#... not terminated");
830247a4 10362 RExC_parse++;
e994fd66 10363 }
830247a4 10364 RExC_parse++;
4633a7c4
LW
10365 continue;
10366 }
bbe252da 10367 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
10368 if (isSPACE(*RExC_parse)) {
10369 RExC_parse++;
748a9306
LW
10370 continue;
10371 }
830247a4 10372 else if (*RExC_parse == '#') {
bcdf7404
YO
10373 if ( reg_skipcomment( pRExC_state ) )
10374 continue;
748a9306 10375 }
748a9306 10376 }
4633a7c4 10377 return retval;
a0d0e21e 10378 }
a687059c
LW
10379}
10380
10381/*
c277df42 10382- reg_node - emit a node
a0d0e21e 10383*/
76e3520e 10384STATIC regnode * /* Location. */
830247a4 10385S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 10386{
97aff369 10387 dVAR;
c277df42 10388 register regnode *ptr;
504618e9 10389 regnode * const ret = RExC_emit;
07be1b83 10390 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10391
7918f24d
NC
10392 PERL_ARGS_ASSERT_REG_NODE;
10393
c277df42 10394 if (SIZE_ONLY) {
830247a4
IZ
10395 SIZE_ALIGN(RExC_size);
10396 RExC_size += 1;
a0d0e21e
LW
10397 return(ret);
10398 }
3b57cd43
YO
10399 if (RExC_emit >= RExC_emit_bound)
10400 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10401
c277df42 10402 NODE_ALIGN_FILL(ret);
a0d0e21e 10403 ptr = ret;
c277df42 10404 FILL_ADVANCE_NODE(ptr, op);
7122b237 10405#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10406 if (RExC_offsets) { /* MJD */
07be1b83 10407 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 10408 "reg_node", __LINE__,
13d6edb4 10409 PL_reg_name[op],
07be1b83
YO
10410 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10411 ? "Overwriting end of array!\n" : "OK",
10412 (UV)(RExC_emit - RExC_emit_start),
10413 (UV)(RExC_parse - RExC_start),
10414 (UV)RExC_offsets[0]));
ccb2c380 10415 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 10416 }
7122b237 10417#endif
830247a4 10418 RExC_emit = ptr;
a0d0e21e 10419 return(ret);
a687059c
LW
10420}
10421
10422/*
a0d0e21e
LW
10423- reganode - emit a node with an argument
10424*/
76e3520e 10425STATIC regnode * /* Location. */
830247a4 10426S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 10427{
97aff369 10428 dVAR;
c277df42 10429 register regnode *ptr;
504618e9 10430 regnode * const ret = RExC_emit;
07be1b83 10431 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 10432
7918f24d
NC
10433 PERL_ARGS_ASSERT_REGANODE;
10434
c277df42 10435 if (SIZE_ONLY) {
830247a4
IZ
10436 SIZE_ALIGN(RExC_size);
10437 RExC_size += 2;
6bda09f9
YO
10438 /*
10439 We can't do this:
10440
10441 assert(2==regarglen[op]+1);
10442
10443 Anything larger than this has to allocate the extra amount.
10444 If we changed this to be:
10445
10446 RExC_size += (1 + regarglen[op]);
10447
10448 then it wouldn't matter. Its not clear what side effect
10449 might come from that so its not done so far.
10450 -- dmq
10451 */
a0d0e21e
LW
10452 return(ret);
10453 }
3b57cd43
YO
10454 if (RExC_emit >= RExC_emit_bound)
10455 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10456
c277df42 10457 NODE_ALIGN_FILL(ret);
a0d0e21e 10458 ptr = ret;
c277df42 10459 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 10460#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10461 if (RExC_offsets) { /* MJD */
07be1b83 10462 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10463 "reganode",
ccb2c380 10464 __LINE__,
13d6edb4 10465 PL_reg_name[op],
07be1b83 10466 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 10467 "Overwriting end of array!\n" : "OK",
07be1b83
YO
10468 (UV)(RExC_emit - RExC_emit_start),
10469 (UV)(RExC_parse - RExC_start),
10470 (UV)RExC_offsets[0]));
ccb2c380 10471 Set_Cur_Node_Offset;
fac92740 10472 }
7122b237 10473#endif
830247a4 10474 RExC_emit = ptr;
a0d0e21e 10475 return(ret);
fe14fcc3
LW
10476}
10477
10478/*
cd439c50 10479- reguni - emit (if appropriate) a Unicode character
a0ed51b3 10480*/
71207a34
AL
10481STATIC STRLEN
10482S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 10483{
97aff369 10484 dVAR;
7918f24d
NC
10485
10486 PERL_ARGS_ASSERT_REGUNI;
10487
71207a34 10488 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
10489}
10490
10491/*
a0d0e21e
LW
10492- reginsert - insert an operator in front of already-emitted operand
10493*
10494* Means relocating the operand.
10495*/
76e3520e 10496STATIC void
6bda09f9 10497S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 10498{
97aff369 10499 dVAR;
c277df42
IZ
10500 register regnode *src;
10501 register regnode *dst;
10502 register regnode *place;
504618e9 10503 const int offset = regarglen[(U8)op];
6bda09f9 10504 const int size = NODE_STEP_REGNODE + offset;
07be1b83 10505 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10506
10507 PERL_ARGS_ASSERT_REGINSERT;
def51078 10508 PERL_UNUSED_ARG(depth);
22c35a8c 10509/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 10510 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 10511 if (SIZE_ONLY) {
6bda09f9 10512 RExC_size += size;
a0d0e21e
LW
10513 return;
10514 }
a687059c 10515
830247a4 10516 src = RExC_emit;
6bda09f9 10517 RExC_emit += size;
830247a4 10518 dst = RExC_emit;
40d049e4 10519 if (RExC_open_parens) {
6bda09f9 10520 int paren;
3b57cd43 10521 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 10522 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 10523 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 10524 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
10525 RExC_open_parens[paren] += size;
10526 } else {
3b57cd43 10527 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
10528 }
10529 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 10530 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
10531 RExC_close_parens[paren] += size;
10532 } else {
3b57cd43 10533 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
10534 }
10535 }
6bda09f9 10536 }
40d049e4 10537
fac92740 10538 while (src > opnd) {
c277df42 10539 StructCopy(--src, --dst, regnode);
7122b237 10540#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10541 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 10542 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 10543 "reg_insert",
ccb2c380 10544 __LINE__,
13d6edb4 10545 PL_reg_name[op],
07be1b83
YO
10546 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10547 ? "Overwriting end of array!\n" : "OK",
10548 (UV)(src - RExC_emit_start),
10549 (UV)(dst - RExC_emit_start),
10550 (UV)RExC_offsets[0]));
ccb2c380
MP
10551 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10552 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 10553 }
7122b237 10554#endif
fac92740
MJD
10555 }
10556
a0d0e21e
LW
10557
10558 place = opnd; /* Op node, where operand used to be. */
7122b237 10559#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 10560 if (RExC_offsets) { /* MJD */
07be1b83 10561 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 10562 "reginsert",
ccb2c380 10563 __LINE__,
13d6edb4 10564 PL_reg_name[op],
07be1b83 10565 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 10566 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
10567 (UV)(place - RExC_emit_start),
10568 (UV)(RExC_parse - RExC_start),
786e8c11 10569 (UV)RExC_offsets[0]));
ccb2c380 10570 Set_Node_Offset(place, RExC_parse);
45948336 10571 Set_Node_Length(place, 1);
fac92740 10572 }
7122b237 10573#endif
c277df42
IZ
10574 src = NEXTOPER(place);
10575 FILL_ADVANCE_NODE(place, op);
10576 Zero(src, offset, regnode);
a687059c
LW
10577}
10578
10579/*
c277df42 10580- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 10581- SEE ALSO: regtail_study
a0d0e21e 10582*/
097eb12c 10583/* TODO: All three parms should be const */
76e3520e 10584STATIC void
3dab1dad 10585S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 10586{
97aff369 10587 dVAR;
c277df42 10588 register regnode *scan;
72f13be8 10589 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10590
10591 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
10592#ifndef DEBUGGING
10593 PERL_UNUSED_ARG(depth);
10594#endif
a0d0e21e 10595
c277df42 10596 if (SIZE_ONLY)
a0d0e21e
LW
10597 return;
10598
10599 /* Find last node. */
10600 scan = p;
10601 for (;;) {
504618e9 10602 regnode * const temp = regnext(scan);
3dab1dad
YO
10603 DEBUG_PARSE_r({
10604 SV * const mysv=sv_newmortal();
10605 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10606 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
10607 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10608 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10609 (temp == NULL ? "->" : ""),
13d6edb4 10610 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 10611 );
3dab1dad
YO
10612 });
10613 if (temp == NULL)
10614 break;
10615 scan = temp;
10616 }
10617
10618 if (reg_off_by_arg[OP(scan)]) {
10619 ARG_SET(scan, val - scan);
10620 }
10621 else {
10622 NEXT_OFF(scan) = val - scan;
10623 }
10624}
10625
07be1b83 10626#ifdef DEBUGGING
3dab1dad
YO
10627/*
10628- regtail_study - set the next-pointer at the end of a node chain of p to val.
10629- Look for optimizable sequences at the same time.
10630- currently only looks for EXACT chains.
07be1b83 10631
486ec47a 10632This is experimental code. The idea is to use this routine to perform
07be1b83
YO
10633in place optimizations on branches and groups as they are constructed,
10634with the long term intention of removing optimization from study_chunk so
10635that it is purely analytical.
10636
10637Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10638to control which is which.
10639
3dab1dad
YO
10640*/
10641/* TODO: All four parms should be const */
07be1b83 10642
3dab1dad
YO
10643STATIC U8
10644S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10645{
10646 dVAR;
10647 register regnode *scan;
07be1b83
YO
10648 U8 exact = PSEUDO;
10649#ifdef EXPERIMENTAL_INPLACESCAN
10650 I32 min = 0;
10651#endif
3dab1dad
YO
10652 GET_RE_DEBUG_FLAGS_DECL;
10653
7918f24d
NC
10654 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10655
07be1b83 10656
3dab1dad
YO
10657 if (SIZE_ONLY)
10658 return exact;
10659
10660 /* Find last node. */
10661
10662 scan = p;
10663 for (;;) {
10664 regnode * const temp = regnext(scan);
07be1b83
YO
10665#ifdef EXPERIMENTAL_INPLACESCAN
10666 if (PL_regkind[OP(scan)] == EXACT)
10667 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10668 return EXACT;
10669#endif
3dab1dad
YO
10670 if ( exact ) {
10671 switch (OP(scan)) {
10672 case EXACT:
10673 case EXACTF:
2f7f8cb1 10674 case EXACTFA:
2c2b7f86 10675 case EXACTFU:
3dab1dad
YO
10676 case EXACTFL:
10677 if( exact == PSEUDO )
10678 exact= OP(scan);
07be1b83
YO
10679 else if ( exact != OP(scan) )
10680 exact= 0;
3dab1dad
YO
10681 case NOTHING:
10682 break;
10683 default:
10684 exact= 0;
10685 }
10686 }
10687 DEBUG_PARSE_r({
10688 SV * const mysv=sv_newmortal();
10689 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10690 regprop(RExC_rx, mysv, scan);
eaf3ca90 10691 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 10692 SvPV_nolen_const(mysv),
eaf3ca90 10693 REG_NODE_NUM(scan),
13d6edb4 10694 PL_reg_name[exact]);
3dab1dad 10695 });
a0d0e21e
LW
10696 if (temp == NULL)
10697 break;
10698 scan = temp;
10699 }
07be1b83
YO
10700 DEBUG_PARSE_r({
10701 SV * const mysv_val=sv_newmortal();
10702 DEBUG_PARSE_MSG("");
10703 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
10704 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10705 SvPV_nolen_const(mysv_val),
10706 (IV)REG_NODE_NUM(val),
10707 (IV)(val - scan)
07be1b83
YO
10708 );
10709 });
c277df42
IZ
10710 if (reg_off_by_arg[OP(scan)]) {
10711 ARG_SET(scan, val - scan);
a0ed51b3
LW
10712 }
10713 else {
c277df42
IZ
10714 NEXT_OFF(scan) = val - scan;
10715 }
3dab1dad
YO
10716
10717 return exact;
a687059c 10718}
07be1b83 10719#endif
a687059c
LW
10720
10721/*
fd181c75 10722 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 10723 */
f7819f85 10724#ifdef DEBUGGING
c33269f7 10725static void
7918f24d
NC
10726S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10727{
f7819f85
A
10728 int bit;
10729 int set=0;
a62b1201 10730 regex_charset cs;
7918f24d 10731
f7819f85
A
10732 for (bit=0; bit<32; bit++) {
10733 if (flags & (1<<bit)) {
a62b1201
KW
10734 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10735 continue;
10736 }
f7819f85
A
10737 if (!set++ && lead)
10738 PerlIO_printf(Perl_debug_log, "%s",lead);
10739 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10740 }
10741 }
a62b1201
KW
10742 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10743 if (!set++ && lead) {
10744 PerlIO_printf(Perl_debug_log, "%s",lead);
10745 }
10746 switch (cs) {
10747 case REGEX_UNICODE_CHARSET:
10748 PerlIO_printf(Perl_debug_log, "UNICODE");
10749 break;
10750 case REGEX_LOCALE_CHARSET:
10751 PerlIO_printf(Perl_debug_log, "LOCALE");
10752 break;
cfaf538b
KW
10753 case REGEX_ASCII_RESTRICTED_CHARSET:
10754 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10755 break;
2f7f8cb1
KW
10756 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10757 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10758 break;
a62b1201
KW
10759 default:
10760 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10761 break;
10762 }
10763 }
f7819f85
A
10764 if (lead) {
10765 if (set)
10766 PerlIO_printf(Perl_debug_log, "\n");
10767 else
10768 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10769 }
10770}
10771#endif
10772
a687059c 10773void
097eb12c 10774Perl_regdump(pTHX_ const regexp *r)
a687059c 10775{
35ff7856 10776#ifdef DEBUGGING
97aff369 10777 dVAR;
c445ea15 10778 SV * const sv = sv_newmortal();
ab3bbdeb 10779 SV *dsv= sv_newmortal();
f8fc2ecf 10780 RXi_GET_DECL(r,ri);
f7819f85 10781 GET_RE_DEBUG_FLAGS_DECL;
a687059c 10782
7918f24d
NC
10783 PERL_ARGS_ASSERT_REGDUMP;
10784
f8fc2ecf 10785 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
10786
10787 /* Header fields of interest. */
ab3bbdeb
YO
10788 if (r->anchored_substr) {
10789 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10790 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 10791 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10792 "anchored %s%s at %"IVdf" ",
10793 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 10794 (IV)r->anchored_offset);
ab3bbdeb
YO
10795 } else if (r->anchored_utf8) {
10796 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10797 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 10798 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10799 "anchored utf8 %s%s at %"IVdf" ",
10800 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 10801 (IV)r->anchored_offset);
ab3bbdeb
YO
10802 }
10803 if (r->float_substr) {
10804 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10805 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 10806 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10807 "floating %s%s at %"IVdf"..%"UVuf" ",
10808 s, RE_SV_TAIL(r->float_substr),
7b0972df 10809 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
10810 } else if (r->float_utf8) {
10811 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10812 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 10813 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
10814 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10815 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 10816 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 10817 }
33b8afdf 10818 if (r->check_substr || r->check_utf8)
b81d288d 10819 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
10820 (const char *)
10821 (r->check_substr == r->float_substr
10822 && r->check_utf8 == r->float_utf8
10823 ? "(checking floating" : "(checking anchored"));
bbe252da 10824 if (r->extflags & RXf_NOSCAN)
c277df42 10825 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 10826 if (r->extflags & RXf_CHECK_ALL)
c277df42 10827 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 10828 if (r->check_substr || r->check_utf8)
c277df42
IZ
10829 PerlIO_printf(Perl_debug_log, ") ");
10830
f8fc2ecf
YO
10831 if (ri->regstclass) {
10832 regprop(r, sv, ri->regstclass);
1de06328 10833 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 10834 }
bbe252da 10835 if (r->extflags & RXf_ANCH) {
774d564b 10836 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 10837 if (r->extflags & RXf_ANCH_BOL)
774d564b 10838 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 10839 if (r->extflags & RXf_ANCH_MBOL)
c277df42 10840 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 10841 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 10842 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 10843 if (r->extflags & RXf_ANCH_GPOS)
774d564b 10844 PerlIO_printf(Perl_debug_log, "(GPOS)");
10845 PerlIO_putc(Perl_debug_log, ' ');
10846 }
bbe252da 10847 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 10848 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 10849 if (r->intflags & PREGf_SKIP)
760ac839 10850 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 10851 if (r->intflags & PREGf_IMPLICIT)
760ac839 10852 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 10853 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 10854 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 10855 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 10856 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 10857 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 10858#else
7918f24d 10859 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 10860 PERL_UNUSED_CONTEXT;
65e66c80 10861 PERL_UNUSED_ARG(r);
17c3b450 10862#endif /* DEBUGGING */
a687059c
LW
10863}
10864
10865/*
a0d0e21e
LW
10866- regprop - printable representation of opcode
10867*/
3339dfd8
YO
10868#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10869STMT_START { \
10870 if (do_sep) { \
10871 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10872 if (flags & ANYOF_INVERT) \
10873 /*make sure the invert info is in each */ \
10874 sv_catpvs(sv, "^"); \
10875 do_sep = 0; \
10876 } \
10877} STMT_END
10878
46fc3d4c 10879void
32fc9b6a 10880Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 10881{
35ff7856 10882#ifdef DEBUGGING
97aff369 10883 dVAR;
9b155405 10884 register int k;
f8fc2ecf 10885 RXi_GET_DECL(prog,progi);
1de06328 10886 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 10887
7918f24d 10888 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 10889
76f68e9b 10890 sv_setpvs(sv, "");
8aa23a47 10891
03363afd 10892 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
10893 /* It would be nice to FAIL() here, but this may be called from
10894 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 10895 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 10896 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 10897
3dab1dad 10898 k = PL_regkind[OP(o)];
9b155405 10899
2a782b5b 10900 if (k == EXACT) {
f92a2122 10901 sv_catpvs(sv, " ");
ab3bbdeb
YO
10902 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10903 * is a crude hack but it may be the best for now since
10904 * we have no flag "this EXACTish node was UTF-8"
10905 * --jhi */
f92a2122
NC
10906 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10907 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 10908 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
10909 PERL_PV_PRETTY_ELLIPSES |
10910 PERL_PV_PRETTY_LTGT |
10911 PERL_PV_PRETTY_NOCLEAR
10912 );
bb263b4e 10913 } else if (k == TRIE) {
3dab1dad 10914 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 10915 * progi->data isn't available here */
1de06328 10916 const char op = OP(o);
647f639f 10917 const U32 n = ARG(o);
1de06328 10918 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 10919 (reg_ac_data *)progi->data->data[n] :
1de06328 10920 NULL;
3251b653
NC
10921 const reg_trie_data * const trie
10922 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 10923
13d6edb4 10924 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
10925 DEBUG_TRIE_COMPILE_r(
10926 Perl_sv_catpvf(aTHX_ sv,
10927 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10928 (UV)trie->startstate,
1e2e3d02 10929 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
10930 (UV)trie->wordcount,
10931 (UV)trie->minlen,
10932 (UV)trie->maxlen,
10933 (UV)TRIE_CHARCOUNT(trie),
10934 (UV)trie->uniquecharcount
10935 )
10936 );
10937 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10938 int i;
10939 int rangestart = -1;
f46cb337 10940 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 10941 sv_catpvs(sv, "[");
1de06328
YO
10942 for (i = 0; i <= 256; i++) {
10943 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10944 if (rangestart == -1)
10945 rangestart = i;
10946 } else if (rangestart != -1) {
10947 if (i <= rangestart + 3)
10948 for (; rangestart < i; rangestart++)
10949 put_byte(sv, rangestart);
10950 else {
10951 put_byte(sv, rangestart);
10952 sv_catpvs(sv, "-");
10953 put_byte(sv, i - 1);
10954 }
10955 rangestart = -1;
10956 }
10957 }
f3a2811a 10958 sv_catpvs(sv, "]");
1de06328
YO
10959 }
10960
a3621e74 10961 } else if (k == CURLY) {
cb434fcc 10962 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
10963 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
10964 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 10965 }
2c2d71f5
JH
10966 else if (k == WHILEM && o->flags) /* Ordinal/of */
10967 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 10968 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 10969 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 10970 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 10971 if ( k != REF || (OP(o) < NREF)) {
502c6561 10972 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
10973 SV **name= av_fetch(list, ARG(o), 0 );
10974 if (name)
10975 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10976 }
10977 else {
502c6561 10978 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 10979 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
10980 I32 *nums=(I32*)SvPVX(sv_dat);
10981 SV **name= av_fetch(list, nums[0], 0 );
10982 I32 n;
10983 if (name) {
10984 for ( n=0; n<SvIVX(sv_dat); n++ ) {
10985 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
10986 (n ? "," : ""), (IV)nums[n]);
10987 }
10988 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 10989 }
1f1031fe 10990 }
ee9b8eae 10991 }
1f1031fe 10992 } else if (k == GOSUB)
6bda09f9 10993 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
10994 else if (k == VERB) {
10995 if (!o->flags)
10996 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 10997 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 10998 } else if (k == LOGICAL)
04ebc1ab 10999 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 11000 else if (k == FOLDCHAR)
df44d732 11001 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
11002 else if (k == ANYOF) {
11003 int i, rangestart = -1;
2d03de9c 11004 const U8 flags = ANYOF_FLAGS(o);
24d786f4 11005 int do_sep = 0;
0bd48802
AL
11006
11007 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11008 static const char * const anyofs[] = {
653099ff
GS
11009 "\\w",
11010 "\\W",
11011 "\\s",
11012 "\\S",
11013 "\\d",
11014 "\\D",
11015 "[:alnum:]",
11016 "[:^alnum:]",
11017 "[:alpha:]",
11018 "[:^alpha:]",
11019 "[:ascii:]",
11020 "[:^ascii:]",
24d786f4
YO
11021 "[:cntrl:]",
11022 "[:^cntrl:]",
653099ff
GS
11023 "[:graph:]",
11024 "[:^graph:]",
11025 "[:lower:]",
11026 "[:^lower:]",
11027 "[:print:]",
11028 "[:^print:]",
11029 "[:punct:]",
11030 "[:^punct:]",
11031 "[:upper:]",
aaa51d5e 11032 "[:^upper:]",
653099ff 11033 "[:xdigit:]",
aaa51d5e
JF
11034 "[:^xdigit:]",
11035 "[:space:]",
11036 "[:^space:]",
11037 "[:blank:]",
11038 "[:^blank:]"
653099ff
GS
11039 };
11040
19860706 11041 if (flags & ANYOF_LOCALE)
396482e1 11042 sv_catpvs(sv, "{loc}");
39065660 11043 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 11044 sv_catpvs(sv, "{i}");
653099ff 11045 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 11046 if (flags & ANYOF_INVERT)
396482e1 11047 sv_catpvs(sv, "^");
3339dfd8
YO
11048
11049 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
11050 for (i = 0; i <= 256; i++) {
11051 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11052 if (rangestart == -1)
11053 rangestart = i;
11054 } else if (rangestart != -1) {
11055 if (i <= rangestart + 3)
11056 for (; rangestart < i; rangestart++)
653099ff 11057 put_byte(sv, rangestart);
ffc61ed2
JH
11058 else {
11059 put_byte(sv, rangestart);
396482e1 11060 sv_catpvs(sv, "-");
ffc61ed2 11061 put_byte(sv, i - 1);
653099ff 11062 }
24d786f4 11063 do_sep = 1;
ffc61ed2 11064 rangestart = -1;
653099ff 11065 }
847a199f 11066 }
3339dfd8
YO
11067
11068 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
11069 /* output any special charclass tests (used entirely under use locale) */
11070 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 11071 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 11072 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 11073 sv_catpv(sv, anyofs[i]);
24d786f4
YO
11074 do_sep = 1;
11075 }
11076
3339dfd8
YO
11077 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11078
11454c59
KW
11079 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11080 sv_catpvs(sv, "{non-utf8-latin1-all}");
11081 }
11082
3339dfd8 11083 /* output information about the unicode matching */
ef87b810 11084 if (flags & ANYOF_UNICODE_ALL)
396482e1 11085 sv_catpvs(sv, "{unicode_all}");
137165a6 11086 else if (ANYOF_NONBITMAP(o))
ef87b810 11087 sv_catpvs(sv, "{unicode}");
f5ecd18d 11088 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 11089 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
11090
11091 {
11092 SV *lv;
32fc9b6a 11093 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 11094
ffc61ed2
JH
11095 if (lv) {
11096 if (sw) {
89ebb4a3 11097 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 11098
ffc61ed2 11099 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 11100 uvchr_to_utf8(s, i);
ffc61ed2 11101
3568d838 11102 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
11103 if (rangestart == -1)
11104 rangestart = i;
11105 } else if (rangestart != -1) {
ffc61ed2
JH
11106 if (i <= rangestart + 3)
11107 for (; rangestart < i; rangestart++) {
2d03de9c
AL
11108 const U8 * const e = uvchr_to_utf8(s,rangestart);
11109 U8 *p;
11110 for(p = s; p < e; p++)
ffc61ed2
JH
11111 put_byte(sv, *p);
11112 }
11113 else {
2d03de9c
AL
11114 const U8 *e = uvchr_to_utf8(s,rangestart);
11115 U8 *p;
11116 for (p = s; p < e; p++)
ffc61ed2 11117 put_byte(sv, *p);
396482e1 11118 sv_catpvs(sv, "-");
2d03de9c
AL
11119 e = uvchr_to_utf8(s, i-1);
11120 for (p = s; p < e; p++)
1df70142 11121 put_byte(sv, *p);
ffc61ed2
JH
11122 }
11123 rangestart = -1;
11124 }
19860706 11125 }
ffc61ed2 11126
396482e1 11127 sv_catpvs(sv, "..."); /* et cetera */
19860706 11128 }
fde631ed 11129
ffc61ed2 11130 {
2e0de35c 11131 char *s = savesvpv(lv);
c445ea15 11132 char * const origs = s;
b81d288d 11133
3dab1dad
YO
11134 while (*s && *s != '\n')
11135 s++;
b81d288d 11136
ffc61ed2 11137 if (*s == '\n') {
2d03de9c 11138 const char * const t = ++s;
ffc61ed2
JH
11139
11140 while (*s) {
11141 if (*s == '\n')
11142 *s = ' ';
11143 s++;
11144 }
11145 if (s[-1] == ' ')
11146 s[-1] = 0;
11147
11148 sv_catpv(sv, t);
fde631ed 11149 }
b81d288d 11150
ffc61ed2 11151 Safefree(origs);
fde631ed
JH
11152 }
11153 }
653099ff 11154 }
ffc61ed2 11155
653099ff
GS
11156 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11157 }
9b155405 11158 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 11159 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 11160#else
96a5add6 11161 PERL_UNUSED_CONTEXT;
65e66c80
SP
11162 PERL_UNUSED_ARG(sv);
11163 PERL_UNUSED_ARG(o);
f9049ba1 11164 PERL_UNUSED_ARG(prog);
17c3b450 11165#endif /* DEBUGGING */
35ff7856 11166}
a687059c 11167
cad2e5aa 11168SV *
288b8c02 11169Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 11170{ /* Assume that RE_INTUIT is set */
97aff369 11171 dVAR;
288b8c02 11172 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 11173 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11174
11175 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
11176 PERL_UNUSED_CONTEXT;
11177
a3621e74 11178 DEBUG_COMPILE_r(
cfd0369c 11179 {
2d03de9c 11180 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 11181 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
11182
11183 if (!PL_colorset) reginitcolors();
11184 PerlIO_printf(Perl_debug_log,
a0288114 11185 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
11186 PL_colors[4],
11187 prog->check_substr ? "" : "utf8 ",
11188 PL_colors[5],PL_colors[0],
cad2e5aa
JH
11189 s,
11190 PL_colors[1],
11191 (strlen(s) > 60 ? "..." : ""));
11192 } );
11193
33b8afdf 11194 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
11195}
11196
84da74a7 11197/*
f8149455 11198 pregfree()
84da74a7 11199
f8149455
YO
11200 handles refcounting and freeing the perl core regexp structure. When
11201 it is necessary to actually free the structure the first thing it
3b753521 11202 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
11203 the regexp, allowing the handling of the void *pprivate; member
11204 first. (This routine is not overridable by extensions, which is why
11205 the extensions free is called first.)
11206
11207 See regdupe and regdupe_internal if you change anything here.
84da74a7 11208*/
f8149455 11209#ifndef PERL_IN_XSUB_RE
2b69d0c2 11210void
84679df5 11211Perl_pregfree(pTHX_ REGEXP *r)
a687059c 11212{
288b8c02
NC
11213 SvREFCNT_dec(r);
11214}
11215
11216void
11217Perl_pregfree2(pTHX_ REGEXP *rx)
11218{
27da23d5 11219 dVAR;
288b8c02 11220 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 11221 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 11222
7918f24d
NC
11223 PERL_ARGS_ASSERT_PREGFREE2;
11224
28d8d7f4
YO
11225 if (r->mother_re) {
11226 ReREFCNT_dec(r->mother_re);
11227 } else {
288b8c02 11228 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 11229 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
11230 }
11231 if (r->substrs) {
ef8d46e8
VP
11232 SvREFCNT_dec(r->anchored_substr);
11233 SvREFCNT_dec(r->anchored_utf8);
11234 SvREFCNT_dec(r->float_substr);
11235 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
11236 Safefree(r->substrs);
11237 }
288b8c02 11238 RX_MATCH_COPY_FREE(rx);
f8c7b90f 11239#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 11240 SvREFCNT_dec(r->saved_copy);
ed252734 11241#endif
f0ab9afb 11242 Safefree(r->offs);
f8149455 11243}
28d8d7f4
YO
11244
11245/* reg_temp_copy()
11246
11247 This is a hacky workaround to the structural issue of match results
11248 being stored in the regexp structure which is in turn stored in
11249 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11250 could be PL_curpm in multiple contexts, and could require multiple
11251 result sets being associated with the pattern simultaneously, such
11252 as when doing a recursive match with (??{$qr})
11253
11254 The solution is to make a lightweight copy of the regexp structure
11255 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 11256 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
11257 the starp/end and the actual regexp structure itself.
11258
11259*/
11260
11261
84679df5 11262REGEXP *
f0826785 11263Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 11264{
f0826785 11265 struct regexp *ret;
288b8c02 11266 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 11267 register const I32 npar = r->nparens+1;
7918f24d
NC
11268
11269 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11270
f0826785
BM
11271 if (!ret_x)
11272 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11273 ret = (struct regexp *)SvANY(ret_x);
11274
288b8c02 11275 (void)ReREFCNT_inc(rx);
f7c278bf
NC
11276 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11277 by pointing directly at the buffer, but flagging that the allocated
11278 space in the copy is zero. As we've just done a struct copy, it's now
11279 a case of zero-ing that, rather than copying the current length. */
11280 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 11281 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
11282 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11283 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 11284 SvLEN_set(ret_x, 0);
b9ad13ac 11285 SvSTASH_set(ret_x, NULL);
703c388d 11286 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
11287 Newx(ret->offs, npar, regexp_paren_pair);
11288 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 11289 if (r->substrs) {
28d8d7f4 11290 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
11291 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11292
11293 SvREFCNT_inc_void(ret->anchored_substr);
11294 SvREFCNT_inc_void(ret->anchored_utf8);
11295 SvREFCNT_inc_void(ret->float_substr);
11296 SvREFCNT_inc_void(ret->float_utf8);
11297
11298 /* check_substr and check_utf8, if non-NULL, point to either their
11299 anchored or float namesakes, and don't hold a second reference. */
486913e4 11300 }
288b8c02 11301 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 11302#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 11303 ret->saved_copy = NULL;
28d8d7f4 11304#endif
288b8c02 11305 ret->mother_re = rx;
28d8d7f4 11306
288b8c02 11307 return ret_x;
28d8d7f4 11308}
f8149455
YO
11309#endif
11310
11311/* regfree_internal()
11312
11313 Free the private data in a regexp. This is overloadable by
11314 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 11315 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
11316 know about, however of course we have to handle the
11317 regexp_internal structure when no extension is in use.
11318
11319 Note this is called before freeing anything in the regexp
11320 structure.
11321 */
11322
11323void
288b8c02 11324Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
11325{
11326 dVAR;
288b8c02 11327 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11328 RXi_GET_DECL(r,ri);
11329 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11330
11331 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11332
f8149455
YO
11333 DEBUG_COMPILE_r({
11334 if (!PL_colorset)
11335 reginitcolors();
11336 {
11337 SV *dsv= sv_newmortal();
3c8556c3 11338 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 11339 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
11340 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11341 PL_colors[4],PL_colors[5],s);
11342 }
11343 });
7122b237
YO
11344#ifdef RE_TRACK_PATTERN_OFFSETS
11345 if (ri->u.offsets)
11346 Safefree(ri->u.offsets); /* 20010421 MJD */
11347#endif
f8fc2ecf
YO
11348 if (ri->data) {
11349 int n = ri->data->count;
f3548bdc
DM
11350 PAD* new_comppad = NULL;
11351 PAD* old_comppad;
4026c95a 11352 PADOFFSET refcnt;
dfad63ad 11353
c277df42 11354 while (--n >= 0) {
261faec3 11355 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 11356 switch (ri->data->what[n]) {
af534a04 11357 case 'a':
c277df42 11358 case 's':
81714fb9 11359 case 'S':
55eed653 11360 case 'u':
ad64d0ec 11361 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 11362 break;
653099ff 11363 case 'f':
f8fc2ecf 11364 Safefree(ri->data->data[n]);
653099ff 11365 break;
dfad63ad 11366 case 'p':
502c6561 11367 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 11368 break;
c277df42 11369 case 'o':
dfad63ad 11370 if (new_comppad == NULL)
cea2e8a9 11371 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
11372 PAD_SAVE_LOCAL(old_comppad,
11373 /* Watch out for global destruction's random ordering. */
c445ea15 11374 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 11375 );
b34c0dd4 11376 OP_REFCNT_LOCK;
f8fc2ecf 11377 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
11378 OP_REFCNT_UNLOCK;
11379 if (!refcnt)
f8fc2ecf 11380 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 11381
f3548bdc 11382 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 11383 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 11384 new_comppad = NULL;
c277df42
IZ
11385 break;
11386 case 'n':
9e55ce06 11387 break;
07be1b83 11388 case 'T':
be8e71aa
YO
11389 { /* Aho Corasick add-on structure for a trie node.
11390 Used in stclass optimization only */
07be1b83 11391 U32 refcount;
f8fc2ecf 11392 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
11393 OP_REFCNT_LOCK;
11394 refcount = --aho->refcount;
11395 OP_REFCNT_UNLOCK;
11396 if ( !refcount ) {
446bd890
NC
11397 PerlMemShared_free(aho->states);
11398 PerlMemShared_free(aho->fail);
446bd890
NC
11399 /* do this last!!!! */
11400 PerlMemShared_free(ri->data->data[n]);
11401 PerlMemShared_free(ri->regstclass);
07be1b83
YO
11402 }
11403 }
11404 break;
a3621e74 11405 case 't':
07be1b83 11406 {
be8e71aa 11407 /* trie structure. */
07be1b83 11408 U32 refcount;
f8fc2ecf 11409 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
11410 OP_REFCNT_LOCK;
11411 refcount = --trie->refcount;
11412 OP_REFCNT_UNLOCK;
11413 if ( !refcount ) {
446bd890 11414 PerlMemShared_free(trie->charmap);
446bd890
NC
11415 PerlMemShared_free(trie->states);
11416 PerlMemShared_free(trie->trans);
07be1b83 11417 if (trie->bitmap)
446bd890 11418 PerlMemShared_free(trie->bitmap);
786e8c11 11419 if (trie->jump)
446bd890 11420 PerlMemShared_free(trie->jump);
2e64971a 11421 PerlMemShared_free(trie->wordinfo);
446bd890
NC
11422 /* do this last!!!! */
11423 PerlMemShared_free(ri->data->data[n]);
a3621e74 11424 }
07be1b83
YO
11425 }
11426 break;
c277df42 11427 default:
f8fc2ecf 11428 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
11429 }
11430 }
f8fc2ecf
YO
11431 Safefree(ri->data->what);
11432 Safefree(ri->data);
a0d0e21e 11433 }
28d8d7f4 11434
f8fc2ecf 11435 Safefree(ri);
a687059c 11436}
c277df42 11437
a09252eb
NC
11438#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11439#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
11440#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11441
11442/*
32cd70f6 11443 re_dup - duplicate a regexp.
84da74a7 11444
8233f606
DM
11445 This routine is expected to clone a given regexp structure. It is only
11446 compiled under USE_ITHREADS.
32cd70f6 11447
f8149455
YO
11448 After all of the core data stored in struct regexp is duplicated
11449 the regexp_engine.dupe method is used to copy any private data
11450 stored in the *pprivate pointer. This allows extensions to handle
11451 any duplication it needs to do.
11452
11453 See pregfree() and regfree_internal() if you change anything here.
84da74a7 11454*/
a3c0e9ca 11455#if defined(USE_ITHREADS)
f8149455 11456#ifndef PERL_IN_XSUB_RE
288b8c02
NC
11457void
11458Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 11459{
84da74a7 11460 dVAR;
a86a1ca7 11461 I32 npar;
288b8c02
NC
11462 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11463 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 11464
7918f24d
NC
11465 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11466
84da74a7 11467 npar = r->nparens+1;
f0ab9afb
NC
11468 Newx(ret->offs, npar, regexp_paren_pair);
11469 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 11470 if(ret->swap) {
28d8d7f4 11471 /* no need to copy these */
f0ab9afb 11472 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 11473 }
84da74a7 11474
6057429f 11475 if (ret->substrs) {
32cd70f6
NC
11476 /* Do it this way to avoid reading from *r after the StructCopy().
11477 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11478 cache, it doesn't matter. */
66b1de87
NC
11479 const bool anchored = r->check_substr
11480 ? r->check_substr == r->anchored_substr
11481 : r->check_utf8 == r->anchored_utf8;
785a26d5 11482 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
11483 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11484
32cd70f6
NC
11485 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11486 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11487 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11488 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 11489
32cd70f6
NC
11490 /* check_substr and check_utf8, if non-NULL, point to either their
11491 anchored or float namesakes, and don't hold a second reference. */
11492
11493 if (ret->check_substr) {
11494 if (anchored) {
11495 assert(r->check_utf8 == r->anchored_utf8);
11496 ret->check_substr = ret->anchored_substr;
11497 ret->check_utf8 = ret->anchored_utf8;
11498 } else {
11499 assert(r->check_substr == r->float_substr);
11500 assert(r->check_utf8 == r->float_utf8);
11501 ret->check_substr = ret->float_substr;
11502 ret->check_utf8 = ret->float_utf8;
11503 }
66b1de87
NC
11504 } else if (ret->check_utf8) {
11505 if (anchored) {
11506 ret->check_utf8 = ret->anchored_utf8;
11507 } else {
11508 ret->check_utf8 = ret->float_utf8;
11509 }
32cd70f6 11510 }
6057429f 11511 }
f8149455 11512
5daac39c 11513 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 11514
6057429f 11515 if (ret->pprivate)
288b8c02 11516 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 11517
288b8c02 11518 if (RX_MATCH_COPIED(dstr))
6057429f 11519 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
11520 else
11521 ret->subbeg = NULL;
11522#ifdef PERL_OLD_COPY_ON_WRITE
11523 ret->saved_copy = NULL;
11524#endif
6057429f 11525
c2123ae3
NC
11526 if (ret->mother_re) {
11527 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11528 /* Our storage points directly to our mother regexp, but that's
11529 1: a buffer in a different thread
11530 2: something we no longer hold a reference on
11531 so we need to copy it locally. */
11532 /* Note we need to sue SvCUR() on our mother_re, because it, in
11533 turn, may well be pointing to its own mother_re. */
11534 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11535 SvCUR(ret->mother_re)+1));
11536 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11537 }
11538 ret->mother_re = NULL;
11539 }
6057429f 11540 ret->gofs = 0;
f8149455
YO
11541}
11542#endif /* PERL_IN_XSUB_RE */
11543
11544/*
11545 regdupe_internal()
11546
11547 This is the internal complement to regdupe() which is used to copy
11548 the structure pointed to by the *pprivate pointer in the regexp.
11549 This is the core version of the extension overridable cloning hook.
11550 The regexp structure being duplicated will be copied by perl prior
11551 to this and will be provided as the regexp *r argument, however
11552 with the /old/ structures pprivate pointer value. Thus this routine
11553 may override any copying normally done by perl.
11554
11555 It returns a pointer to the new regexp_internal structure.
11556*/
11557
11558void *
288b8c02 11559Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
11560{
11561 dVAR;
288b8c02 11562 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
11563 regexp_internal *reti;
11564 int len, npar;
11565 RXi_GET_DECL(r,ri);
7918f24d
NC
11566
11567 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
11568
11569 npar = r->nparens+1;
7122b237 11570 len = ProgLen(ri);
f8149455 11571
45cf4570 11572 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
11573 Copy(ri->program, reti->program, len+1, regnode);
11574
f8149455 11575
f8fc2ecf 11576 reti->regstclass = NULL;
bcdf7404 11577
f8fc2ecf 11578 if (ri->data) {
84da74a7 11579 struct reg_data *d;
f8fc2ecf 11580 const int count = ri->data->count;
84da74a7
YO
11581 int i;
11582
11583 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11584 char, struct reg_data);
11585 Newx(d->what, count, U8);
11586
11587 d->count = count;
11588 for (i = 0; i < count; i++) {
f8fc2ecf 11589 d->what[i] = ri->data->what[i];
84da74a7 11590 switch (d->what[i]) {
af534a04 11591 /* legal options are one of: sSfpontTua
84da74a7 11592 see also regcomp.h and pregfree() */
af534a04 11593 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 11594 case 's':
81714fb9 11595 case 'S':
0536c0a7 11596 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 11597 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 11598 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 11599 break;
84da74a7
YO
11600 case 'f':
11601 /* This is cheating. */
11602 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 11603 StructCopy(ri->data->data[i], d->data[i],
84da74a7 11604 struct regnode_charclass_class);
f8fc2ecf 11605 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
11606 break;
11607 case 'o':
bbe252da
YO
11608 /* Compiled op trees are readonly and in shared memory,
11609 and can thus be shared without duplication. */
84da74a7 11610 OP_REFCNT_LOCK;
f8fc2ecf 11611 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
11612 OP_REFCNT_UNLOCK;
11613 break;
23eab42c
NC
11614 case 'T':
11615 /* Trie stclasses are readonly and can thus be shared
11616 * without duplication. We free the stclass in pregfree
11617 * when the corresponding reg_ac_data struct is freed.
11618 */
11619 reti->regstclass= ri->regstclass;
11620 /* Fall through */
84da74a7 11621 case 't':
84da74a7 11622 OP_REFCNT_LOCK;
0536c0a7 11623 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 11624 OP_REFCNT_UNLOCK;
0536c0a7
NC
11625 /* Fall through */
11626 case 'n':
11627 d->data[i] = ri->data->data[i];
84da74a7 11628 break;
84da74a7 11629 default:
f8fc2ecf 11630 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
11631 }
11632 }
11633
f8fc2ecf 11634 reti->data = d;
84da74a7
YO
11635 }
11636 else
f8fc2ecf 11637 reti->data = NULL;
84da74a7 11638
cde0cee5
YO
11639 reti->name_list_idx = ri->name_list_idx;
11640
7122b237
YO
11641#ifdef RE_TRACK_PATTERN_OFFSETS
11642 if (ri->u.offsets) {
11643 Newx(reti->u.offsets, 2*len+1, U32);
11644 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11645 }
11646#else
11647 SetProgLen(reti,len);
11648#endif
11649
f8149455 11650 return (void*)reti;
84da74a7 11651}
f8149455
YO
11652
11653#endif /* USE_ITHREADS */
84da74a7 11654
f8149455 11655#ifndef PERL_IN_XSUB_RE
bcdf7404 11656
c277df42
IZ
11657/*
11658 - regnext - dig the "next" pointer out of a node
c277df42
IZ
11659 */
11660regnode *
864dbfa3 11661Perl_regnext(pTHX_ register regnode *p)
c277df42 11662{
97aff369 11663 dVAR;
c277df42
IZ
11664 register I32 offset;
11665
f8fc2ecf 11666 if (!p)
c277df42
IZ
11667 return(NULL);
11668
35db910f
KW
11669 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11670 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11671 }
11672
c277df42
IZ
11673 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11674 if (offset == 0)
11675 return(NULL);
11676
c277df42 11677 return(p+offset);
c277df42 11678}
76234dfb 11679#endif
c277df42 11680
01f988be 11681STATIC void
cea2e8a9 11682S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
11683{
11684 va_list args;
11685 STRLEN l1 = strlen(pat1);
11686 STRLEN l2 = strlen(pat2);
11687 char buf[512];
06bf62c7 11688 SV *msv;
73d840c0 11689 const char *message;
c277df42 11690
7918f24d
NC
11691 PERL_ARGS_ASSERT_RE_CROAK2;
11692
c277df42
IZ
11693 if (l1 > 510)
11694 l1 = 510;
11695 if (l1 + l2 > 510)
11696 l2 = 510 - l1;
11697 Copy(pat1, buf, l1 , char);
11698 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
11699 buf[l1 + l2] = '\n';
11700 buf[l1 + l2 + 1] = '\0';
8736538c
AS
11701#ifdef I_STDARG
11702 /* ANSI variant takes additional second argument */
c277df42 11703 va_start(args, pat2);
8736538c
AS
11704#else
11705 va_start(args);
11706#endif
5a844595 11707 msv = vmess(buf, &args);
c277df42 11708 va_end(args);
cfd0369c 11709 message = SvPV_const(msv,l1);
c277df42
IZ
11710 if (l1 > 512)
11711 l1 = 512;
11712 Copy(message, buf, l1 , char);
197cf9b9 11713 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 11714 Perl_croak(aTHX_ "%s", buf);
c277df42 11715}
a0ed51b3
LW
11716
11717/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11718
76234dfb 11719#ifndef PERL_IN_XSUB_RE
a0ed51b3 11720void
864dbfa3 11721Perl_save_re_context(pTHX)
b81d288d 11722{
97aff369 11723 dVAR;
1ade1aa1
NC
11724
11725 struct re_save_state *state;
11726
11727 SAVEVPTR(PL_curcop);
11728 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11729
11730 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11731 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 11732 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 11733
46ab3289 11734 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 11735
a0ed51b3 11736 PL_reg_start_tmp = 0;
a0ed51b3 11737 PL_reg_start_tmpl = 0;
c445ea15 11738 PL_reg_oldsaved = NULL;
a5db57d6 11739 PL_reg_oldsavedlen = 0;
a5db57d6 11740 PL_reg_maxiter = 0;
a5db57d6 11741 PL_reg_leftiter = 0;
c445ea15 11742 PL_reg_poscache = NULL;
a5db57d6 11743 PL_reg_poscache_size = 0;
1ade1aa1
NC
11744#ifdef PERL_OLD_COPY_ON_WRITE
11745 PL_nrs = NULL;
11746#endif
ada6e8a9 11747
c445ea15
AL
11748 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11749 if (PL_curpm) {
11750 const REGEXP * const rx = PM_GETRE(PL_curpm);
11751 if (rx) {
1df70142 11752 U32 i;
07bc277f 11753 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 11754 char digits[TYPE_CHARS(long)];
d9fad198 11755 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
11756 GV *const *const gvp
11757 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11758
b37c2d43
AL
11759 if (gvp) {
11760 GV * const gv = *gvp;
11761 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11762 save_scalar(gv);
49f27e4b 11763 }
ada6e8a9
AMS
11764 }
11765 }
11766 }
a0ed51b3 11767}
76234dfb 11768#endif
51371543 11769
51371543 11770static void
acfe0abc 11771clear_re(pTHX_ void *r)
51371543 11772{
97aff369 11773 dVAR;
84679df5 11774 ReREFCNT_dec((REGEXP *)r);
51371543 11775}
ffbc6a93 11776
a28509cc
AL
11777#ifdef DEBUGGING
11778
11779STATIC void
11780S_put_byte(pTHX_ SV *sv, int c)
11781{
7918f24d
NC
11782 PERL_ARGS_ASSERT_PUT_BYTE;
11783
7fddd944
NC
11784 /* Our definition of isPRINT() ignores locales, so only bytes that are
11785 not part of UTF-8 are considered printable. I assume that the same
11786 holds for UTF-EBCDIC.
11787 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11788 which Wikipedia says:
11789
11790 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11791 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11792 identical, to the ASCII delete (DEL) or rubout control character.
11793 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
11794 if (!isPRINT(c)) {
11795 if (c < 256) {
11796 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11797 }
11798 else {
11799 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11800 }
11801 }
5e7aa789 11802 else {
88c9ea1e 11803 const char string = c;
5e7aa789
NC
11804 if (c == '-' || c == ']' || c == '\\' || c == '^')
11805 sv_catpvs(sv, "\\");
11806 sv_catpvn(sv, &string, 1);
11807 }
a28509cc
AL
11808}
11809
786e8c11 11810
3dab1dad
YO
11811#define CLEAR_OPTSTART \
11812 if (optstart) STMT_START { \
70685ca0 11813 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
11814 optstart=NULL; \
11815 } STMT_END
11816
786e8c11 11817#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 11818
b5a2f8d8
NC
11819STATIC const regnode *
11820S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
11821 const regnode *last, const regnode *plast,
11822 SV* sv, I32 indent, U32 depth)
a28509cc 11823{
97aff369 11824 dVAR;
786e8c11 11825 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 11826 register const regnode *next;
3dab1dad 11827 const regnode *optstart= NULL;
1f1031fe 11828
f8fc2ecf 11829 RXi_GET_DECL(r,ri);
3dab1dad 11830 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11831
11832 PERL_ARGS_ASSERT_DUMPUNTIL;
11833
786e8c11
YO
11834#ifdef DEBUG_DUMPUNTIL
11835 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11836 last ? last-start : 0,plast ? plast-start : 0);
11837#endif
11838
11839 if (plast && plast < last)
11840 last= plast;
11841
11842 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 11843 /* While that wasn't END last time... */
a28509cc
AL
11844 NODE_ALIGN(node);
11845 op = OP(node);
de734bd5 11846 if (op == CLOSE || op == WHILEM)
786e8c11 11847 indent--;
b5a2f8d8 11848 next = regnext((regnode *)node);
1f1031fe 11849
a28509cc 11850 /* Where, what. */
8e11feef 11851 if (OP(node) == OPTIMIZED) {
e68ec53f 11852 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 11853 optstart = node;
3dab1dad 11854 else
8e11feef 11855 goto after_print;
3dab1dad
YO
11856 } else
11857 CLEAR_OPTSTART;
1f1031fe 11858
32fc9b6a 11859 regprop(r, sv, node);
a28509cc 11860 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 11861 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
11862
11863 if (OP(node) != OPTIMIZED) {
11864 if (next == NULL) /* Next ptr. */
11865 PerlIO_printf(Perl_debug_log, " (0)");
11866 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11867 PerlIO_printf(Perl_debug_log, " (FAIL)");
11868 else
11869 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11870 (void)PerlIO_putc(Perl_debug_log, '\n');
11871 }
11872
a28509cc
AL
11873 after_print:
11874 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
11875 assert(next);
11876 {
11877 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
11878 ? regnext((regnode *)next)
11879 : next);
be8e71aa
YO
11880 if (last && nnode > last)
11881 nnode = last;
786e8c11 11882 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 11883 }
a28509cc
AL
11884 }
11885 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 11886 assert(next);
786e8c11 11887 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
11888 }
11889 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 11890 const regnode *this_trie = node;
1de06328 11891 const char op = OP(node);
647f639f 11892 const U32 n = ARG(node);
1de06328 11893 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 11894 (reg_ac_data *)ri->data->data[n] :
1de06328 11895 NULL;
3251b653
NC
11896 const reg_trie_data * const trie =
11897 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 11898#ifdef DEBUGGING
502c6561 11899 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 11900#endif
786e8c11 11901 const regnode *nextbranch= NULL;
a28509cc 11902 I32 word_idx;
76f68e9b 11903 sv_setpvs(sv, "");
786e8c11 11904 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 11905 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
11906
11907 PerlIO_printf(Perl_debug_log, "%*s%s ",
11908 (int)(2*(indent+3)), "",
11909 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
11910 PL_colors[0], PL_colors[1],
11911 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 11912 PERL_PV_PRETTY_ELLIPSES |
7f69552c 11913 PERL_PV_PRETTY_LTGT
786e8c11
YO
11914 )
11915 : "???"
11916 );
11917 if (trie->jump) {
40d049e4 11918 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
11919 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11920 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
11921 if (dist) {
11922 if (!nextbranch)
24b23f37 11923 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
11924 DUMPUNTIL(this_trie + dist, nextbranch);
11925 }
786e8c11
YO
11926 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11927 nextbranch= regnext((regnode *)nextbranch);
11928 } else {
11929 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 11930 }
786e8c11
YO
11931 }
11932 if (last && next > last)
11933 node= last;
11934 else
11935 node= next;
a28509cc 11936 }
786e8c11
YO
11937 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
11938 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11939 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
11940 }
11941 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 11942 assert(next);
786e8c11 11943 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
11944 }
11945 else if ( op == PLUS || op == STAR) {
786e8c11 11946 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 11947 }
f56b6394 11948 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 11949 /* arglen 1 + class block */
4a3ee7a8 11950 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
11951 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11952 node = NEXTOPER(node);
11953 }
11954 else if (PL_regkind[(U8)op] == EXACT) {
11955 /* Literal string, where present. */
11956 node += NODE_SZ_STR(node) - 1;
11957 node = NEXTOPER(node);
11958 }
11959 else {
11960 node = NEXTOPER(node);
11961 node += regarglen[(U8)op];
11962 }
11963 if (op == CURLYX || op == OPEN)
786e8c11 11964 indent++;
a28509cc 11965 }
3dab1dad 11966 CLEAR_OPTSTART;
786e8c11 11967#ifdef DEBUG_DUMPUNTIL
70685ca0 11968 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 11969#endif
1de06328 11970 return node;
a28509cc
AL
11971}
11972
11973#endif /* DEBUGGING */
11974
241d1a3b
NC
11975/*
11976 * Local variables:
11977 * c-indentation-style: bsd
11978 * c-basic-offset: 4
11979 * indent-tabs-mode: t
11980 * End:
11981 *
37442d52
RGS
11982 * ex: set ts=8 sts=4 sw=4 noet:
11983 */