This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re-generated Configure after the spell-check fixes from Peter J. Acklam
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
486ec47a 283 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
486ec47a 319 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 324 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
39065660 713 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
653099ff
GS
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
1e6ade67
KW
766
767 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
768 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 769 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
770 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
771 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
772 int i;
773
774 if (and_with->flags & ANYOF_INVERT)
775 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
776 cl->bitmap[i] &= ~and_with->bitmap[i];
777 else
778 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
779 cl->bitmap[i] &= and_with->bitmap[i];
780 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
781 if (!(and_with->flags & ANYOF_EOS))
782 cl->flags &= ~ANYOF_EOS;
1aa99e6b 783
39065660
KW
784 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
785 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
8951c461 786
3ff7ceb3 787 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 788 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 789 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
790 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
791 only the one(s)
792 actually set */
1aa99e6b
IH
793 ARG_SET(cl, ARG(and_with));
794 }
14ebb1a2
JH
795 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
796 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 797 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 798 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 799 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 800 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
801}
802
803/* 'OR' a given class with another one. Can create false positives */
804/* We assume that cl is not inverted */
805STATIC void
097eb12c 806S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 807{
7918f24d
NC
808 PERL_ARGS_ASSERT_CL_OR;
809
653099ff
GS
810 if (or_with->flags & ANYOF_INVERT) {
811 /* We do not use
812 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
813 * <= (B1 | !B2) | (CL1 | !CL2)
814 * which is wasteful if CL2 is small, but we ignore CL2:
815 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
816 * XXXX Can we handle case-fold? Unclear:
817 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
818 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
819 */
820 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
821 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
822 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
823 int i;
824
825 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
826 cl->bitmap[i] |= ~or_with->bitmap[i];
827 } /* XXXX: logic is complicated otherwise */
828 else {
830247a4 829 cl_anything(pRExC_state, cl);
653099ff
GS
830 }
831 } else {
832 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
833 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
834 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
835 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
836 int i;
837
838 /* OR char bitmap and class bitmap separately */
839 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
840 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 841 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
842 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
843 cl->classflags[i] |= or_with->classflags[i];
844 cl->flags |= ANYOF_CLASS;
845 }
846 }
847 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 848 cl_anything(pRExC_state, cl);
653099ff
GS
849 }
850 }
851 if (or_with->flags & ANYOF_EOS)
852 cl->flags |= ANYOF_EOS;
1aa99e6b 853
39065660
KW
854 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
855 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 856
9826f543
KW
857 /* If both nodes match something outside the bitmap, but what they match
858 * outside is not the same pointer, and hence not easily compared, give up
859 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 860 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
861 ARG(cl) != ARG(or_with)) {
862 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 863 }
9826f543 864
1aa99e6b
IH
865 if (or_with->flags & ANYOF_UNICODE_ALL) {
866 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 867 }
653099ff
GS
868}
869
a3621e74
YO
870#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
871#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
872#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
873#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
874
3dab1dad
YO
875
876#ifdef DEBUGGING
07be1b83 877/*
2b8b4781
NC
878 dump_trie(trie,widecharmap,revcharmap)
879 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
880 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
881
882 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
883 The _interim_ variants are used for debugging the interim
884 tables that are used to generate the final compressed
885 representation which is what dump_trie expects.
886
486ec47a 887 Part of the reason for their existence is to provide a form
3dab1dad 888 of documentation as to how the different representations function.
07be1b83
YO
889
890*/
3dab1dad
YO
891
892/*
3dab1dad
YO
893 Dumps the final compressed table form of the trie to Perl_debug_log.
894 Used for debugging make_trie().
895*/
b9a59e08 896
3dab1dad 897STATIC void
2b8b4781
NC
898S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
899 AV *revcharmap, U32 depth)
3dab1dad
YO
900{
901 U32 state;
ab3bbdeb 902 SV *sv=sv_newmortal();
55eed653 903 int colwidth= widecharmap ? 6 : 4;
2e64971a 904 U16 word;
3dab1dad
YO
905 GET_RE_DEBUG_FLAGS_DECL;
906
7918f24d 907 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 908
3dab1dad
YO
909 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
910 (int)depth * 2 + 2,"",
911 "Match","Base","Ofs" );
912
913 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 914 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 915 if ( tmp ) {
ab3bbdeb
YO
916 PerlIO_printf( Perl_debug_log, "%*s",
917 colwidth,
ddc5bc0f 918 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
919 PL_colors[0], PL_colors[1],
920 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
921 PERL_PV_ESCAPE_FIRSTCHAR
922 )
923 );
3dab1dad
YO
924 }
925 }
926 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
927 (int)depth * 2 + 2,"");
928
929 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 930 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
931 PerlIO_printf( Perl_debug_log, "\n");
932
1e2e3d02 933 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 934 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
935
936 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
937
938 if ( trie->states[ state ].wordnum ) {
939 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
940 } else {
941 PerlIO_printf( Perl_debug_log, "%6s", "" );
942 }
943
944 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
945
946 if ( base ) {
947 U32 ofs = 0;
948
949 while( ( base + ofs < trie->uniquecharcount ) ||
950 ( base + ofs - trie->uniquecharcount < trie->lasttrans
951 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
952 ofs++;
953
954 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
955
956 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
957 if ( ( base + ofs >= trie->uniquecharcount ) &&
958 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
959 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
960 {
ab3bbdeb
YO
961 PerlIO_printf( Perl_debug_log, "%*"UVXf,
962 colwidth,
3dab1dad
YO
963 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
964 } else {
ab3bbdeb 965 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
966 }
967 }
968
969 PerlIO_printf( Perl_debug_log, "]");
970
971 }
972 PerlIO_printf( Perl_debug_log, "\n" );
973 }
2e64971a
DM
974 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
975 for (word=1; word <= trie->wordcount; word++) {
976 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
977 (int)word, (int)(trie->wordinfo[word].prev),
978 (int)(trie->wordinfo[word].len));
979 }
980 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
981}
982/*
3dab1dad
YO
983 Dumps a fully constructed but uncompressed trie in list form.
984 List tries normally only are used for construction when the number of
985 possible chars (trie->uniquecharcount) is very high.
986 Used for debugging make_trie().
987*/
988STATIC void
55eed653 989S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
990 HV *widecharmap, AV *revcharmap, U32 next_alloc,
991 U32 depth)
3dab1dad
YO
992{
993 U32 state;
ab3bbdeb 994 SV *sv=sv_newmortal();
55eed653 995 int colwidth= widecharmap ? 6 : 4;
3dab1dad 996 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
997
998 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
999
3dab1dad 1000 /* print out the table precompression. */
ab3bbdeb
YO
1001 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1002 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1003 "------:-----+-----------------\n" );
3dab1dad
YO
1004
1005 for( state=1 ; state < next_alloc ; state ++ ) {
1006 U16 charid;
1007
ab3bbdeb 1008 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1009 (int)depth * 2 + 2,"", (UV)state );
1010 if ( ! trie->states[ state ].wordnum ) {
1011 PerlIO_printf( Perl_debug_log, "%5s| ","");
1012 } else {
1013 PerlIO_printf( Perl_debug_log, "W%4x| ",
1014 trie->states[ state ].wordnum
1015 );
1016 }
1017 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1018 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1019 if ( tmp ) {
1020 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021 colwidth,
ddc5bc0f 1022 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1023 PL_colors[0], PL_colors[1],
1024 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1025 PERL_PV_ESCAPE_FIRSTCHAR
1026 ) ,
1e2e3d02
YO
1027 TRIE_LIST_ITEM(state,charid).forid,
1028 (UV)TRIE_LIST_ITEM(state,charid).newstate
1029 );
1030 if (!(charid % 10))
664e119d
RGS
1031 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1032 (int)((depth * 2) + 14), "");
1e2e3d02 1033 }
ab3bbdeb
YO
1034 }
1035 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1036 }
1037}
1038
1039/*
3dab1dad
YO
1040 Dumps a fully constructed but uncompressed trie in table form.
1041 This is the normal DFA style state transition table, with a few
1042 twists to facilitate compression later.
1043 Used for debugging make_trie().
1044*/
1045STATIC void
55eed653 1046S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1047 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1048 U32 depth)
3dab1dad
YO
1049{
1050 U32 state;
1051 U16 charid;
ab3bbdeb 1052 SV *sv=sv_newmortal();
55eed653 1053 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1054 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1055
1056 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1057
1058 /*
1059 print out the table precompression so that we can do a visual check
1060 that they are identical.
1061 */
1062
1063 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064
1065 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1066 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1067 if ( tmp ) {
ab3bbdeb
YO
1068 PerlIO_printf( Perl_debug_log, "%*s",
1069 colwidth,
ddc5bc0f 1070 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1071 PL_colors[0], PL_colors[1],
1072 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1073 PERL_PV_ESCAPE_FIRSTCHAR
1074 )
1075 );
3dab1dad
YO
1076 }
1077 }
1078
1079 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080
1081 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1082 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1083 }
1084
1085 PerlIO_printf( Perl_debug_log, "\n" );
1086
1087 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088
1089 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1090 (int)depth * 2 + 2,"",
1091 (UV)TRIE_NODENUM( state ) );
1092
1093 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1094 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095 if (v)
1096 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097 else
1098 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1099 }
1100 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1101 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102 } else {
1103 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1104 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1105 }
1106 }
07be1b83 1107}
3dab1dad
YO
1108
1109#endif
1110
2e64971a 1111
786e8c11
YO
1112/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1113 startbranch: the first branch in the whole branch sequence
1114 first : start branch of sequence of branch-exact nodes.
1115 May be the same as startbranch
1116 last : Thing following the last branch.
1117 May be the same as tail.
1118 tail : item following the branch sequence
1119 count : words in the sequence
1120 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1121 depth : indent depth
3dab1dad 1122
786e8c11 1123Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1124
786e8c11
YO
1125A trie is an N'ary tree where the branches are determined by digital
1126decomposition of the key. IE, at the root node you look up the 1st character and
1127follow that branch repeat until you find the end of the branches. Nodes can be
1128marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1129
786e8c11 1130 /he|she|his|hers/
72f13be8 1131
786e8c11
YO
1132would convert into the following structure. Numbers represent states, letters
1133following numbers represent valid transitions on the letter from that state, if
1134the number is in square brackets it represents an accepting state, otherwise it
1135will be in parenthesis.
07be1b83 1136
786e8c11
YO
1137 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1138 | |
1139 | (2)
1140 | |
1141 (1) +-i->(6)-+-s->[7]
1142 |
1143 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1144
786e8c11
YO
1145 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146
1147This shows that when matching against the string 'hers' we will begin at state 1
1148read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1149then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1150is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1151single traverse. We store a mapping from accepting to state to which word was
1152matched, and then when we have multiple possibilities we try to complete the
1153rest of the regex in the order in which they occured in the alternation.
1154
1155The only prior NFA like behaviour that would be changed by the TRIE support is
1156the silent ignoring of duplicate alternations which are of the form:
1157
1158 / (DUPE|DUPE) X? (?{ ... }) Y /x
1159
4b714af6 1160Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1161and without the optimisation. With the optimisations dupes will be silently
486ec47a 1162ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1163the following demonstrates:
1164
1165 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166
1167which prints out 'word' three times, but
1168
1169 'words'=~/(word|word|word)(?{ print $1 })S/
1170
1171which doesnt print it out at all. This is due to other optimisations kicking in.
1172
1173Example of what happens on a structural level:
1174
486ec47a 1175The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1176
1177 1: CURLYM[1] {1,32767}(18)
1178 5: BRANCH(8)
1179 6: EXACT <ac>(16)
1180 8: BRANCH(11)
1181 9: EXACT <ad>(16)
1182 11: BRANCH(14)
1183 12: EXACT <ab>(16)
1184 16: SUCCEED(0)
1185 17: NOTHING(18)
1186 18: END(0)
1187
1188This would be optimizable with startbranch=5, first=5, last=16, tail=16
1189and should turn into:
1190
1191 1: CURLYM[1] {1,32767}(18)
1192 5: TRIE(16)
1193 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1194 <ac>
1195 <ad>
1196 <ab>
1197 16: SUCCEED(0)
1198 17: NOTHING(18)
1199 18: END(0)
1200
1201Cases where tail != last would be like /(?foo|bar)baz/:
1202
1203 1: BRANCH(4)
1204 2: EXACT <foo>(8)
1205 4: BRANCH(7)
1206 5: EXACT <bar>(8)
1207 7: TAIL(8)
1208 8: EXACT <baz>(10)
1209 10: END(0)
1210
1211which would be optimizable with startbranch=1, first=1, last=7, tail=8
1212and would end up looking like:
1213
1214 1: TRIE(8)
1215 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1216 <foo>
1217 <bar>
1218 7: TAIL(8)
1219 8: EXACT <baz>(10)
1220 10: END(0)
1221
1222 d = uvuni_to_utf8_flags(d, uv, 0);
1223
1224is the recommended Unicode-aware way of saying
1225
1226 *(d++) = uv;
1227*/
1228
1e2e3d02 1229#define TRIE_STORE_REVCHAR \
786e8c11 1230 STMT_START { \
73031816
NC
1231 if (UTF) { \
1232 SV *zlopp = newSV(2); \
88c9ea1e
CB
1233 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1234 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1235 SvCUR_set(zlopp, kapow - flrbbbbb); \
1236 SvPOK_on(zlopp); \
1237 SvUTF8_on(zlopp); \
1238 av_push(revcharmap, zlopp); \
1239 } else { \
6bdeddd2 1240 char ooooff = (char)uvc; \
73031816
NC
1241 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1242 } \
1243 } STMT_END
786e8c11
YO
1244
1245#define TRIE_READ_CHAR STMT_START { \
1246 wordlen++; \
1247 if ( UTF ) { \
1248 if ( folder ) { \
1249 if ( foldlen > 0 ) { \
1250 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1251 foldlen -= len; \
1252 scan += len; \
1253 len = 0; \
1254 } else { \
1255 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1256 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1257 foldlen -= UNISKIP( uvc ); \
1258 scan = foldbuf + UNISKIP( uvc ); \
1259 } \
1260 } else { \
1261 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1262 } \
1263 } else { \
1264 uvc = (U32)*uc; \
1265 len = 1; \
1266 } \
1267} STMT_END
1268
1269
1270
1271#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1272 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1273 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1274 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1275 } \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1277 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1278 TRIE_LIST_CUR( state )++; \
1279} STMT_END
07be1b83 1280
786e8c11
YO
1281#define TRIE_LIST_NEW(state) STMT_START { \
1282 Newxz( trie->states[ state ].trans.list, \
1283 4, reg_trie_trans_le ); \
1284 TRIE_LIST_CUR( state ) = 1; \
1285 TRIE_LIST_LEN( state ) = 4; \
1286} STMT_END
07be1b83 1287
786e8c11
YO
1288#define TRIE_HANDLE_WORD(state) STMT_START { \
1289 U16 dupe= trie->states[ state ].wordnum; \
1290 regnode * const noper_next = regnext( noper ); \
1291 \
786e8c11
YO
1292 DEBUG_r({ \
1293 /* store the word for dumping */ \
1294 SV* tmp; \
1295 if (OP(noper) != NOTHING) \
740cce10 1296 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1297 else \
740cce10 1298 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1299 av_push( trie_words, tmp ); \
786e8c11
YO
1300 }); \
1301 \
1302 curword++; \
2e64971a
DM
1303 trie->wordinfo[curword].prev = 0; \
1304 trie->wordinfo[curword].len = wordlen; \
1305 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1306 \
1307 if ( noper_next < tail ) { \
1308 if (!trie->jump) \
c944940b 1309 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1310 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1311 if (!jumper) \
1312 jumper = noper_next; \
1313 if (!nextbranch) \
1314 nextbranch= regnext(cur); \
1315 } \
1316 \
1317 if ( dupe ) { \
2e64971a
DM
1318 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1319 /* chain, so that when the bits of chain are later */\
1320 /* linked together, the dups appear in the chain */\
1321 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1322 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1323 } else { \
1324 /* we haven't inserted this word yet. */ \
1325 trie->states[ state ].wordnum = curword; \
1326 } \
1327} STMT_END
07be1b83 1328
3dab1dad 1329
786e8c11
YO
1330#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1331 ( ( base + charid >= ucharcount \
1332 && base + charid < ubound \
1333 && state == trie->trans[ base - ucharcount + charid ].check \
1334 && trie->trans[ base - ucharcount + charid ].next ) \
1335 ? trie->trans[ base - ucharcount + charid ].next \
1336 : ( state==1 ? special : 0 ) \
1337 )
3dab1dad 1338
786e8c11
YO
1339#define MADE_TRIE 1
1340#define MADE_JUMP_TRIE 2
1341#define MADE_EXACT_TRIE 4
3dab1dad 1342
a3621e74 1343STATIC I32
786e8c11 1344S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1345{
27da23d5 1346 dVAR;
a3621e74
YO
1347 /* first pass, loop through and scan words */
1348 reg_trie_data *trie;
55eed653 1349 HV *widecharmap = NULL;
2b8b4781 1350 AV *revcharmap = newAV();
a3621e74 1351 regnode *cur;
9f7f3913 1352 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1353 STRLEN len = 0;
1354 UV uvc = 0;
1355 U16 curword = 0;
1356 U32 next_alloc = 0;
786e8c11
YO
1357 regnode *jumper = NULL;
1358 regnode *nextbranch = NULL;
7f69552c 1359 regnode *convert = NULL;
2e64971a 1360 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1361 /* we just use folder as a flag in utf8 */
1e696034 1362 const U8 * folder = NULL;
a3621e74 1363
2b8b4781
NC
1364#ifdef DEBUGGING
1365 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1366 AV *trie_words = NULL;
1367 /* along with revcharmap, this only used during construction but both are
1368 * useful during debugging so we store them in the struct when debugging.
8e11feef 1369 */
2b8b4781
NC
1370#else
1371 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1372 STRLEN trie_charcount=0;
3dab1dad 1373#endif
2b8b4781 1374 SV *re_trie_maxbuff;
a3621e74 1375 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1376
1377 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1378#ifndef DEBUGGING
1379 PERL_UNUSED_ARG(depth);
1380#endif
a3621e74 1381
1e696034
KW
1382 switch (flags) {
1383 case EXACTFU: folder = PL_fold_latin1; break;
1384 case EXACTF: folder = PL_fold; break;
1385 case EXACTFL: folder = PL_fold_locale; break;
1386 }
1387
c944940b 1388 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1389 trie->refcount = 1;
3dab1dad 1390 trie->startstate = 1;
786e8c11 1391 trie->wordcount = word_count;
f8fc2ecf 1392 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1393 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1394 if (!(UTF && folder))
c944940b 1395 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1396 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1397 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1398
a3621e74 1399 DEBUG_r({
2b8b4781 1400 trie_words = newAV();
a3621e74 1401 });
a3621e74 1402
0111c4fd 1403 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1404 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1405 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1406 }
3dab1dad
YO
1407 DEBUG_OPTIMISE_r({
1408 PerlIO_printf( Perl_debug_log,
786e8c11 1409 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1410 (int)depth * 2 + 2, "",
1411 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1412 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1413 (int)depth);
3dab1dad 1414 });
7f69552c
YO
1415
1416 /* Find the node we are going to overwrite */
1417 if ( first == startbranch && OP( last ) != BRANCH ) {
1418 /* whole branch chain */
1419 convert = first;
1420 } else {
1421 /* branch sub-chain */
1422 convert = NEXTOPER( first );
1423 }
1424
a3621e74
YO
1425 /* -- First loop and Setup --
1426
1427 We first traverse the branches and scan each word to determine if it
1428 contains widechars, and how many unique chars there are, this is
1429 important as we have to build a table with at least as many columns as we
1430 have unique chars.
1431
1432 We use an array of integers to represent the character codes 0..255
38a44b82 1433 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1434 native representation of the character value as the key and IV's for the
1435 coded index.
1436
1437 *TODO* If we keep track of how many times each character is used we can
1438 remap the columns so that the table compression later on is more
3b753521 1439 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1440 middle and the least common are on the outside. IMO this would be better
1441 than a most to least common mapping as theres a decent chance the most
1442 common letter will share a node with the least common, meaning the node
486ec47a 1443 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1444 case is when we have the least common nodes twice.
1445
1446 */
1447
a3621e74 1448 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1449 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1450 const U8 *uc = (U8*)STRING( noper );
a28509cc 1451 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1452 STRLEN foldlen = 0;
1453 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1454 const U8 *scan = (U8*)NULL;
07be1b83 1455 U32 wordlen = 0; /* required init */
02daf0ab
YO
1456 STRLEN chars = 0;
1457 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1458
3dab1dad
YO
1459 if (OP(noper) == NOTHING) {
1460 trie->minlen= 0;
1461 continue;
1462 }
02daf0ab
YO
1463 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1464 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1465 regardless of encoding */
1466
a3621e74 1467 for ( ; uc < e ; uc += len ) {
3dab1dad 1468 TRIE_CHARCOUNT(trie)++;
a3621e74 1469 TRIE_READ_CHAR;
3dab1dad 1470 chars++;
a3621e74
YO
1471 if ( uvc < 256 ) {
1472 if ( !trie->charmap[ uvc ] ) {
1473 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474 if ( folder )
1475 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1476 TRIE_STORE_REVCHAR;
a3621e74 1477 }
02daf0ab 1478 if ( set_bit ) {
62012aee
KW
1479 /* store the codepoint in the bitmap, and its folded
1480 * equivalent. */
02daf0ab 1481 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1482
1483 /* store the folded codepoint */
1484 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1485
1486 if ( !UTF ) {
1487 /* store first byte of utf8 representation of
acdf4139
KW
1488 variant codepoints */
1489 if (! UNI_IS_INVARIANT(uvc)) {
1490 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1491 }
1492 }
02daf0ab
YO
1493 set_bit = 0; /* We've done our bit :-) */
1494 }
a3621e74
YO
1495 } else {
1496 SV** svpp;
55eed653
NC
1497 if ( !widecharmap )
1498 widecharmap = newHV();
a3621e74 1499
55eed653 1500 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1501
1502 if ( !svpp )
e4584336 1503 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1504
1505 if ( !SvTRUE( *svpp ) ) {
1506 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1507 TRIE_STORE_REVCHAR;
a3621e74
YO
1508 }
1509 }
1510 }
3dab1dad
YO
1511 if( cur == first ) {
1512 trie->minlen=chars;
1513 trie->maxlen=chars;
1514 } else if (chars < trie->minlen) {
1515 trie->minlen=chars;
1516 } else if (chars > trie->maxlen) {
1517 trie->maxlen=chars;
1518 }
1519
a3621e74
YO
1520 } /* end first pass */
1521 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1522 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1523 (int)depth * 2 + 2,"",
55eed653 1524 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1525 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1526 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1527 );
a3621e74
YO
1528
1529 /*
1530 We now know what we are dealing with in terms of unique chars and
1531 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1532 representation using a flat table will take. If it's over a reasonable
1533 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1534 conservative but potentially much slower representation using an array
1535 of lists.
1536
1537 At the end we convert both representations into the same compressed
1538 form that will be used in regexec.c for matching with. The latter
1539 is a form that cannot be used to construct with but has memory
1540 properties similar to the list form and access properties similar
1541 to the table form making it both suitable for fast searches and
1542 small enough that its feasable to store for the duration of a program.
1543
1544 See the comment in the code where the compressed table is produced
1545 inplace from the flat tabe representation for an explanation of how
1546 the compression works.
1547
1548 */
1549
1550
2e64971a
DM
1551 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1552 prev_states[1] = 0;
1553
3dab1dad 1554 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1555 /*
1556 Second Pass -- Array Of Lists Representation
1557
1558 Each state will be represented by a list of charid:state records
1559 (reg_trie_trans_le) the first such element holds the CUR and LEN
1560 points of the allocated array. (See defines above).
1561
1562 We build the initial structure using the lists, and then convert
1563 it into the compressed table form which allows faster lookups
1564 (but cant be modified once converted).
a3621e74
YO
1565 */
1566
a3621e74
YO
1567 STRLEN transcount = 1;
1568
1e2e3d02
YO
1569 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1570 "%*sCompiling trie using list compiler\n",
1571 (int)depth * 2 + 2, ""));
446bd890 1572
c944940b
JH
1573 trie->states = (reg_trie_state *)
1574 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1575 sizeof(reg_trie_state) );
a3621e74
YO
1576 TRIE_LIST_NEW(1);
1577 next_alloc = 2;
1578
1579 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1580
c445ea15
AL
1581 regnode * const noper = NEXTOPER( cur );
1582 U8 *uc = (U8*)STRING( noper );
1583 const U8 * const e = uc + STR_LEN( noper );
1584 U32 state = 1; /* required init */
1585 U16 charid = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1587 STRLEN foldlen = 0; /* required init */
07be1b83 1588 U32 wordlen = 0; /* required init */
c445ea15
AL
1589 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590
3dab1dad 1591 if (OP(noper) != NOTHING) {
786e8c11 1592 for ( ; uc < e ; uc += len ) {
c445ea15 1593
786e8c11 1594 TRIE_READ_CHAR;
c445ea15 1595
786e8c11
YO
1596 if ( uvc < 256 ) {
1597 charid = trie->charmap[ uvc ];
c445ea15 1598 } else {
55eed653 1599 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1600 if ( !svpp ) {
1601 charid = 0;
1602 } else {
1603 charid=(U16)SvIV( *svpp );
1604 }
c445ea15 1605 }
786e8c11
YO
1606 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1607 if ( charid ) {
a3621e74 1608
786e8c11
YO
1609 U16 check;
1610 U32 newstate = 0;
a3621e74 1611
786e8c11
YO
1612 charid--;
1613 if ( !trie->states[ state ].trans.list ) {
1614 TRIE_LIST_NEW( state );
c445ea15 1615 }
786e8c11
YO
1616 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1617 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1618 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1619 break;
1620 }
1621 }
1622 if ( ! newstate ) {
1623 newstate = next_alloc++;
2e64971a 1624 prev_states[newstate] = state;
786e8c11
YO
1625 TRIE_LIST_PUSH( state, charid, newstate );
1626 transcount++;
1627 }
1628 state = newstate;
1629 } else {
1630 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1631 }
a28509cc 1632 }
c445ea15 1633 }
3dab1dad 1634 TRIE_HANDLE_WORD(state);
a3621e74
YO
1635
1636 } /* end second pass */
1637
1e2e3d02
YO
1638 /* next alloc is the NEXT state to be allocated */
1639 trie->statecount = next_alloc;
c944940b
JH
1640 trie->states = (reg_trie_state *)
1641 PerlMemShared_realloc( trie->states,
1642 next_alloc
1643 * sizeof(reg_trie_state) );
a3621e74 1644
3dab1dad 1645 /* and now dump it out before we compress it */
2b8b4781
NC
1646 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1647 revcharmap, next_alloc,
1648 depth+1)
1e2e3d02 1649 );
a3621e74 1650
c944940b
JH
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1653 {
1654 U32 state;
a3621e74
YO
1655 U32 tp = 0;
1656 U32 zp = 0;
1657
1658
1659 for( state=1 ; state < next_alloc ; state ++ ) {
1660 U32 base=0;
1661
1662 /*
1663 DEBUG_TRIE_COMPILE_MORE_r(
1664 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1665 );
1666 */
1667
1668 if (trie->states[state].trans.list) {
1669 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1670 U16 maxid=minid;
a28509cc 1671 U16 idx;
a3621e74
YO
1672
1673 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1674 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1675 if ( forid < minid ) {
1676 minid=forid;
1677 } else if ( forid > maxid ) {
1678 maxid=forid;
1679 }
a3621e74
YO
1680 }
1681 if ( transcount < tp + maxid - minid + 1) {
1682 transcount *= 2;
c944940b
JH
1683 trie->trans = (reg_trie_trans *)
1684 PerlMemShared_realloc( trie->trans,
446bd890
NC
1685 transcount
1686 * sizeof(reg_trie_trans) );
a3621e74
YO
1687 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1688 }
1689 base = trie->uniquecharcount + tp - minid;
1690 if ( maxid == minid ) {
1691 U32 set = 0;
1692 for ( ; zp < tp ; zp++ ) {
1693 if ( ! trie->trans[ zp ].next ) {
1694 base = trie->uniquecharcount + zp - minid;
1695 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1696 trie->trans[ zp ].check = state;
1697 set = 1;
1698 break;
1699 }
1700 }
1701 if ( !set ) {
1702 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703 trie->trans[ tp ].check = state;
1704 tp++;
1705 zp = tp;
1706 }
1707 } else {
1708 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1709 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1710 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1711 trie->trans[ tid ].check = state;
1712 }
1713 tp += ( maxid - minid + 1 );
1714 }
1715 Safefree(trie->states[ state ].trans.list);
1716 }
1717 /*
1718 DEBUG_TRIE_COMPILE_MORE_r(
1719 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1720 );
1721 */
1722 trie->states[ state ].trans.base=base;
1723 }
cc601c31 1724 trie->lasttrans = tp + 1;
a3621e74
YO
1725 }
1726 } else {
1727 /*
1728 Second Pass -- Flat Table Representation.
1729
1730 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1731 We know that we will need Charcount+1 trans at most to store the data
1732 (one row per char at worst case) So we preallocate both structures
1733 assuming worst case.
1734
1735 We then construct the trie using only the .next slots of the entry
1736 structs.
1737
3b753521 1738 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1739 make compression both faster and easier by keeping track of how many non
1740 zero fields are in the node.
1741
1742 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1743 transition.
1744
1745 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1746 number representing the first entry of the node, and state as a
1747 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1748 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1749 are 2 entrys per node. eg:
1750
1751 A B A B
1752 1. 2 4 1. 3 7
1753 2. 0 3 3. 0 5
1754 3. 0 0 5. 0 0
1755 4. 0 0 7. 0 0
1756
1757 The table is internally in the right hand, idx form. However as we also
1758 have to deal with the states array which is indexed by nodenum we have to
1759 use TRIE_NODENUM() to convert.
1760
1761 */
1e2e3d02
YO
1762 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1763 "%*sCompiling trie using table compiler\n",
1764 (int)depth * 2 + 2, ""));
3dab1dad 1765
c944940b
JH
1766 trie->trans = (reg_trie_trans *)
1767 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1768 * trie->uniquecharcount + 1,
1769 sizeof(reg_trie_trans) );
1770 trie->states = (reg_trie_state *)
1771 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772 sizeof(reg_trie_state) );
a3621e74
YO
1773 next_alloc = trie->uniquecharcount + 1;
1774
3dab1dad 1775
a3621e74
YO
1776 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777
c445ea15 1778 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1779 const U8 *uc = (U8*)STRING( noper );
1780 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1781
1782 U32 state = 1; /* required init */
1783
1784 U16 charid = 0; /* sanity init */
1785 U32 accept_state = 0; /* sanity init */
1786 U8 *scan = (U8*)NULL; /* sanity init */
1787
1788 STRLEN foldlen = 0; /* required init */
07be1b83 1789 U32 wordlen = 0; /* required init */
a3621e74
YO
1790 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791
3dab1dad 1792 if ( OP(noper) != NOTHING ) {
786e8c11 1793 for ( ; uc < e ; uc += len ) {
a3621e74 1794
786e8c11 1795 TRIE_READ_CHAR;
a3621e74 1796
786e8c11
YO
1797 if ( uvc < 256 ) {
1798 charid = trie->charmap[ uvc ];
1799 } else {
55eed653 1800 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1801 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1802 }
786e8c11
YO
1803 if ( charid ) {
1804 charid--;
1805 if ( !trie->trans[ state + charid ].next ) {
1806 trie->trans[ state + charid ].next = next_alloc;
1807 trie->trans[ state ].check++;
2e64971a
DM
1808 prev_states[TRIE_NODENUM(next_alloc)]
1809 = TRIE_NODENUM(state);
786e8c11
YO
1810 next_alloc += trie->uniquecharcount;
1811 }
1812 state = trie->trans[ state + charid ].next;
1813 } else {
1814 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1815 }
1816 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1817 }
a3621e74 1818 }
3dab1dad
YO
1819 accept_state = TRIE_NODENUM( state );
1820 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1821
1822 } /* end second pass */
1823
3dab1dad 1824 /* and now dump it out before we compress it */
2b8b4781
NC
1825 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1826 revcharmap,
1827 next_alloc, depth+1));
a3621e74 1828
a3621e74
YO
1829 {
1830 /*
1831 * Inplace compress the table.*
1832
1833 For sparse data sets the table constructed by the trie algorithm will
1834 be mostly 0/FAIL transitions or to put it another way mostly empty.
1835 (Note that leaf nodes will not contain any transitions.)
1836
1837 This algorithm compresses the tables by eliminating most such
1838 transitions, at the cost of a modest bit of extra work during lookup:
1839
1840 - Each states[] entry contains a .base field which indicates the
1841 index in the state[] array wheres its transition data is stored.
1842
3b753521 1843 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1844
1845 - If .base is nonzero then charid is added to it to find an entry in
1846 the trans array.
1847
1848 -If trans[states[state].base+charid].check!=state then the
1849 transition is taken to be a 0/Fail transition. Thus if there are fail
1850 transitions at the front of the node then the .base offset will point
1851 somewhere inside the previous nodes data (or maybe even into a node
1852 even earlier), but the .check field determines if the transition is
1853 valid.
1854
786e8c11 1855 XXX - wrong maybe?
a3621e74 1856 The following process inplace converts the table to the compressed
3b753521 1857 table: We first do not compress the root node 1,and mark all its
a3621e74 1858 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1859 allows us to do a DFA construction from the compressed table later,
1860 and ensures that any .base pointers we calculate later are greater
1861 than 0.
a3621e74
YO
1862
1863 - We set 'pos' to indicate the first entry of the second node.
1864
1865 - We then iterate over the columns of the node, finding the first and
1866 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1867 and set the .check pointers accordingly, and advance pos
1868 appropriately and repreat for the next node. Note that when we copy
1869 the next pointers we have to convert them from the original
1870 NODEIDX form to NODENUM form as the former is not valid post
1871 compression.
1872
1873 - If a node has no transitions used we mark its base as 0 and do not
1874 advance the pos pointer.
1875
1876 - If a node only has one transition we use a second pointer into the
1877 structure to fill in allocated fail transitions from other states.
1878 This pointer is independent of the main pointer and scans forward
1879 looking for null transitions that are allocated to a state. When it
1880 finds one it writes the single transition into the "hole". If the
786e8c11 1881 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1882
1883 - Once compressed we can Renew/realloc the structures to release the
1884 excess space.
1885
1886 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1887 specifically Fig 3.47 and the associated pseudocode.
1888
1889 demq
1890 */
a3b680e6 1891 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1892 U32 state, charid;
a3621e74 1893 U32 pos = 0, zp=0;
1e2e3d02 1894 trie->statecount = laststate;
a3621e74
YO
1895
1896 for ( state = 1 ; state < laststate ; state++ ) {
1897 U8 flag = 0;
a28509cc
AL
1898 const U32 stateidx = TRIE_NODEIDX( state );
1899 const U32 o_used = trie->trans[ stateidx ].check;
1900 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1901 trie->trans[ stateidx ].check = 0;
1902
1903 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1904 if ( flag || trie->trans[ stateidx + charid ].next ) {
1905 if ( trie->trans[ stateidx + charid ].next ) {
1906 if (o_used == 1) {
1907 for ( ; zp < pos ; zp++ ) {
1908 if ( ! trie->trans[ zp ].next ) {
1909 break;
1910 }
1911 }
1912 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1913 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1914 trie->trans[ zp ].check = state;
1915 if ( ++zp > pos ) pos = zp;
1916 break;
1917 }
1918 used--;
1919 }
1920 if ( !flag ) {
1921 flag = 1;
1922 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1923 }
1924 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1925 trie->trans[ pos ].check = state;
1926 pos++;
1927 }
1928 }
1929 }
cc601c31 1930 trie->lasttrans = pos + 1;
c944940b
JH
1931 trie->states = (reg_trie_state *)
1932 PerlMemShared_realloc( trie->states, laststate
1933 * sizeof(reg_trie_state) );
a3621e74 1934 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1935 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1936 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1937 (int)depth * 2 + 2,"",
1938 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1939 (IV)next_alloc,
1940 (IV)pos,
a3621e74
YO
1941 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1942 );
1943
1944 } /* end table compress */
1945 }
1e2e3d02
YO
1946 DEBUG_TRIE_COMPILE_MORE_r(
1947 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1948 (int)depth * 2 + 2, "",
1949 (UV)trie->statecount,
1950 (UV)trie->lasttrans)
1951 );
cc601c31 1952 /* resize the trans array to remove unused space */
c944940b
JH
1953 trie->trans = (reg_trie_trans *)
1954 PerlMemShared_realloc( trie->trans, trie->lasttrans
1955 * sizeof(reg_trie_trans) );
a3621e74 1956
3b753521 1957 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1958 U8 nodetype =(U8)(flags & 0xFF);
1959 char *str=NULL;
786e8c11 1960
07be1b83 1961#ifdef DEBUGGING
e62cc96a 1962 regnode *optimize = NULL;
7122b237
YO
1963#ifdef RE_TRACK_PATTERN_OFFSETS
1964
b57a0404
JH
1965 U32 mjd_offset = 0;
1966 U32 mjd_nodelen = 0;
7122b237
YO
1967#endif /* RE_TRACK_PATTERN_OFFSETS */
1968#endif /* DEBUGGING */
a3621e74 1969 /*
3dab1dad
YO
1970 This means we convert either the first branch or the first Exact,
1971 depending on whether the thing following (in 'last') is a branch
1972 or not and whther first is the startbranch (ie is it a sub part of
1973 the alternation or is it the whole thing.)
3b753521 1974 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1975 the whole branch sequence, including the first.
a3621e74 1976 */
3dab1dad 1977 /* Find the node we are going to overwrite */
7f69552c 1978 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1979 /* branch sub-chain */
3dab1dad 1980 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1981#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1982 DEBUG_r({
1983 mjd_offset= Node_Offset((convert));
1984 mjd_nodelen= Node_Length((convert));
1985 });
7122b237 1986#endif
7f69552c 1987 /* whole branch chain */
7122b237
YO
1988 }
1989#ifdef RE_TRACK_PATTERN_OFFSETS
1990 else {
7f69552c
YO
1991 DEBUG_r({
1992 const regnode *nop = NEXTOPER( convert );
1993 mjd_offset= Node_Offset((nop));
1994 mjd_nodelen= Node_Length((nop));
1995 });
07be1b83
YO
1996 }
1997 DEBUG_OPTIMISE_r(
1998 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1999 (int)depth * 2 + 2, "",
786e8c11 2000 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2001 );
7122b237 2002#endif
3dab1dad
YO
2003 /* But first we check to see if there is a common prefix we can
2004 split out as an EXACT and put in front of the TRIE node. */
2005 trie->startstate= 1;
55eed653 2006 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2007 U32 state;
1e2e3d02 2008 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2009 U32 ofs = 0;
8e11feef
RGS
2010 I32 idx = -1;
2011 U32 count = 0;
2012 const U32 base = trie->states[ state ].trans.base;
a3621e74 2013
3dab1dad 2014 if ( trie->states[state].wordnum )
8e11feef 2015 count = 1;
a3621e74 2016
8e11feef 2017 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2018 if ( ( base + ofs >= trie->uniquecharcount ) &&
2019 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2020 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2021 {
3dab1dad 2022 if ( ++count > 1 ) {
2b8b4781 2023 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2024 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2025 if ( state == 1 ) break;
3dab1dad
YO
2026 if ( count == 2 ) {
2027 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2028 DEBUG_OPTIMISE_r(
8e11feef
RGS
2029 PerlIO_printf(Perl_debug_log,
2030 "%*sNew Start State=%"UVuf" Class: [",
2031 (int)depth * 2 + 2, "",
786e8c11 2032 (UV)state));
be8e71aa 2033 if (idx >= 0) {
2b8b4781 2034 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2035 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2036
3dab1dad 2037 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2038 if ( folder )
2039 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2040 DEBUG_OPTIMISE_r(
f1f66076 2041 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2042 );
8e11feef
RGS
2043 }
2044 }
2045 TRIE_BITMAP_SET(trie,*ch);
2046 if ( folder )
2047 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2048 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2049 }
2050 idx = ofs;
2051 }
3dab1dad
YO
2052 }
2053 if ( count == 1 ) {
2b8b4781 2054 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2055 STRLEN len;
2056 char *ch = SvPV( *tmp, len );
de734bd5
A
2057 DEBUG_OPTIMISE_r({
2058 SV *sv=sv_newmortal();
8e11feef
RGS
2059 PerlIO_printf( Perl_debug_log,
2060 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2061 (int)depth * 2 + 2, "",
de734bd5
A
2062 (UV)state, (UV)idx,
2063 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2064 PL_colors[0], PL_colors[1],
2065 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2066 PERL_PV_ESCAPE_FIRSTCHAR
2067 )
2068 );
2069 });
3dab1dad
YO
2070 if ( state==1 ) {
2071 OP( convert ) = nodetype;
2072 str=STRING(convert);
2073 STR_LEN(convert)=0;
2074 }
c490c714
YO
2075 STR_LEN(convert) += len;
2076 while (len--)
de734bd5 2077 *str++ = *ch++;
8e11feef 2078 } else {
f9049ba1 2079#ifdef DEBUGGING
8e11feef
RGS
2080 if (state>1)
2081 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2082#endif
8e11feef
RGS
2083 break;
2084 }
2085 }
2e64971a 2086 trie->prefixlen = (state-1);
3dab1dad 2087 if (str) {
8e11feef 2088 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2089 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2090 trie->startstate = state;
07be1b83
YO
2091 trie->minlen -= (state - 1);
2092 trie->maxlen -= (state - 1);
33809eae
JH
2093#ifdef DEBUGGING
2094 /* At least the UNICOS C compiler choked on this
2095 * being argument to DEBUG_r(), so let's just have
2096 * it right here. */
2097 if (
2098#ifdef PERL_EXT_RE_BUILD
2099 1
2100#else
2101 DEBUG_r_TEST
2102#endif
2103 ) {
2104 regnode *fix = convert;
2105 U32 word = trie->wordcount;
2106 mjd_nodelen++;
2107 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2108 while( ++fix < n ) {
2109 Set_Node_Offset_Length(fix, 0, 0);
2110 }
2111 while (word--) {
2112 SV ** const tmp = av_fetch( trie_words, word, 0 );
2113 if (tmp) {
2114 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2115 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2116 else
2117 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2118 }
2119 }
2120 }
2121#endif
8e11feef
RGS
2122 if (trie->maxlen) {
2123 convert = n;
2124 } else {
3dab1dad 2125 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2126 DEBUG_r(optimize= n);
3dab1dad
YO
2127 }
2128 }
2129 }
a5ca303d
YO
2130 if (!jumper)
2131 jumper = last;
3dab1dad 2132 if ( trie->maxlen ) {
8e11feef
RGS
2133 NEXT_OFF( convert ) = (U16)(tail - convert);
2134 ARG_SET( convert, data_slot );
786e8c11
YO
2135 /* Store the offset to the first unabsorbed branch in
2136 jump[0], which is otherwise unused by the jump logic.
2137 We use this when dumping a trie and during optimisation. */
2138 if (trie->jump)
7f69552c 2139 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2140
6c48061a
YO
2141 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2142 * and there is a bitmap
2143 * and the first "jump target" node we found leaves enough room
2144 * then convert the TRIE node into a TRIEC node, with the bitmap
2145 * embedded inline in the opcode - this is hypothetically faster.
2146 */
2147 if ( !trie->states[trie->startstate].wordnum
2148 && trie->bitmap
2149 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2150 {
2151 OP( convert ) = TRIEC;
2152 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2153 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2154 trie->bitmap= NULL;
2155 } else
2156 OP( convert ) = TRIE;
a3621e74 2157
3dab1dad
YO
2158 /* store the type in the flags */
2159 convert->flags = nodetype;
a5ca303d
YO
2160 DEBUG_r({
2161 optimize = convert
2162 + NODE_STEP_REGNODE
2163 + regarglen[ OP( convert ) ];
2164 });
2165 /* XXX We really should free up the resource in trie now,
2166 as we won't use them - (which resources?) dmq */
3dab1dad 2167 }
a3621e74 2168 /* needed for dumping*/
e62cc96a 2169 DEBUG_r(if (optimize) {
07be1b83 2170 regnode *opt = convert;
bcdf7404 2171
e62cc96a 2172 while ( ++opt < optimize) {
07be1b83
YO
2173 Set_Node_Offset_Length(opt,0,0);
2174 }
786e8c11
YO
2175 /*
2176 Try to clean up some of the debris left after the
2177 optimisation.
a3621e74 2178 */
786e8c11 2179 while( optimize < jumper ) {
07be1b83 2180 mjd_nodelen += Node_Length((optimize));
a3621e74 2181 OP( optimize ) = OPTIMIZED;
07be1b83 2182 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2183 optimize++;
2184 }
07be1b83 2185 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2186 });
2187 } /* end node insert */
2e64971a
DM
2188
2189 /* Finish populating the prev field of the wordinfo array. Walk back
2190 * from each accept state until we find another accept state, and if
2191 * so, point the first word's .prev field at the second word. If the
2192 * second already has a .prev field set, stop now. This will be the
2193 * case either if we've already processed that word's accept state,
3b753521
FN
2194 * or that state had multiple words, and the overspill words were
2195 * already linked up earlier.
2e64971a
DM
2196 */
2197 {
2198 U16 word;
2199 U32 state;
2200 U16 prev;
2201
2202 for (word=1; word <= trie->wordcount; word++) {
2203 prev = 0;
2204 if (trie->wordinfo[word].prev)
2205 continue;
2206 state = trie->wordinfo[word].accept;
2207 while (state) {
2208 state = prev_states[state];
2209 if (!state)
2210 break;
2211 prev = trie->states[state].wordnum;
2212 if (prev)
2213 break;
2214 }
2215 trie->wordinfo[word].prev = prev;
2216 }
2217 Safefree(prev_states);
2218 }
2219
2220
2221 /* and now dump out the compressed format */
2222 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2223
55eed653 2224 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2225#ifdef DEBUGGING
2226 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2227 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2228#else
2229 SvREFCNT_dec(revcharmap);
07be1b83 2230#endif
786e8c11
YO
2231 return trie->jump
2232 ? MADE_JUMP_TRIE
2233 : trie->startstate>1
2234 ? MADE_EXACT_TRIE
2235 : MADE_TRIE;
2236}
2237
2238STATIC void
2239S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2240{
3b753521 2241/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2242
2243 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2244 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2245 ISBN 0-201-10088-6
2246
2247 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2248 suffix of the current state's 'word' that is also a proper prefix of another word in our
2249 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2250 the DFA not to have to restart after its tried and failed a word at a given point, it
2251 simply continues as though it had been matching the other word in the first place.
2252 Consider
2253 'abcdgu'=~/abcdefg|cdgu/
2254 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2255 fail, which would bring us to the state representing 'd' in the second word where we would
2256 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2257 */
2258 /* add a fail transition */
3251b653
NC
2259 const U32 trie_offset = ARG(source);
2260 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2261 U32 *q;
2262 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2263 const U32 numstates = trie->statecount;
786e8c11
YO
2264 const U32 ubound = trie->lasttrans + ucharcount;
2265 U32 q_read = 0;
2266 U32 q_write = 0;
2267 U32 charid;
2268 U32 base = trie->states[ 1 ].trans.base;
2269 U32 *fail;
2270 reg_ac_data *aho;
2271 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2272 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2273
2274 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2275#ifndef DEBUGGING
2276 PERL_UNUSED_ARG(depth);
2277#endif
2278
2279
2280 ARG_SET( stclass, data_slot );
c944940b 2281 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2282 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2283 aho->trie=trie_offset;
446bd890
NC
2284 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2285 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2286 Newxz( q, numstates, U32);
c944940b 2287 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2288 aho->refcount = 1;
2289 fail = aho->fail;
2290 /* initialize fail[0..1] to be 1 so that we always have
2291 a valid final fail state */
2292 fail[ 0 ] = fail[ 1 ] = 1;
2293
2294 for ( charid = 0; charid < ucharcount ; charid++ ) {
2295 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2296 if ( newstate ) {
2297 q[ q_write ] = newstate;
2298 /* set to point at the root */
2299 fail[ q[ q_write++ ] ]=1;
2300 }
2301 }
2302 while ( q_read < q_write) {
2303 const U32 cur = q[ q_read++ % numstates ];
2304 base = trie->states[ cur ].trans.base;
2305
2306 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2307 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2308 if (ch_state) {
2309 U32 fail_state = cur;
2310 U32 fail_base;
2311 do {
2312 fail_state = fail[ fail_state ];
2313 fail_base = aho->states[ fail_state ].trans.base;
2314 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2315
2316 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2317 fail[ ch_state ] = fail_state;
2318 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2319 {
2320 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2321 }
2322 q[ q_write++ % numstates] = ch_state;
2323 }
2324 }
2325 }
2326 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2327 when we fail in state 1, this allows us to use the
2328 charclass scan to find a valid start char. This is based on the principle
2329 that theres a good chance the string being searched contains lots of stuff
2330 that cant be a start char.
2331 */
2332 fail[ 0 ] = fail[ 1 ] = 0;
2333 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2334 PerlIO_printf(Perl_debug_log,
2335 "%*sStclass Failtable (%"UVuf" states): 0",
2336 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2337 );
786e8c11
YO
2338 for( q_read=1; q_read<numstates; q_read++ ) {
2339 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2340 }
2341 PerlIO_printf(Perl_debug_log, "\n");
2342 });
2343 Safefree(q);
2344 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2345}
2346
786e8c11 2347
a3621e74 2348/*
5d1c421c
JH
2349 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2350 * These need to be revisited when a newer toolchain becomes available.
2351 */
2352#if defined(__sparc64__) && defined(__GNUC__)
2353# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2354# undef SPARC64_GCC_WORKAROUND
2355# define SPARC64_GCC_WORKAROUND 1
2356# endif
2357#endif
2358
07be1b83 2359#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2360 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2361 SV * const mysv=sv_newmortal(); \
2362 regnode *Next = regnext(scan); \
2363 regprop(RExC_rx, mysv, scan); \
7f69552c 2364 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2365 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2366 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2367 }});
07be1b83 2368
1de06328
YO
2369
2370
2371
2372
07be1b83
YO
2373#define JOIN_EXACT(scan,min,flags) \
2374 if (PL_regkind[OP(scan)] == EXACT) \
2375 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2376
be8e71aa 2377STATIC U32
07be1b83
YO
2378S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2379 /* Merge several consecutive EXACTish nodes into one. */
2380 regnode *n = regnext(scan);
2381 U32 stringok = 1;
2382 regnode *next = scan + NODE_SZ_STR(scan);
2383 U32 merged = 0;
2384 U32 stopnow = 0;
2385#ifdef DEBUGGING
2386 regnode *stop = scan;
72f13be8 2387 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2388#else
d47053eb
RGS
2389 PERL_UNUSED_ARG(depth);
2390#endif
7918f24d
NC
2391
2392 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2393#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2394 PERL_UNUSED_ARG(flags);
2395 PERL_UNUSED_ARG(val);
07be1b83 2396#endif
07be1b83
YO
2397 DEBUG_PEEP("join",scan,depth);
2398
2399 /* Skip NOTHING, merge EXACT*. */
2400 while (n &&
2401 ( PL_regkind[OP(n)] == NOTHING ||
2402 (stringok && (OP(n) == OP(scan))))
2403 && NEXT_OFF(n)
2404 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2405
2406 if (OP(n) == TAIL || n > next)
2407 stringok = 0;
2408 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2409 DEBUG_PEEP("skip:",n,depth);
2410 NEXT_OFF(scan) += NEXT_OFF(n);
2411 next = n + NODE_STEP_REGNODE;
2412#ifdef DEBUGGING
2413 if (stringok)
2414 stop = n;
2415#endif
2416 n = regnext(n);
2417 }
2418 else if (stringok) {
786e8c11 2419 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2420 regnode * const nnext = regnext(n);
2421
2422 DEBUG_PEEP("merg",n,depth);
2423
2424 merged++;
2425 if (oldl + STR_LEN(n) > U8_MAX)
2426 break;
2427 NEXT_OFF(scan) += NEXT_OFF(n);
2428 STR_LEN(scan) += STR_LEN(n);
2429 next = n + NODE_SZ_STR(n);
2430 /* Now we can overwrite *n : */
2431 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2432#ifdef DEBUGGING
2433 stop = next - 1;
2434#endif
2435 n = nnext;
2436 if (stopnow) break;
2437 }
2438
d47053eb
RGS
2439#ifdef EXPERIMENTAL_INPLACESCAN
2440 if (flags && !NEXT_OFF(n)) {
2441 DEBUG_PEEP("atch", val, depth);
2442 if (reg_off_by_arg[OP(n)]) {
2443 ARG_SET(n, val - n);
2444 }
2445 else {
2446 NEXT_OFF(n) = val - n;
2447 }
2448 stopnow = 1;
2449 }
07be1b83
YO
2450#endif
2451 }
ced7f090
KW
2452#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2453#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2454#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2455#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2456
2457 if (UTF
2458 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2459 && ( STR_LEN(scan) >= 6 ) )
2460 {
07be1b83
YO
2461 /*
2462 Two problematic code points in Unicode casefolding of EXACT nodes:
2463
2464 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2465 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2466
2467 which casefold to
2468
2469 Unicode UTF-8
2470
2471 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2472 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2473
2474 This means that in case-insensitive matching (or "loose matching",
2475 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2476 length of the above casefolded versions) can match a target string
2477 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2478 This would rather mess up the minimum length computation.
2479
2480 What we'll do is to look for the tail four bytes, and then peek
2481 at the preceding two bytes to see whether we need to decrease
2482 the minimum length by four (six minus two).
2483
2484 Thanks to the design of UTF-8, there cannot be false matches:
2485 A sequence of valid UTF-8 bytes cannot be a subsequence of
2486 another valid sequence of UTF-8 bytes.
2487
2488 */
2489 char * const s0 = STRING(scan), *s, *t;
2490 char * const s1 = s0 + STR_LEN(scan) - 1;
2491 char * const s2 = s1 - 4;
e294cc5d
JH
2492#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2493 const char t0[] = "\xaf\x49\xaf\x42";
2494#else
07be1b83 2495 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2496#endif
07be1b83
YO
2497 const char * const t1 = t0 + 3;
2498
2499 for (s = s0 + 2;
2500 s < s2 && (t = ninstr(s, s1, t0, t1));
2501 s = t + 4) {
e294cc5d
JH
2502#ifdef EBCDIC
2503 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2504 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2505#else
07be1b83
YO
2506 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2507 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2508#endif
07be1b83
YO
2509 *min -= 4;
2510 }
2511 }
2512
2513#ifdef DEBUGGING
2514 /* Allow dumping */
2515 n = scan + NODE_SZ_STR(scan);
2516 while (n <= stop) {
2517 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2518 OP(n) = OPTIMIZED;
2519 NEXT_OFF(n) = 0;
2520 }
2521 n++;
2522 }
2523#endif
2524 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2525 return stopnow;
2526}
2527
486ec47a 2528/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2529 Finds fixed substrings. */
2530
a0288114 2531/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2532 to the position after last scanned or to NULL. */
2533
40d049e4
YO
2534#define INIT_AND_WITHP \
2535 assert(!and_withp); \
2536 Newx(and_withp,1,struct regnode_charclass_class); \
2537 SAVEFREEPV(and_withp)
07be1b83 2538
b515a41d 2539/* this is a chain of data about sub patterns we are processing that
486ec47a 2540 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2541 we can simulate recursion without losing state. */
2542struct scan_frame;
2543typedef struct scan_frame {
2544 regnode *last; /* last node to process in this frame */
2545 regnode *next; /* next node to process when last is reached */
2546 struct scan_frame *prev; /*previous frame*/
2547 I32 stop; /* what stopparen do we use */
2548} scan_frame;
2549
304ee84b
YO
2550
2551#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2552
e1d1eefb
YO
2553#define CASE_SYNST_FNC(nAmE) \
2554case nAmE: \
2555 if (flags & SCF_DO_STCLASS_AND) { \
2556 for (value = 0; value < 256; value++) \
2557 if (!is_ ## nAmE ## _cp(value)) \
2558 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2559 } \
2560 else { \
2561 for (value = 0; value < 256; value++) \
2562 if (is_ ## nAmE ## _cp(value)) \
2563 ANYOF_BITMAP_SET(data->start_class, value); \
2564 } \
2565 break; \
2566case N ## nAmE: \
2567 if (flags & SCF_DO_STCLASS_AND) { \
2568 for (value = 0; value < 256; value++) \
2569 if (is_ ## nAmE ## _cp(value)) \
2570 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2571 } \
2572 else { \
2573 for (value = 0; value < 256; value++) \
2574 if (!is_ ## nAmE ## _cp(value)) \
2575 ANYOF_BITMAP_SET(data->start_class, value); \
2576 } \
2577 break
2578
2579
2580
76e3520e 2581STATIC I32
40d049e4 2582S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2583 I32 *minlenp, I32 *deltap,
40d049e4
YO
2584 regnode *last,
2585 scan_data_t *data,
2586 I32 stopparen,
2587 U8* recursed,
2588 struct regnode_charclass_class *and_withp,
2589 U32 flags, U32 depth)
c277df42
IZ
2590 /* scanp: Start here (read-write). */
2591 /* deltap: Write maxlen-minlen here. */
2592 /* last: Stop before this one. */
40d049e4
YO
2593 /* data: string data about the pattern */
2594 /* stopparen: treat close N as END */
2595 /* recursed: which subroutines have we recursed into */
2596 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2597{
97aff369 2598 dVAR;
c277df42
IZ
2599 I32 min = 0, pars = 0, code;
2600 regnode *scan = *scanp, *next;
2601 I32 delta = 0;
2602 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2603 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2604 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2605 scan_data_t data_fake;
a3621e74 2606 SV *re_trie_maxbuff = NULL;
786e8c11 2607 regnode *first_non_open = scan;
e2e6a0f1 2608 I32 stopmin = I32_MAX;
8aa23a47 2609 scan_frame *frame = NULL;
a3621e74 2610 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2611
7918f24d
NC
2612 PERL_ARGS_ASSERT_STUDY_CHUNK;
2613
13a24bad 2614#ifdef DEBUGGING
40d049e4 2615 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2616#endif
40d049e4 2617
786e8c11 2618 if ( depth == 0 ) {
40d049e4 2619 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2620 first_non_open=regnext(first_non_open);
2621 }
2622
b81d288d 2623
8aa23a47
YO
2624 fake_study_recurse:
2625 while ( scan && OP(scan) != END && scan < last ){
2626 /* Peephole optimizer: */
304ee84b 2627 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2628 DEBUG_PEEP("Peep",scan,depth);
2629 JOIN_EXACT(scan,&min,0);
2630
2631 /* Follow the next-chain of the current node and optimize
2632 away all the NOTHINGs from it. */
2633 if (OP(scan) != CURLYX) {
2634 const int max = (reg_off_by_arg[OP(scan)]
2635 ? I32_MAX
2636 /* I32 may be smaller than U16 on CRAYs! */
2637 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2638 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2639 int noff;
2640 regnode *n = scan;
2641
2642 /* Skip NOTHING and LONGJMP. */
2643 while ((n = regnext(n))
2644 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2645 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2646 && off + noff < max)
2647 off += noff;
2648 if (reg_off_by_arg[OP(scan)])
2649 ARG(scan) = off;
2650 else
2651 NEXT_OFF(scan) = off;
2652 }
a3621e74 2653
c277df42 2654
8aa23a47
YO
2655
2656 /* The principal pseudo-switch. Cannot be a switch, since we
2657 look into several different things. */
2658 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2659 || OP(scan) == IFTHEN) {
2660 next = regnext(scan);
2661 code = OP(scan);
2662 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2663
2664 if (OP(next) == code || code == IFTHEN) {
2665 /* NOTE - There is similar code to this block below for handling
2666 TRIE nodes on a re-study. If you change stuff here check there
2667 too. */
2668 I32 max1 = 0, min1 = I32_MAX, num = 0;
2669 struct regnode_charclass_class accum;
2670 regnode * const startbranch=scan;
2671
2672 if (flags & SCF_DO_SUBSTR)
304ee84b 2673 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2674 if (flags & SCF_DO_STCLASS)
2675 cl_init_zero(pRExC_state, &accum);
2676
2677 while (OP(scan) == code) {
2678 I32 deltanext, minnext, f = 0, fake;
2679 struct regnode_charclass_class this_class;
2680
2681 num++;
2682 data_fake.flags = 0;
2683 if (data) {
2684 data_fake.whilem_c = data->whilem_c;
2685 data_fake.last_closep = data->last_closep;
2686 }
2687 else
2688 data_fake.last_closep = &fake;
58e23c8d
YO
2689
2690 data_fake.pos_delta = delta;
8aa23a47
YO
2691 next = regnext(scan);
2692 scan = NEXTOPER(scan);
2693 if (code != BRANCH)
c277df42 2694 scan = NEXTOPER(scan);
8aa23a47
YO
2695 if (flags & SCF_DO_STCLASS) {
2696 cl_init(pRExC_state, &this_class);
2697 data_fake.start_class = &this_class;
2698 f = SCF_DO_STCLASS_AND;
58e23c8d 2699 }
8aa23a47
YO
2700 if (flags & SCF_WHILEM_VISITED_POS)
2701 f |= SCF_WHILEM_VISITED_POS;
2702
2703 /* we suppose the run is continuous, last=next...*/
2704 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2705 next, &data_fake,
2706 stopparen, recursed, NULL, f,depth+1);
2707 if (min1 > minnext)
2708 min1 = minnext;
2709 if (max1 < minnext + deltanext)
2710 max1 = minnext + deltanext;
2711 if (deltanext == I32_MAX)
2712 is_inf = is_inf_internal = 1;
2713 scan = next;
2714 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2715 pars++;
2716 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2717 if ( stopmin > minnext)
2718 stopmin = min + min1;
2719 flags &= ~SCF_DO_SUBSTR;
2720 if (data)
2721 data->flags |= SCF_SEEN_ACCEPT;
2722 }
2723 if (data) {
2724 if (data_fake.flags & SF_HAS_EVAL)
2725 data->flags |= SF_HAS_EVAL;
2726 data->whilem_c = data_fake.whilem_c;
3dab1dad 2727 }
8aa23a47
YO
2728 if (flags & SCF_DO_STCLASS)
2729 cl_or(pRExC_state, &accum, &this_class);
2730 }
2731 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2732 min1 = 0;
2733 if (flags & SCF_DO_SUBSTR) {
2734 data->pos_min += min1;
2735 data->pos_delta += max1 - min1;
2736 if (max1 != min1 || is_inf)
2737 data->longest = &(data->longest_float);
2738 }
2739 min += min1;
2740 delta += max1 - min1;
2741 if (flags & SCF_DO_STCLASS_OR) {
2742 cl_or(pRExC_state, data->start_class, &accum);
2743 if (min1) {
2744 cl_and(data->start_class, and_withp);
2745 flags &= ~SCF_DO_STCLASS;
653099ff 2746 }
8aa23a47
YO
2747 }
2748 else if (flags & SCF_DO_STCLASS_AND) {
2749 if (min1) {
2750 cl_and(data->start_class, &accum);
2751 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2752 }
8aa23a47
YO
2753 else {
2754 /* Switch to OR mode: cache the old value of
2755 * data->start_class */
2756 INIT_AND_WITHP;
2757 StructCopy(data->start_class, and_withp,
2758 struct regnode_charclass_class);
2759 flags &= ~SCF_DO_STCLASS_AND;
2760 StructCopy(&accum, data->start_class,
2761 struct regnode_charclass_class);
2762 flags |= SCF_DO_STCLASS_OR;
2763 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2764 }
8aa23a47 2765 }
a3621e74 2766
8aa23a47
YO
2767 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2768 /* demq.
a3621e74 2769
8aa23a47
YO
2770 Assuming this was/is a branch we are dealing with: 'scan' now
2771 points at the item that follows the branch sequence, whatever
2772 it is. We now start at the beginning of the sequence and look
2773 for subsequences of
a3621e74 2774
8aa23a47
YO
2775 BRANCH->EXACT=>x1
2776 BRANCH->EXACT=>x2
2777 tail
a3621e74 2778
8aa23a47 2779 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2780
486ec47a 2781 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2782 element into a trie and then add the subsequent branch exact
2783 strings to the trie.
a3621e74 2784
8aa23a47 2785 We have two cases
a3621e74 2786
3b753521 2787 1. patterns where the whole set of branches can be converted.
a3621e74 2788
8aa23a47 2789 2. patterns where only a subset can be converted.
a3621e74 2790
8aa23a47
YO
2791 In case 1 we can replace the whole set with a single regop
2792 for the trie. In case 2 we need to keep the start and end
3b753521 2793 branches so
a3621e74 2794
8aa23a47
YO
2795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2796 becomes BRANCH TRIE; BRANCH X;
786e8c11 2797
8aa23a47
YO
2798 There is an additional case, that being where there is a
2799 common prefix, which gets split out into an EXACT like node
2800 preceding the TRIE node.
a3621e74 2801
8aa23a47
YO
2802 If x(1..n)==tail then we can do a simple trie, if not we make
2803 a "jump" trie, such that when we match the appropriate word
486ec47a 2804 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2805 a nested if into a case structure of sorts.
b515a41d 2806
8aa23a47
YO
2807 */
2808
2809 int made=0;
2810 if (!re_trie_maxbuff) {
2811 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2812 if (!SvIOK(re_trie_maxbuff))
2813 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2814 }
2815 if ( SvIV(re_trie_maxbuff)>=0 ) {
2816 regnode *cur;
2817 regnode *first = (regnode *)NULL;
2818 regnode *last = (regnode *)NULL;
2819 regnode *tail = scan;
2820 U8 optype = 0;
2821 U32 count=0;
a3621e74
YO
2822
2823#ifdef DEBUGGING
8aa23a47 2824 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2825#endif
8aa23a47
YO
2826 /* var tail is used because there may be a TAIL
2827 regop in the way. Ie, the exacts will point to the
2828 thing following the TAIL, but the last branch will
2829 point at the TAIL. So we advance tail. If we
2830 have nested (?:) we may have to move through several
2831 tails.
2832 */
2833
2834 while ( OP( tail ) == TAIL ) {
2835 /* this is the TAIL generated by (?:) */
2836 tail = regnext( tail );
2837 }
a3621e74 2838
8aa23a47
YO
2839
2840 DEBUG_OPTIMISE_r({
2841 regprop(RExC_rx, mysv, tail );
2842 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2843 (int)depth * 2 + 2, "",
2844 "Looking for TRIE'able sequences. Tail node is: ",
2845 SvPV_nolen_const( mysv )
2846 );
2847 });
2848
2849 /*
2850
2851 step through the branches, cur represents each
2852 branch, noper is the first thing to be matched
2853 as part of that branch and noper_next is the
2854 regnext() of that node. if noper is an EXACT
2855 and noper_next is the same as scan (our current
2856 position in the regex) then the EXACT branch is
2857 a possible optimization target. Once we have
486ec47a 2858 two or more consecutive such branches we can
8aa23a47
YO
2859 create a trie of the EXACT's contents and stich
2860 it in place. If the sequence represents all of
2861 the branches we eliminate the whole thing and
2862 replace it with a single TRIE. If it is a
2863 subsequence then we need to stitch it in. This
2864 means the first branch has to remain, and needs
2865 to be repointed at the item on the branch chain
2866 following the last branch optimized. This could
2867 be either a BRANCH, in which case the
2868 subsequence is internal, or it could be the
2869 item following the branch sequence in which
2870 case the subsequence is at the end.
2871
2872 */
2873
2874 /* dont use tail as the end marker for this traverse */
2875 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2876 regnode * const noper = NEXTOPER( cur );
b515a41d 2877#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2878 regnode * const noper_next = regnext( noper );
b515a41d
YO
2879#endif
2880
8aa23a47
YO
2881 DEBUG_OPTIMISE_r({
2882 regprop(RExC_rx, mysv, cur);
2883 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2884 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2885
2886 regprop(RExC_rx, mysv, noper);
2887 PerlIO_printf( Perl_debug_log, " -> %s",
2888 SvPV_nolen_const(mysv));
2889
2890 if ( noper_next ) {
2891 regprop(RExC_rx, mysv, noper_next );
2892 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2893 SvPV_nolen_const(mysv));
2894 }
2895 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2896 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2897 });
2898 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2899 : PL_regkind[ OP( noper ) ] == EXACT )
2900 || OP(noper) == NOTHING )
786e8c11 2901#ifdef NOJUMPTRIE
8aa23a47 2902 && noper_next == tail
786e8c11 2903#endif
8aa23a47
YO
2904 && count < U16_MAX)
2905 {
2906 count++;
2907 if ( !first || optype == NOTHING ) {
2908 if (!first) first = cur;
2909 optype = OP( noper );
2910 } else {
2911 last = cur;
2912 }
2913 } else {
a0a388a1 2914/*
0abd0d78
YO
2915 Currently we do not believe that the trie logic can
2916 handle case insensitive matching properly when the
2917 pattern is not unicode (thus forcing unicode semantics).
2918
2919 If/when this is fixed the following define can be swapped
2920 in below to fully enable trie logic.
2921
a0a388a1 2922#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2923
2924*/
2925#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2926
a0a388a1 2927 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2928 make_trie( pRExC_state,
2929 startbranch, first, cur, tail, count,
2930 optype, depth+1 );
2931 }
2932 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2933#ifdef NOJUMPTRIE
8aa23a47 2934 && noper_next == tail
786e8c11 2935#endif
8aa23a47
YO
2936 ){
2937 count = 1;
2938 first = cur;
2939 optype = OP( noper );
2940 } else {
2941 count = 0;
2942 first = NULL;
2943 optype = 0;
2944 }
2945 last = NULL;
2946 }
2947 }
2948 DEBUG_OPTIMISE_r({
2949 regprop(RExC_rx, mysv, cur);
2950 PerlIO_printf( Perl_debug_log,
2951 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2952 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2953
2954 });
a0a388a1
YO
2955
2956 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2957 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2958#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2959 if ( ((made == MADE_EXACT_TRIE &&
2960 startbranch == first)
2961 || ( first_non_open == first )) &&
2962 depth==0 ) {
2963 flags |= SCF_TRIE_RESTUDY;
2964 if ( startbranch == first
2965 && scan == tail )
2966 {
2967 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2968 }
2969 }
3dab1dad 2970#endif
8aa23a47
YO
2971 }
2972 }
2973
2974 } /* do trie */
2975
653099ff 2976 }
8aa23a47
YO
2977 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2978 scan = NEXTOPER(NEXTOPER(scan));
2979 } else /* single branch is optimized. */
2980 scan = NEXTOPER(scan);
2981 continue;
2982 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2983 scan_frame *newframe = NULL;
2984 I32 paren;
2985 regnode *start;
2986 regnode *end;
2987
2988 if (OP(scan) != SUSPEND) {
2989 /* set the pointer */
2990 if (OP(scan) == GOSUB) {
2991 paren = ARG(scan);
2992 RExC_recurse[ARG2L(scan)] = scan;
2993 start = RExC_open_parens[paren-1];
2994 end = RExC_close_parens[paren-1];
2995 } else {
2996 paren = 0;
f8fc2ecf 2997 start = RExC_rxi->program + 1;
8aa23a47
YO
2998 end = RExC_opend;
2999 }
3000 if (!recursed) {
3001 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3002 SAVEFREEPV(recursed);
3003 }
3004 if (!PAREN_TEST(recursed,paren+1)) {
3005 PAREN_SET(recursed,paren+1);
3006 Newx(newframe,1,scan_frame);
3007 } else {
3008 if (flags & SCF_DO_SUBSTR) {
304ee84b 3009 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3010 data->longest = &(data->longest_float);
3011 }
3012 is_inf = is_inf_internal = 1;
3013 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3014 cl_anything(pRExC_state, data->start_class);
3015 flags &= ~SCF_DO_STCLASS;
3016 }
3017 } else {
3018 Newx(newframe,1,scan_frame);
3019 paren = stopparen;
3020 start = scan+2;
3021 end = regnext(scan);
3022 }
3023 if (newframe) {
3024 assert(start);
3025 assert(end);
3026 SAVEFREEPV(newframe);
3027 newframe->next = regnext(scan);
3028 newframe->last = last;
3029 newframe->stop = stopparen;
3030 newframe->prev = frame;
3031
3032 frame = newframe;
3033 scan = start;
3034 stopparen = paren;
3035 last = end;
3036
3037 continue;
3038 }
3039 }
3040 else if (OP(scan) == EXACT) {
3041 I32 l = STR_LEN(scan);
3042 UV uc;
3043 if (UTF) {
3044 const U8 * const s = (U8*)STRING(scan);
3045 l = utf8_length(s, s + l);
3046 uc = utf8_to_uvchr(s, NULL);
3047 } else {
3048 uc = *((U8*)STRING(scan));
3049 }
3050 min += l;
3051 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3052 /* The code below prefers earlier match for fixed
3053 offset, later match for variable offset. */
3054 if (data->last_end == -1) { /* Update the start info. */
3055 data->last_start_min = data->pos_min;
3056 data->last_start_max = is_inf
3057 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3058 }
8aa23a47
YO
3059 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3060 if (UTF)
3061 SvUTF8_on(data->last_found);
3062 {
3063 SV * const sv = data->last_found;
3064 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3065 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3066 if (mg && mg->mg_len >= 0)
3067 mg->mg_len += utf8_length((U8*)STRING(scan),
3068 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3069 }
8aa23a47
YO
3070 data->last_end = data->pos_min + l;
3071 data->pos_min += l; /* As in the first entry. */
3072 data->flags &= ~SF_BEFORE_EOL;
3073 }
3074 if (flags & SCF_DO_STCLASS_AND) {
3075 /* Check whether it is compatible with what we know already! */
3076 int compat = 1;
3077
54251c2e 3078
486ec47a 3079 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3080 * in the bitmp and either 1) its bit or its fold is set, or 2)
3081 * it's for a locale. Even if there isn't unicode semantics
3082 * here, at runtime there may be because of matching against a
3083 * utf8 string, so accept a possible false positive for
3084 * latin1-range folds */
8aa23a47
YO
3085 if (uc >= 0x100 ||
3086 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3087 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3088 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3089 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3090 )
3091 compat = 0;
3092 ANYOF_CLASS_ZERO(data->start_class);
3093 ANYOF_BITMAP_ZERO(data->start_class);
3094 if (compat)
3095 ANYOF_BITMAP_SET(data->start_class, uc);
3096 data->start_class->flags &= ~ANYOF_EOS;
3097 if (uc < 0x100)
3098 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3099 }
3100 else if (flags & SCF_DO_STCLASS_OR) {
3101 /* false positive possible if the class is case-folded */
3102 if (uc < 0x100)
3103 ANYOF_BITMAP_SET(data->start_class, uc);
3104 else
3105 data->start_class->flags |= ANYOF_UNICODE_ALL;
3106 data->start_class->flags &= ~ANYOF_EOS;
3107 cl_and(data->start_class, and_withp);
3108 }
3109 flags &= ~SCF_DO_STCLASS;
3110 }
3111 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3112 I32 l = STR_LEN(scan);
3113 UV uc = *((U8*)STRING(scan));
3114
3115 /* Search for fixed substrings supports EXACT only. */
3116 if (flags & SCF_DO_SUBSTR) {
3117 assert(data);
304ee84b 3118 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3119 }
3120 if (UTF) {
3121 const U8 * const s = (U8 *)STRING(scan);
3122 l = utf8_length(s, s + l);
3123 uc = utf8_to_uvchr(s, NULL);
3124 }
3125 min += l;
3126 if (flags & SCF_DO_SUBSTR)
3127 data->pos_min += l;
3128 if (flags & SCF_DO_STCLASS_AND) {
3129 /* Check whether it is compatible with what we know already! */
3130 int compat = 1;
8aa23a47 3131 if (uc >= 0x100 ||
54251c2e
KW
3132 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3133 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3134 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3135 {
8aa23a47 3136 compat = 0;
54251c2e 3137 }
8aa23a47
YO
3138 ANYOF_CLASS_ZERO(data->start_class);
3139 ANYOF_BITMAP_ZERO(data->start_class);
3140 if (compat) {
3141 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3142 data->start_class->flags &= ~ANYOF_EOS;
39065660 3143 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3144 if (OP(scan) == EXACTFL) {
8aa23a47 3145 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3146 }
3147 else {
3148
54251c2e
KW
3149 /* Also set the other member of the fold pair. In case
3150 * that unicode semantics is called for at runtime, use
3151 * the full latin1 fold. (Can't do this for locale,
3152 * because not known until runtime */
3153 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3154 }
653099ff 3155 }
8aa23a47
YO
3156 }
3157 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3158 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3159 /* false positive possible if the class is case-folded.
3160 Assume that the locale settings are the same... */
970c8436 3161 if (uc < 0x100) {
1aa99e6b 3162 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3163 if (OP(scan) != EXACTFL) {
3164
3165 /* And set the other member of the fold pair, but
3166 * can't do that in locale because not known until
3167 * run-time */
3168 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3169 PL_fold_latin1[uc]);
970c8436
KW
3170 }
3171 }
653099ff
GS
3172 data->start_class->flags &= ~ANYOF_EOS;
3173 }
8aa23a47 3174 cl_and(data->start_class, and_withp);
653099ff 3175 }
8aa23a47
YO
3176 flags &= ~SCF_DO_STCLASS;
3177 }
e52fc539 3178 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3179 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3180 I32 f = flags, pos_before = 0;
3181 regnode * const oscan = scan;
3182 struct regnode_charclass_class this_class;
3183 struct regnode_charclass_class *oclass = NULL;
3184 I32 next_is_eval = 0;
3185
3186 switch (PL_regkind[OP(scan)]) {
3187 case WHILEM: /* End of (?:...)* . */
3188 scan = NEXTOPER(scan);
3189 goto finish;
3190 case PLUS:
3191 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3192 next = NEXTOPER(scan);
3193 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3194 mincount = 1;
3195 maxcount = REG_INFTY;
3196 next = regnext(scan);
3197 scan = NEXTOPER(scan);
3198 goto do_curly;
3199 }
3200 }
3201 if (flags & SCF_DO_SUBSTR)
3202 data->pos_min++;
3203 min++;
3204 /* Fall through. */
3205 case STAR:
3206 if (flags & SCF_DO_STCLASS) {
3207 mincount = 0;
3208 maxcount = REG_INFTY;
3209 next = regnext(scan);
3210 scan = NEXTOPER(scan);
3211 goto do_curly;
3212 }
3213 is_inf = is_inf_internal = 1;
3214 scan = regnext(scan);
c277df42 3215 if (flags & SCF_DO_SUBSTR) {
304ee84b 3216 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3217 data->longest = &(data->longest_float);
c277df42 3218 }
8aa23a47
YO
3219 goto optimize_curly_tail;
3220 case CURLY:
3221 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3222 && (scan->flags == stopparen))
3223 {
3224 mincount = 1;
3225 maxcount = 1;
3226 } else {
3227 mincount = ARG1(scan);
3228 maxcount = ARG2(scan);
653099ff 3229 }
8aa23a47
YO
3230 next = regnext(scan);
3231 if (OP(scan) == CURLYX) {
3232 I32 lp = (data ? *(data->last_closep) : 0);
3233 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3234 }
8aa23a47
YO
3235 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3236 next_is_eval = (OP(scan) == EVAL);
3237 do_curly:
3238 if (flags & SCF_DO_SUBSTR) {
304ee84b 3239 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3240 pos_before = data->pos_min;
b45f050a 3241 }
8aa23a47
YO
3242 if (data) {
3243 fl = data->flags;
3244 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3245 if (is_inf)
3246 data->flags |= SF_IS_INF;
3247 }
3248 if (flags & SCF_DO_STCLASS) {
3249 cl_init(pRExC_state, &this_class);
3250 oclass = data->start_class;
3251 data->start_class = &this_class;
3252 f |= SCF_DO_STCLASS_AND;
3253 f &= ~SCF_DO_STCLASS_OR;
3254 }
779bcb7d
NC
3255 /* Exclude from super-linear cache processing any {n,m}
3256 regops for which the combination of input pos and regex
3257 pos is not enough information to determine if a match
3258 will be possible.
3259
3260 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3261 regex pos at the \s*, the prospects for a match depend not
3262 only on the input position but also on how many (bar\s*)
3263 repeats into the {4,8} we are. */
3264 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3265 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3266
8aa23a47
YO
3267 /* This will finish on WHILEM, setting scan, or on NULL: */
3268 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3269 last, data, stopparen, recursed, NULL,
3270 (mincount == 0
3271 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3272
8aa23a47
YO
3273 if (flags & SCF_DO_STCLASS)
3274 data->start_class = oclass;
3275 if (mincount == 0 || minnext == 0) {
3276 if (flags & SCF_DO_STCLASS_OR) {
3277 cl_or(pRExC_state, data->start_class, &this_class);
3278 }
3279 else if (flags & SCF_DO_STCLASS_AND) {
3280 /* Switch to OR mode: cache the old value of
3281 * data->start_class */
3282 INIT_AND_WITHP;
3283 StructCopy(data->start_class, and_withp,
3284 struct regnode_charclass_class);
3285 flags &= ~SCF_DO_STCLASS_AND;
3286 StructCopy(&this_class, data->start_class,
3287 struct regnode_charclass_class);
3288 flags |= SCF_DO_STCLASS_OR;
3289 data->start_class->flags |= ANYOF_EOS;
3290 }
3291 } else { /* Non-zero len */
3292 if (flags & SCF_DO_STCLASS_OR) {
3293 cl_or(pRExC_state, data->start_class, &this_class);
3294 cl_and(data->start_class, and_withp);
3295 }
3296 else if (flags & SCF_DO_STCLASS_AND)
3297 cl_and(data->start_class, &this_class);
3298 flags &= ~SCF_DO_STCLASS;
3299 }
3300 if (!scan) /* It was not CURLYX, but CURLY. */
3301 scan = next;
3302 if ( /* ? quantifier ok, except for (?{ ... }) */
3303 (next_is_eval || !(mincount == 0 && maxcount == 1))
3304 && (minnext == 0) && (deltanext == 0)
3305 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3306 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3307 {
668c081a
NC
3308 ckWARNreg(RExC_parse,
3309 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3310 }
3311
3312 min += minnext * mincount;
3313 is_inf_internal |= ((maxcount == REG_INFTY
3314 && (minnext + deltanext) > 0)
3315 || deltanext == I32_MAX);
3316 is_inf |= is_inf_internal;
3317 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3318
3319 /* Try powerful optimization CURLYX => CURLYN. */
3320 if ( OP(oscan) == CURLYX && data
3321 && data->flags & SF_IN_PAR
3322 && !(data->flags & SF_HAS_EVAL)
3323 && !deltanext && minnext == 1 ) {
3324 /* Try to optimize to CURLYN. */
3325 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3326 regnode * const nxt1 = nxt;
497b47a8 3327#ifdef DEBUGGING
8aa23a47 3328 regnode *nxt2;
497b47a8 3329#endif
c277df42 3330
8aa23a47
YO
3331 /* Skip open. */
3332 nxt = regnext(nxt);
e52fc539 3333 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3334 && !(PL_regkind[OP(nxt)] == EXACT
3335 && STR_LEN(nxt) == 1))
3336 goto nogo;
497b47a8 3337#ifdef DEBUGGING
8aa23a47 3338 nxt2 = nxt;
497b47a8 3339#endif
8aa23a47
YO
3340 nxt = regnext(nxt);
3341 if (OP(nxt) != CLOSE)
3342 goto nogo;
3343 if (RExC_open_parens) {
3344 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3345 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3346 }
3347 /* Now we know that nxt2 is the only contents: */
3348 oscan->flags = (U8)ARG(nxt);
3349 OP(oscan) = CURLYN;
3350 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3351
c277df42 3352#ifdef DEBUGGING
8aa23a47 3353 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3354 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3355 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3356 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3357 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3358 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3359#endif
8aa23a47
YO
3360 }
3361 nogo:
3362
3363 /* Try optimization CURLYX => CURLYM. */
3364 if ( OP(oscan) == CURLYX && data
3365 && !(data->flags & SF_HAS_PAR)
3366 && !(data->flags & SF_HAS_EVAL)
3367 && !deltanext /* atom is fixed width */
3368 && minnext != 0 /* CURLYM can't handle zero width */
3369 ) {
3370 /* XXXX How to optimize if data == 0? */
3371 /* Optimize to a simpler form. */
3372 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3373 regnode *nxt2;
3374
3375 OP(oscan) = CURLYM;
3376 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3377 && (OP(nxt2) != WHILEM))
3378 nxt = nxt2;
3379 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3380 /* Need to optimize away parenths. */
b3c0965f 3381 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3382 /* Set the parenth number. */
3383 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3384
8aa23a47
YO
3385 oscan->flags = (U8)ARG(nxt);
3386 if (RExC_open_parens) {
3387 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3388 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3389 }
8aa23a47
YO
3390 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3391 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3392
c277df42 3393#ifdef DEBUGGING
8aa23a47
YO
3394 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3395 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3396 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3397 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3398#endif
c277df42 3399#if 0
8aa23a47
YO
3400 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3401 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3402 if (nnxt == nxt) {
3403 if (reg_off_by_arg[OP(nxt1)])
3404 ARG_SET(nxt1, nxt2 - nxt1);
3405 else if (nxt2 - nxt1 < U16_MAX)
3406 NEXT_OFF(nxt1) = nxt2 - nxt1;
3407 else
3408 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3409 }
8aa23a47 3410 nxt1 = nnxt;
c277df42 3411 }
5d1c421c 3412#endif
8aa23a47
YO
3413 /* Optimize again: */
3414 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3415 NULL, stopparen, recursed, NULL, 0,depth+1);
3416 }
3417 else
3418 oscan->flags = 0;
3419 }
3420 else if ((OP(oscan) == CURLYX)
3421 && (flags & SCF_WHILEM_VISITED_POS)
3422 /* See the comment on a similar expression above.
3b753521 3423 However, this time it's not a subexpression
8aa23a47
YO
3424 we care about, but the expression itself. */
3425 && (maxcount == REG_INFTY)
3426 && data && ++data->whilem_c < 16) {
3427 /* This stays as CURLYX, we can put the count/of pair. */
3428 /* Find WHILEM (as in regexec.c) */
3429 regnode *nxt = oscan + NEXT_OFF(oscan);
3430
3431 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3432 nxt += ARG(nxt);
3433 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3434 | (RExC_whilem_seen << 4)); /* On WHILEM */
3435 }
3436 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3437 pars++;
3438 if (flags & SCF_DO_SUBSTR) {
3439 SV *last_str = NULL;
3440 int counted = mincount != 0;
a0ed51b3 3441
8aa23a47
YO
3442 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3443#if defined(SPARC64_GCC_WORKAROUND)
3444 I32 b = 0;
3445 STRLEN l = 0;
3446 const char *s = NULL;
3447 I32 old = 0;
b515a41d 3448
8aa23a47
YO
3449 if (pos_before >= data->last_start_min)
3450 b = pos_before;
3451 else
3452 b = data->last_start_min;
b515a41d 3453
8aa23a47
YO
3454 l = 0;
3455 s = SvPV_const(data->last_found, l);
3456 old = b - data->last_start_min;
3457
3458#else
3459 I32 b = pos_before >= data->last_start_min
3460 ? pos_before : data->last_start_min;
3461 STRLEN l;
3462 const char * const s = SvPV_const(data->last_found, l);
3463 I32 old = b - data->last_start_min;
3464#endif
3465
3466 if (UTF)
3467 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3468 l -= old;
3469 /* Get the added string: */
740cce10 3470 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3471 if (deltanext == 0 && pos_before == b) {
3472 /* What was added is a constant string */
3473 if (mincount > 1) {
3474 SvGROW(last_str, (mincount * l) + 1);
3475 repeatcpy(SvPVX(last_str) + l,
3476 SvPVX_const(last_str), l, mincount - 1);
3477 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3478 /* Add additional parts. */
3479 SvCUR_set(data->last_found,
3480 SvCUR(data->last_found) - l);
3481 sv_catsv(data->last_found, last_str);
3482 {
3483 SV * sv = data->last_found;
3484 MAGIC *mg =
3485 SvUTF8(sv) && SvMAGICAL(sv) ?
3486 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3487 if (mg && mg->mg_len >= 0)
bd94e887 3488 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3489 }
8aa23a47 3490 data->last_end += l * (mincount - 1);
b515a41d 3491 }
8aa23a47
YO
3492 } else {
3493 /* start offset must point into the last copy */
3494 data->last_start_min += minnext * (mincount - 1);
3495 data->last_start_max += is_inf ? I32_MAX
3496 : (maxcount - 1) * (minnext + data->pos_delta);
3497 }
c277df42 3498 }
8aa23a47
YO
3499 /* It is counted once already... */
3500 data->pos_min += minnext * (mincount - counted);
3501 data->pos_delta += - counted * deltanext +
3502 (minnext + deltanext) * maxcount - minnext * mincount;
3503 if (mincount != maxcount) {
3504 /* Cannot extend fixed substrings found inside
3505 the group. */
304ee84b 3506 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3507 if (mincount && last_str) {
3508 SV * const sv = data->last_found;
3509 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3510 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3511
3512 if (mg)
3513 mg->mg_len = -1;
3514 sv_setsv(sv, last_str);
3515 data->last_end = data->pos_min;
3516 data->last_start_min =
3517 data->pos_min - CHR_SVLEN(last_str);
3518 data->last_start_max = is_inf
3519 ? I32_MAX
3520 : data->pos_min + data->pos_delta
3521 - CHR_SVLEN(last_str);
3522 }
3523 data->longest = &(data->longest_float);
3524 }
3525 SvREFCNT_dec(last_str);
c277df42 3526 }
8aa23a47
YO
3527 if (data && (fl & SF_HAS_EVAL))
3528 data->flags |= SF_HAS_EVAL;
3529 optimize_curly_tail:
3530 if (OP(oscan) != CURLYX) {
3531 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3532 && NEXT_OFF(next))
3533 NEXT_OFF(oscan) += NEXT_OFF(next);
3534 }
3535 continue;
3536 default: /* REF and CLUMP only? */
3537 if (flags & SCF_DO_SUBSTR) {
304ee84b 3538 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3539 data->longest = &(data->longest_float);
3540 }
3541 is_inf = is_inf_internal = 1;
3542 if (flags & SCF_DO_STCLASS_OR)
3543 cl_anything(pRExC_state, data->start_class);
3544 flags &= ~SCF_DO_STCLASS;
3545 break;
c277df42 3546 }
8aa23a47 3547 }
e1d1eefb
YO
3548 else if (OP(scan) == LNBREAK) {
3549 if (flags & SCF_DO_STCLASS) {
3550 int value = 0;
3551 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3552 if (flags & SCF_DO_STCLASS_AND) {
3553 for (value = 0; value < 256; value++)
e64b1bd1 3554 if (!is_VERTWS_cp(value))
b9a59e08
KW
3555 ANYOF_BITMAP_CLEAR(data->start_class, value);
3556 }
3557 else {
e1d1eefb 3558 for (value = 0; value < 256; value++)
e64b1bd1 3559 if (is_VERTWS_cp(value))
b9a59e08
KW
3560 ANYOF_BITMAP_SET(data->start_class, value);
3561 }
e1d1eefb
YO
3562 if (flags & SCF_DO_STCLASS_OR)
3563 cl_and(data->start_class, and_withp);
3564 flags &= ~SCF_DO_STCLASS;
3565 }
3566 min += 1;
f9a79580 3567 delta += 1;
e1d1eefb
YO
3568 if (flags & SCF_DO_SUBSTR) {
3569 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3570 data->pos_min += 1;
f9a79580 3571 data->pos_delta += 1;
e1d1eefb
YO
3572 data->longest = &(data->longest_float);
3573 }
e1d1eefb 3574 }
f9a79580 3575 else if (OP(scan) == FOLDCHAR) {
ced7f090 3576 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3577 flags &= ~SCF_DO_STCLASS;
3578 min += 1;
3579 delta += d;
3580 if (flags & SCF_DO_SUBSTR) {
3581 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3582 data->pos_min += 1;
3583 data->pos_delta += d;
3584 data->longest = &(data->longest_float);
3585 }
3586 }
e52fc539 3587 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3588 int value = 0;
653099ff 3589
8aa23a47 3590 if (flags & SCF_DO_SUBSTR) {
304ee84b 3591 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3592 data->pos_min++;
3593 }
3594 min++;
3595 if (flags & SCF_DO_STCLASS) {
3596 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3597
8aa23a47
YO
3598 /* Some of the logic below assumes that switching
3599 locale on will only add false positives. */
3600 switch (PL_regkind[OP(scan)]) {
3601 case SANY:
3602 default:
3603 do_default:
3604 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3605 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3606 cl_anything(pRExC_state, data->start_class);
3607 break;
3608 case REG_ANY:
3609 if (OP(scan) == SANY)
3610 goto do_default;
3611 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3612 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3613 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3614 cl_anything(pRExC_state, data->start_class);
653099ff 3615 }
8aa23a47
YO
3616 if (flags & SCF_DO_STCLASS_AND || !value)
3617 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3618 break;
3619 case ANYOF:
3620 if (flags & SCF_DO_STCLASS_AND)
3621 cl_and(data->start_class,
3622 (struct regnode_charclass_class*)scan);
653099ff 3623 else
8aa23a47
YO
3624 cl_or(pRExC_state, data->start_class,
3625 (struct regnode_charclass_class*)scan);
3626 break;
3627 case ALNUM:
3628 if (flags & SCF_DO_STCLASS_AND) {
3629 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3630 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3631 if (FLAGS(scan) & USE_UNI) {
3632 for (value = 0; value < 256; value++) {
3633 if (!isWORDCHAR_L1(value)) {
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 }
3636 }
3637 } else {
3638 for (value = 0; value < 256; value++) {
3639 if (!isALNUM(value)) {
3640 ANYOF_BITMAP_CLEAR(data->start_class, value);
3641 }
3642 }
3643 }
8aa23a47 3644 }
653099ff 3645 }
8aa23a47
YO
3646 else {
3647 if (data->start_class->flags & ANYOF_LOCALE)
3648 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3649 else if (FLAGS(scan) & USE_UNI) {
3650 for (value = 0; value < 256; value++) {
3651 if (isWORDCHAR_L1(value)) {
3652 ANYOF_BITMAP_SET(data->start_class, value);
3653 }
3654 }
3655 } else {
3656 for (value = 0; value < 256; value++) {
3657 if (isALNUM(value)) {
3658 ANYOF_BITMAP_SET(data->start_class, value);
3659 }
3660 }
3661 }
8aa23a47
YO
3662 }
3663 break;
3664 case ALNUML:
3665 if (flags & SCF_DO_STCLASS_AND) {
3666 if (data->start_class->flags & ANYOF_LOCALE)
3667 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3668 }
3669 else {
3670 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3671 data->start_class->flags |= ANYOF_LOCALE;
3672 }
3673 break;
3674 case NALNUM:
3675 if (flags & SCF_DO_STCLASS_AND) {
3676 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3677 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3678 if (FLAGS(scan) & USE_UNI) {
3679 for (value = 0; value < 256; value++) {
3680 if (isWORDCHAR_L1(value)) {
3681 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 }
3683 }
3684 } else {
3685 for (value = 0; value < 256; value++) {
3686 if (isALNUM(value)) {
3687 ANYOF_BITMAP_CLEAR(data->start_class, value);
3688 }
3689 }
3690 }
653099ff
GS
3691 }
3692 }
8aa23a47
YO
3693 else {
3694 if (data->start_class->flags & ANYOF_LOCALE)
3695 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696 else {
3697 for (value = 0; value < 256; value++)
3698 if (!isALNUM(value))
b9a59e08 3699 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3700 }
653099ff 3701 }
8aa23a47
YO
3702 break;
3703 case NALNUML:
3704 if (flags & SCF_DO_STCLASS_AND) {
3705 if (data->start_class->flags & ANYOF_LOCALE)
3706 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3707 }
8aa23a47
YO
3708 else {
3709 data->start_class->flags |= ANYOF_LOCALE;
3710 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3711 }
3712 break;
3713 case SPACE:
3714 if (flags & SCF_DO_STCLASS_AND) {
3715 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3716 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3717 if (FLAGS(scan) & USE_UNI) {
3718 for (value = 0; value < 256; value++) {
3719 if (!isSPACE_L1(value)) {
3720 ANYOF_BITMAP_CLEAR(data->start_class, value);
3721 }
3722 }
3723 } else {
3724 for (value = 0; value < 256; value++) {
3725 if (!isSPACE(value)) {
3726 ANYOF_BITMAP_CLEAR(data->start_class, value);
3727 }
3728 }
3729 }
653099ff
GS
3730 }
3731 }
8aa23a47 3732 else {
a12cf05f 3733 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3734 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3735 }
3736 else if (FLAGS(scan) & USE_UNI) {
3737 for (value = 0; value < 256; value++) {
3738 if (isSPACE_L1(value)) {
3739 ANYOF_BITMAP_SET(data->start_class, value);
3740 }
3741 }
3742 } else {
3743 for (value = 0; value < 256; value++) {
3744 if (isSPACE(value)) {
3745 ANYOF_BITMAP_SET(data->start_class, value);
3746 }
3747 }
8aa23a47 3748 }
653099ff 3749 }
8aa23a47
YO
3750 break;
3751 case SPACEL:
3752 if (flags & SCF_DO_STCLASS_AND) {
3753 if (data->start_class->flags & ANYOF_LOCALE)
3754 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3755 }
3756 else {
3757 data->start_class->flags |= ANYOF_LOCALE;
3758 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3759 }
3760 break;
3761 case NSPACE:
3762 if (flags & SCF_DO_STCLASS_AND) {
3763 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3764 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3765 if (FLAGS(scan) & USE_UNI) {
3766 for (value = 0; value < 256; value++) {
3767 if (isSPACE_L1(value)) {
3768 ANYOF_BITMAP_CLEAR(data->start_class, value);
3769 }
3770 }
3771 } else {
3772 for (value = 0; value < 256; value++) {
3773 if (isSPACE(value)) {
3774 ANYOF_BITMAP_CLEAR(data->start_class, value);
3775 }
3776 }
3777 }
653099ff 3778 }
8aa23a47
YO
3779 }
3780 else {
3781 if (data->start_class->flags & ANYOF_LOCALE)
3782 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3783 else if (FLAGS(scan) & USE_UNI) {
3784 for (value = 0; value < 256; value++) {
3785 if (!isSPACE_L1(value)) {
3786 ANYOF_BITMAP_SET(data->start_class, value);
3787 }
3788 }
3789 }
3790 else {
3791 for (value = 0; value < 256; value++) {
3792 if (!isSPACE(value)) {
3793 ANYOF_BITMAP_SET(data->start_class, value);
3794 }
3795 }
3796 }
653099ff 3797 }
8aa23a47
YO
3798 break;
3799 case NSPACEL:
3800 if (flags & SCF_DO_STCLASS_AND) {
3801 if (data->start_class->flags & ANYOF_LOCALE) {
3802 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3803 for (value = 0; value < 256; value++)
3804 if (!isSPACE(value))
3805 ANYOF_BITMAP_CLEAR(data->start_class, value);
3806 }
653099ff 3807 }
8aa23a47
YO
3808 else {
3809 data->start_class->flags |= ANYOF_LOCALE;
3810 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3811 }
3812 break;
3813 case DIGIT:
3814 if (flags & SCF_DO_STCLASS_AND) {
3815 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3816 for (value = 0; value < 256; value++)
3817 if (!isDIGIT(value))
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819 }
3820 else {
3821 if (data->start_class->flags & ANYOF_LOCALE)
3822 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823 else {
3824 for (value = 0; value < 256; value++)
3825 if (isDIGIT(value))
b9a59e08 3826 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3827 }
3828 }
3829 break;
3830 case NDIGIT:
3831 if (flags & SCF_DO_STCLASS_AND) {
3832 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3833 for (value = 0; value < 256; value++)
3834 if (isDIGIT(value))
3835 ANYOF_BITMAP_CLEAR(data->start_class, value);
3836 }
3837 else {
3838 if (data->start_class->flags & ANYOF_LOCALE)
3839 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (!isDIGIT(value))
b9a59e08 3843 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3844 }
3845 }
8aa23a47 3846 break;
e1d1eefb
YO
3847 CASE_SYNST_FNC(VERTWS);
3848 CASE_SYNST_FNC(HORIZWS);
3849
8aa23a47
YO
3850 }
3851 if (flags & SCF_DO_STCLASS_OR)
3852 cl_and(data->start_class, and_withp);
3853 flags &= ~SCF_DO_STCLASS;
3854 }
3855 }
3856 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3857 data->flags |= (OP(scan) == MEOL
3858 ? SF_BEFORE_MEOL
3859 : SF_BEFORE_SEOL);
3860 }
3861 else if ( PL_regkind[OP(scan)] == BRANCHJ
3862 /* Lookbehind, or need to calculate parens/evals/stclass: */
3863 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3864 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3865 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3866 || OP(scan) == UNLESSM )
3867 {
3868 /* Negative Lookahead/lookbehind
3869 In this case we can't do fixed string optimisation.
3870 */
1de06328 3871
8aa23a47
YO
3872 I32 deltanext, minnext, fake = 0;
3873 regnode *nscan;
3874 struct regnode_charclass_class intrnl;
3875 int f = 0;
1de06328 3876
8aa23a47
YO
3877 data_fake.flags = 0;
3878 if (data) {
3879 data_fake.whilem_c = data->whilem_c;
3880 data_fake.last_closep = data->last_closep;
c277df42 3881 }
8aa23a47
YO
3882 else
3883 data_fake.last_closep = &fake;
58e23c8d 3884 data_fake.pos_delta = delta;
8aa23a47
YO
3885 if ( flags & SCF_DO_STCLASS && !scan->flags
3886 && OP(scan) == IFMATCH ) { /* Lookahead */
3887 cl_init(pRExC_state, &intrnl);
3888 data_fake.start_class = &intrnl;
3889 f |= SCF_DO_STCLASS_AND;
3890 }
3891 if (flags & SCF_WHILEM_VISITED_POS)
3892 f |= SCF_WHILEM_VISITED_POS;
3893 next = regnext(scan);
3894 nscan = NEXTOPER(NEXTOPER(scan));
3895 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3896 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3897 if (scan->flags) {
3898 if (deltanext) {
58e23c8d 3899 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3900 }
3901 else if (minnext > (I32)U8_MAX) {
58e23c8d 3902 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3903 }
3904 scan->flags = (U8)minnext;
3905 }
3906 if (data) {
3907 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908 pars++;
3909 if (data_fake.flags & SF_HAS_EVAL)
3910 data->flags |= SF_HAS_EVAL;
3911 data->whilem_c = data_fake.whilem_c;
3912 }
3913 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3914 if (flags & SCF_DO_STCLASS_OR) {
3915 /* OR before, AND after: ideally we would recurse with
3916 * data_fake to get the AND applied by study of the
3917 * remainder of the pattern, and then derecurse;
3918 * *** HACK *** for now just treat as "no information".
3919 * See [perl #56690].
3920 */
3921 cl_init(pRExC_state, data->start_class);
3922 } else {
3923 /* AND before and after: combine and continue */
3924 const int was = (data->start_class->flags & ANYOF_EOS);
3925
3926 cl_and(data->start_class, &intrnl);
3927 if (was)
3928 data->start_class->flags |= ANYOF_EOS;
3929 }
8aa23a47 3930 }
cb434fcc 3931 }
8aa23a47
YO
3932#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933 else {
3934 /* Positive Lookahead/lookbehind
3935 In this case we can do fixed string optimisation,
3936 but we must be careful about it. Note in the case of
3937 lookbehind the positions will be offset by the minimum
3938 length of the pattern, something we won't know about
3939 until after the recurse.
3940 */
3941 I32 deltanext, fake = 0;
3942 regnode *nscan;
3943 struct regnode_charclass_class intrnl;
3944 int f = 0;
3945 /* We use SAVEFREEPV so that when the full compile
3946 is finished perl will clean up the allocated
3b753521 3947 minlens when it's all done. This way we don't
8aa23a47
YO
3948 have to worry about freeing them when we know
3949 they wont be used, which would be a pain.
3950 */
3951 I32 *minnextp;
3952 Newx( minnextp, 1, I32 );
3953 SAVEFREEPV(minnextp);
3954
3955 if (data) {
3956 StructCopy(data, &data_fake, scan_data_t);
3957 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3958 f |= SCF_DO_SUBSTR;
3959 if (scan->flags)
304ee84b 3960 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3961 data_fake.last_found=newSVsv(data->last_found);
3962 }
3963 }
3964 else
3965 data_fake.last_closep = &fake;
3966 data_fake.flags = 0;
58e23c8d 3967 data_fake.pos_delta = delta;
8aa23a47
YO
3968 if (is_inf)
3969 data_fake.flags |= SF_IS_INF;
3970 if ( flags & SCF_DO_STCLASS && !scan->flags
3971 && OP(scan) == IFMATCH ) { /* Lookahead */
3972 cl_init(pRExC_state, &intrnl);
3973 data_fake.start_class = &intrnl;
3974 f |= SCF_DO_STCLASS_AND;
3975 }
3976 if (flags & SCF_WHILEM_VISITED_POS)
3977 f |= SCF_WHILEM_VISITED_POS;
3978 next = regnext(scan);
3979 nscan = NEXTOPER(NEXTOPER(scan));
3980
3981 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3982 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3983 if (scan->flags) {
3984 if (deltanext) {
58e23c8d 3985 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3986 }
3987 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3988 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3989 }
3990 scan->flags = (U8)*minnextp;
3991 }
3992
3993 *minnextp += min;
3994
3995 if (f & SCF_DO_STCLASS_AND) {
3996 const int was = (data->start_class->flags & ANYOF_EOS);
3997
3998 cl_and(data->start_class, &intrnl);
3999 if (was)
4000 data->start_class->flags |= ANYOF_EOS;
4001 }
4002 if (data) {
4003 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004 pars++;
4005 if (data_fake.flags & SF_HAS_EVAL)
4006 data->flags |= SF_HAS_EVAL;
4007 data->whilem_c = data_fake.whilem_c;
4008 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4009 if (RExC_rx->minlen<*minnextp)
4010 RExC_rx->minlen=*minnextp;
304ee84b 4011 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4012 SvREFCNT_dec(data_fake.last_found);
4013
4014 if ( data_fake.minlen_fixed != minlenp )
4015 {
4016 data->offset_fixed= data_fake.offset_fixed;
4017 data->minlen_fixed= data_fake.minlen_fixed;
4018 data->lookbehind_fixed+= scan->flags;
4019 }
4020 if ( data_fake.minlen_float != minlenp )
4021 {
4022 data->minlen_float= data_fake.minlen_float;
4023 data->offset_float_min=data_fake.offset_float_min;
4024 data->offset_float_max=data_fake.offset_float_max;
4025 data->lookbehind_float+= scan->flags;
4026 }
4027 }
4028 }
4029
4030
40d049e4 4031 }
8aa23a47
YO
4032#endif
4033 }
4034 else if (OP(scan) == OPEN) {
4035 if (stopparen != (I32)ARG(scan))
4036 pars++;
4037 }
4038 else if (OP(scan) == CLOSE) {
4039 if (stopparen == (I32)ARG(scan)) {
4040 break;
4041 }
4042 if ((I32)ARG(scan) == is_par) {
4043 next = regnext(scan);
b515a41d 4044
8aa23a47
YO
4045 if ( next && (OP(next) != WHILEM) && next < last)
4046 is_par = 0; /* Disable optimization */
40d049e4 4047 }
8aa23a47
YO
4048 if (data)
4049 *(data->last_closep) = ARG(scan);
4050 }
4051 else if (OP(scan) == EVAL) {
c277df42
IZ
4052 if (data)
4053 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4054 }
4055 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4056 if (flags & SCF_DO_SUBSTR) {
304ee84b 4057 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4058 flags &= ~SCF_DO_SUBSTR;
40d049e4 4059 }
8aa23a47
YO
4060 if (data && OP(scan)==ACCEPT) {
4061 data->flags |= SCF_SEEN_ACCEPT;
4062 if (stopmin > min)
4063 stopmin = min;
e2e6a0f1 4064 }
8aa23a47
YO
4065 }
4066 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067 {
0f5d15d6 4068 if (flags & SCF_DO_SUBSTR) {
304ee84b 4069 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4070 data->longest = &(data->longest_float);
4071 }
4072 is_inf = is_inf_internal = 1;
653099ff 4073 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4074 cl_anything(pRExC_state, data->start_class);
96776eda 4075 flags &= ~SCF_DO_STCLASS;
8aa23a47 4076 }
58e23c8d 4077 else if (OP(scan) == GPOS) {
bbe252da 4078 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4079 !(delta || is_inf || (data && data->pos_delta)))
4080 {
bbe252da
YO
4081 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4082 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4083 if (RExC_rx->gofs < (U32)min)
4084 RExC_rx->gofs = min;
4085 } else {
bbe252da 4086 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4087 RExC_rx->gofs = 0;
4088 }
4089 }
786e8c11 4090#ifdef TRIE_STUDY_OPT
40d049e4 4091#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4092 else if (PL_regkind[OP(scan)] == TRIE) {
4093 /* NOTE - There is similar code to this block above for handling
4094 BRANCH nodes on the initial study. If you change stuff here
4095 check there too. */
4096 regnode *trie_node= scan;
4097 regnode *tail= regnext(scan);
f8fc2ecf 4098 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4099 I32 max1 = 0, min1 = I32_MAX;
4100 struct regnode_charclass_class accum;
4101
4102 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4103 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4104 if (flags & SCF_DO_STCLASS)
4105 cl_init_zero(pRExC_state, &accum);
4106
4107 if (!trie->jump) {
4108 min1= trie->minlen;
4109 max1= trie->maxlen;
4110 } else {
4111 const regnode *nextbranch= NULL;
4112 U32 word;
4113
4114 for ( word=1 ; word <= trie->wordcount ; word++)
4115 {
4116 I32 deltanext=0, minnext=0, f = 0, fake;
4117 struct regnode_charclass_class this_class;
4118
4119 data_fake.flags = 0;
4120 if (data) {
4121 data_fake.whilem_c = data->whilem_c;
4122 data_fake.last_closep = data->last_closep;
4123 }
4124 else
4125 data_fake.last_closep = &fake;
58e23c8d 4126 data_fake.pos_delta = delta;
8aa23a47
YO
4127 if (flags & SCF_DO_STCLASS) {
4128 cl_init(pRExC_state, &this_class);
4129 data_fake.start_class = &this_class;
4130 f = SCF_DO_STCLASS_AND;
4131 }
4132 if (flags & SCF_WHILEM_VISITED_POS)
4133 f |= SCF_WHILEM_VISITED_POS;
4134
4135 if (trie->jump[word]) {
4136 if (!nextbranch)
4137 nextbranch = trie_node + trie->jump[0];
4138 scan= trie_node + trie->jump[word];
4139 /* We go from the jump point to the branch that follows
4140 it. Note this means we need the vestigal unused branches
4141 even though they arent otherwise used.
4142 */
4143 minnext = study_chunk(pRExC_state, &scan, minlenp,
4144 &deltanext, (regnode *)nextbranch, &data_fake,
4145 stopparen, recursed, NULL, f,depth+1);
4146 }
4147 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4148 nextbranch= regnext((regnode*)nextbranch);
4149
4150 if (min1 > (I32)(minnext + trie->minlen))
4151 min1 = minnext + trie->minlen;
4152 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4153 max1 = minnext + deltanext + trie->maxlen;
4154 if (deltanext == I32_MAX)
4155 is_inf = is_inf_internal = 1;
4156
4157 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158 pars++;
4159 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4160 if ( stopmin > min + min1)
4161 stopmin = min + min1;
4162 flags &= ~SCF_DO_SUBSTR;
4163 if (data)
4164 data->flags |= SCF_SEEN_ACCEPT;
4165 }
4166 if (data) {
4167 if (data_fake.flags & SF_HAS_EVAL)
4168 data->flags |= SF_HAS_EVAL;
4169 data->whilem_c = data_fake.whilem_c;
4170 }
4171 if (flags & SCF_DO_STCLASS)
4172 cl_or(pRExC_state, &accum, &this_class);
4173 }
4174 }
4175 if (flags & SCF_DO_SUBSTR) {
4176 data->pos_min += min1;
4177 data->pos_delta += max1 - min1;
4178 if (max1 != min1 || is_inf)
4179 data->longest = &(data->longest_float);
4180 }
4181 min += min1;
4182 delta += max1 - min1;
4183 if (flags & SCF_DO_STCLASS_OR) {
4184 cl_or(pRExC_state, data->start_class, &accum);
4185 if (min1) {
4186 cl_and(data->start_class, and_withp);
4187 flags &= ~SCF_DO_STCLASS;
4188 }
4189 }
4190 else if (flags & SCF_DO_STCLASS_AND) {
4191 if (min1) {
4192 cl_and(data->start_class, &accum);
4193 flags &= ~SCF_DO_STCLASS;
4194 }
4195 else {
4196 /* Switch to OR mode: cache the old value of
4197 * data->start_class */
4198 INIT_AND_WITHP;
4199 StructCopy(data->start_class, and_withp,
4200 struct regnode_charclass_class);
4201 flags &= ~SCF_DO_STCLASS_AND;
4202 StructCopy(&accum, data->start_class,
4203 struct regnode_charclass_class);
4204 flags |= SCF_DO_STCLASS_OR;
4205 data->start_class->flags |= ANYOF_EOS;
4206 }
4207 }
4208 scan= tail;
4209 continue;
4210 }
786e8c11 4211#else
8aa23a47 4212 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4213 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4214 U8*bang=NULL;
4215
4216 min += trie->minlen;
4217 delta += (trie->maxlen - trie->minlen);
4218 flags &= ~SCF_DO_STCLASS; /* xxx */
4219 if (flags & SCF_DO_SUBSTR) {
304ee84b 4220 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4221 data->pos_min += trie->minlen;
4222 data->pos_delta += (trie->maxlen - trie->minlen);
4223 if (trie->maxlen != trie->minlen)
4224 data->longest = &(data->longest_float);
4225 }
4226 if (trie->jump) /* no more substrings -- for now /grr*/
4227 flags &= ~SCF_DO_SUBSTR;
b515a41d 4228 }
8aa23a47
YO
4229#endif /* old or new */
4230#endif /* TRIE_STUDY_OPT */
e1d1eefb 4231
8aa23a47
YO
4232 /* Else: zero-length, ignore. */
4233 scan = regnext(scan);
4234 }
4235 if (frame) {
4236 last = frame->last;
4237 scan = frame->next;
4238 stopparen = frame->stop;
4239 frame = frame->prev;
4240 goto fake_study_recurse;
c277df42
IZ
4241 }
4242
4243 finish:
8aa23a47 4244 assert(!frame);
304ee84b 4245 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4246
c277df42 4247 *scanp = scan;
aca2d497 4248 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4249 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4250 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4251 if (is_par > (I32)U8_MAX)
c277df42
IZ
4252 is_par = 0;
4253 if (is_par && pars==1 && data) {
4254 data->flags |= SF_IN_PAR;
4255 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4256 }
4257 else if (pars && data) {
c277df42
IZ
4258 data->flags |= SF_HAS_PAR;
4259 data->flags &= ~SF_IN_PAR;
4260 }
653099ff 4261 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4262 cl_and(data->start_class, and_withp);
786e8c11
YO
4263 if (flags & SCF_TRIE_RESTUDY)
4264 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4265
304ee84b 4266 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4267
e2e6a0f1 4268 return min < stopmin ? min : stopmin;
c277df42
IZ
4269}
4270
2eccd3b2
NC
4271STATIC U32
4272S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4273{
4a4e7719
NC
4274 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275
7918f24d
NC
4276 PERL_ARGS_ASSERT_ADD_DATA;
4277
4a4e7719
NC
4278 Renewc(RExC_rxi->data,
4279 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4280 char, struct reg_data);
4281 if(count)
f8fc2ecf 4282 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4283 else
f8fc2ecf 4284 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4285 RExC_rxi->data->count = count + n;
4286 Copy(s, RExC_rxi->data->what + count, n, U8);
4287 return count;
c277df42
IZ
4288}
4289
f8149455 4290/*XXX: todo make this not included in a non debugging perl */
76234dfb 4291#ifndef PERL_IN_XSUB_RE
d88dccdf 4292void
864dbfa3 4293Perl_reginitcolors(pTHX)
d88dccdf 4294{
97aff369 4295 dVAR;
1df70142 4296 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4297 if (s) {
1df70142
AL
4298 char *t = savepv(s);
4299 int i = 0;
4300 PL_colors[0] = t;
d88dccdf 4301 while (++i < 6) {
1df70142
AL
4302 t = strchr(t, '\t');
4303 if (t) {
4304 *t = '\0';
4305 PL_colors[i] = ++t;
d88dccdf
IZ
4306 }
4307 else
1df70142 4308 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4309 }
4310 } else {
1df70142 4311 int i = 0;
b81d288d 4312 while (i < 6)
06b5626a 4313 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4314 }
4315 PL_colorset = 1;
4316}
76234dfb 4317#endif
8615cb43 4318
07be1b83 4319
786e8c11
YO
4320#ifdef TRIE_STUDY_OPT
4321#define CHECK_RESTUDY_GOTO \
4322 if ( \
4323 (data.flags & SCF_TRIE_RESTUDY) \
4324 && ! restudied++ \
4325 ) goto reStudy
4326#else
4327#define CHECK_RESTUDY_GOTO
4328#endif
f9f4320a 4329
a687059c 4330/*
e50aee73 4331 - pregcomp - compile a regular expression into internal code
a687059c
LW
4332 *
4333 * We can't allocate space until we know how big the compiled form will be,
4334 * but we can't compile it (and thus know how big it is) until we've got a
4335 * place to put the code. So we cheat: we compile it twice, once with code
4336 * generation turned off and size counting turned on, and once "for real".
4337 * This also means that we don't allocate space until we are sure that the
4338 * thing really will compile successfully, and we never have to move the
4339 * code and thus invalidate pointers into it. (Note that it has to be in
4340 * one piece because free() must be able to free it all.) [NB: not true in perl]
4341 *
4342 * Beware that the optimization-preparation code in here knows about some
4343 * of the structure of the compiled regexp. [I'll say.]
4344 */
b9b4dddf
YO
4345
4346
4347
f9f4320a 4348#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4349#define RE_ENGINE_PTR &PL_core_reg_engine
4350#else
f9f4320a
YO
4351extern const struct regexp_engine my_reg_engine;
4352#define RE_ENGINE_PTR &my_reg_engine
4353#endif
6d5c990f
RGS
4354
4355#ifndef PERL_IN_XSUB_RE
3ab4a224 4356REGEXP *
1593ad57 4357Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4358{
97aff369 4359 dVAR;
6d5c990f 4360 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4361
4362 PERL_ARGS_ASSERT_PREGCOMP;
4363
f9f4320a
YO
4364 /* Dispatch a request to compile a regexp to correct
4365 regexp engine. */
f9f4320a
YO
4366 if (table) {
4367 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4368 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4369 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4370 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371 DEBUG_COMPILE_r({
8d8756e7 4372 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4373 SvIV(*ptr));
4374 });
3ab4a224 4375 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4376 }
b9b4dddf 4377 }
3ab4a224 4378 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4379}
6d5c990f 4380#endif
2a5d9b1d 4381
3ab4a224 4382REGEXP *
29b09c41 4383Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4384{
4385 dVAR;
288b8c02
NC
4386 REGEXP *rx;
4387 struct regexp *r;
f8fc2ecf 4388 register regexp_internal *ri;
3ab4a224 4389 STRLEN plen;
5d51ce98
KW
4390 char *exp;
4391 char* xend;
c277df42 4392 regnode *scan;
a0d0e21e 4393 I32 flags;
a0d0e21e 4394 I32 minlen = 0;
29b09c41 4395 U32 pm_flags;
e7f38d0f
YO
4396
4397 /* these are all flags - maybe they should be turned
4398 * into a single int with different bit masks */
4399 I32 sawlookahead = 0;
a0d0e21e
LW
4400 I32 sawplus = 0;
4401 I32 sawopen = 0;
29b09c41 4402 bool used_setjump = FALSE;
e7f38d0f 4403
bbd61b5f
KW
4404 U8 jump_ret = 0;
4405 dJMPENV;
2c2d71f5 4406 scan_data_t data;
830247a4 4407 RExC_state_t RExC_state;
be8e71aa 4408 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4409#ifdef TRIE_STUDY_OPT
5d51ce98 4410 int restudied;
07be1b83
YO
4411 RExC_state_t copyRExC_state;
4412#endif
2a5d9b1d 4413 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4414
4415 PERL_ARGS_ASSERT_RE_COMPILE;
4416
6d5c990f 4417 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4418
29b09c41 4419 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
7b597bb8 4420
d6bd454d 4421 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4422 /* Longjmp back to here if have to switch in midstream to utf8 */
4423 if (! RExC_orig_utf8) {
4424 JMPENV_PUSH(jump_ret);
29b09c41 4425 used_setjump = TRUE;
bbd61b5f
KW
4426 }
4427
5d51ce98 4428 if (jump_ret == 0) { /* First time through */
29b09c41
KW
4429 exp = SvPV(pattern, plen);
4430 xend = exp + plen;
4431 /* ignore the utf8ness if the pattern is 0 length */
4432 if (plen == 0) {
4433 RExC_utf8 = RExC_orig_utf8 = 0;
4434 }
4435
5d51ce98
KW
4436 DEBUG_COMPILE_r({
4437 SV *dsv= sv_newmortal();
4438 RE_PV_QUOTED_DECL(s, RExC_utf8,
4439 dsv, exp, plen, 60);
4440 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4441 PL_colors[4],PL_colors[5],s);
4442 });
4443 }
4444 else { /* longjumped back */
bbd61b5f
KW
4445 STRLEN len = plen;
4446
5d51ce98
KW
4447 /* If the cause for the longjmp was other than changing to utf8, pop
4448 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4449 if (jump_ret != UTF8_LONGJMP) {
4450 JMPENV_POP;
4451 JMPENV_JUMP(jump_ret);
4452 }
4453
595598ee
KW
4454 GET_RE_DEBUG_FLAGS;
4455
bbd61b5f
KW
4456 /* It's possible to write a regexp in ascii that represents Unicode
4457 codepoints outside of the byte range, such as via \x{100}. If we
4458 detect such a sequence we have to convert the entire pattern to utf8
4459 and then recompile, as our sizing calculation will have been based
4460 on 1 byte == 1 character, but we will need to use utf8 to encode
4461 at least some part of the pattern, and therefore must convert the whole
4462 thing.
4463 -- dmq */
4464 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4465 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4466 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4467 xend = exp + len;
4468 RExC_orig_utf8 = RExC_utf8 = 1;
4469 SAVEFREEPV(exp);
4470 }
4471
5d51ce98
KW
4472#ifdef TRIE_STUDY_OPT
4473 restudied = 0;
4474#endif
4475
29b09c41
KW
4476 /* Set to use unicode semantics if the pattern is in utf8 and has the
4477 * 'dual' charset specified, as it means unicode when utf8 */
4478 pm_flags = orig_pm_flags;
4479 if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4480 pm_flags |= RXf_PMf_UNICODE;
4481 }
4482
02daf0ab 4483 RExC_precomp = exp;
c737faaf 4484 RExC_flags = pm_flags;
830247a4 4485 RExC_sawback = 0;
bbce6d69 4486
830247a4
IZ
4487 RExC_seen = 0;
4488 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4489 RExC_seen_evals = 0;
4490 RExC_extralen = 0;
c277df42 4491
bbce6d69 4492 /* First pass: determine size, legality. */
830247a4 4493 RExC_parse = exp;
fac92740 4494 RExC_start = exp;
830247a4
IZ
4495 RExC_end = xend;
4496 RExC_naughty = 0;
4497 RExC_npar = 1;
e2e6a0f1 4498 RExC_nestroot = 0;
830247a4
IZ
4499 RExC_size = 0L;
4500 RExC_emit = &PL_regdummy;
4501 RExC_whilem_seen = 0;
40d049e4
YO
4502 RExC_open_parens = NULL;
4503 RExC_close_parens = NULL;
4504 RExC_opend = NULL;
81714fb9 4505 RExC_paren_names = NULL;
1f1031fe
YO
4506#ifdef DEBUGGING
4507 RExC_paren_name_list = NULL;
4508#endif
40d049e4
YO
4509 RExC_recurse = NULL;
4510 RExC_recurse_count = 0;
81714fb9 4511
85ddcde9
JH
4512#if 0 /* REGC() is (currently) a NOP at the first pass.
4513 * Clever compilers notice this and complain. --jhi */
830247a4 4514 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4515#endif
3dab1dad
YO
4516 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4517 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4518 RExC_precomp = NULL;
a0d0e21e
LW
4519 return(NULL);
4520 }
bbd61b5f 4521
29b09c41
KW
4522 /* Here, finished first pass. Get rid of any added setjmp */
4523 if (used_setjump) {
bbd61b5f 4524 JMPENV_POP;
02daf0ab 4525 }
07be1b83 4526 DEBUG_PARSE_r({
81714fb9
YO
4527 PerlIO_printf(Perl_debug_log,
4528 "Required size %"IVdf" nodes\n"
4529 "Starting second pass (creation)\n",
4530 (IV)RExC_size);
07be1b83
YO
4531 RExC_lastnum=0;
4532 RExC_lastparse=NULL;
4533 });
c277df42
IZ
4534 /* Small enough for pointer-storage convention?
4535 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4536 if (RExC_size >= 0x10000L && RExC_extralen)
4537 RExC_size += RExC_extralen;
c277df42 4538 else
830247a4
IZ
4539 RExC_extralen = 0;
4540 if (RExC_whilem_seen > 15)
4541 RExC_whilem_seen = 15;
a0d0e21e 4542
f9f4320a
YO
4543 /* Allocate space and zero-initialize. Note, the two step process
4544 of zeroing when in debug mode, thus anything assigned has to
4545 happen after that */
d2f13c59 4546 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4547 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4548 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4549 char, regexp_internal);
4550 if ( r == NULL || ri == NULL )
b45f050a 4551 FAIL("Regexp out of space");
0f79a09d
GS
4552#ifdef DEBUGGING
4553 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4554 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4555#else
f8fc2ecf
YO
4556 /* bulk initialize base fields with 0. */
4557 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4558#endif
58e23c8d
YO
4559
4560 /* non-zero initialization begins here */
f8fc2ecf 4561 RXi_SET( r, ri );
f9f4320a 4562 r->engine= RE_ENGINE_PTR;
c737faaf 4563 r->extflags = pm_flags;
bcdf7404 4564 {
f7819f85 4565 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4566 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4567
4568 /* The caret is output if there are any defaults: if not all the STD
4569 * flags are set, or if no character set specifier is needed */
4570 bool has_default =
4571 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4572 || ! has_charset);
bcdf7404 4573 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4574 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4575 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4576 const char *fptr = STD_PAT_MODS; /*"msix"*/
4577 char *p;
fb85c044 4578 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4579 * on. If more precision is desired, we could do a population count of
4580 * the flags set. This could be done with a small lookup table, or by
4581 * shifting, masking and adding, or even, when available, assembly
4582 * language for a machine-language population count.
4583 * We never output a minus, as all those are defaults, so are
4584 * covered by the caret */
fb85c044 4585 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4586 + has_default /* If needs a caret */
4587 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4588 + (sizeof(STD_PAT_MODS) - 1)
4589 + (sizeof("(?:)") - 1);
4590
c5ea2ffa 4591 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4592 SvPOK_on(rx);
8f6ae13c 4593 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4594 *p++='('; *p++='?';
9de15fec
KW
4595
4596 /* If a default, cover it using the caret */
c5ea2ffa 4597 if (has_default) {
85508812 4598 *p++= DEFAULT_PAT_MOD;
fb85c044 4599 }
c5ea2ffa
KW
4600 if (has_charset) {
4601 if (r->extflags & RXf_PMf_LOCALE) {
4602 *p++ = LOCALE_PAT_MOD;
4603 } else {
4604 *p++ = UNICODE_PAT_MOD;
4605 }
9de15fec 4606 }
f7819f85
A
4607 if (has_p)
4608 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4609 {
bcdf7404 4610 char ch;
bcdf7404
YO
4611 while((ch = *fptr++)) {
4612 if(reganch & 1)
4613 *p++ = ch;
bcdf7404
YO
4614 reganch >>= 1;
4615 }
bcdf7404
YO
4616 }
4617
28d8d7f4 4618 *p++ = ':';
bb661a58 4619 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4620 assert ((RX_WRAPPED(rx) - p) < 16);
4621 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4622 p += plen;
bcdf7404 4623 if (has_runon)
28d8d7f4
YO
4624 *p++ = '\n';
4625 *p++ = ')';
4626 *p = 0;
fb85c044 4627 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4628 }
4629
bbe252da 4630 r->intflags = 0;
830247a4 4631 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4632
6bda09f9 4633 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4634 Newxz(RExC_open_parens, RExC_npar,regnode *);
4635 SAVEFREEPV(RExC_open_parens);
4636 Newxz(RExC_close_parens,RExC_npar,regnode *);
4637 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4638 }
4639
4640 /* Useful during FAIL. */
7122b237
YO
4641#ifdef RE_TRACK_PATTERN_OFFSETS
4642 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4643 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4644 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4645 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4646 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4647#endif
4648 SetProgLen(ri,RExC_size);
288b8c02 4649 RExC_rx_sv = rx;
830247a4 4650 RExC_rx = r;
f8fc2ecf 4651 RExC_rxi = ri;
bbce6d69 4652
4653 /* Second pass: emit code. */
c737faaf 4654 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4655 RExC_parse = exp;
4656 RExC_end = xend;
4657 RExC_naughty = 0;
4658 RExC_npar = 1;
f8fc2ecf
YO
4659 RExC_emit_start = ri->program;
4660 RExC_emit = ri->program;
3b57cd43
YO
4661 RExC_emit_bound = ri->program + RExC_size + 1;
4662
2cd61cdb 4663 /* Store the count of eval-groups for security checks: */
f8149455 4664 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4665 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4666 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4667 ReREFCNT_dec(rx);
a0d0e21e 4668 return(NULL);
80757612 4669 }
07be1b83
YO
4670 /* XXXX To minimize changes to RE engine we always allocate
4671 3-units-long substrs field. */
4672 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4673 if (RExC_recurse_count) {
4674 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4675 SAVEFREEPV(RExC_recurse);
4676 }
a0d0e21e 4677
07be1b83 4678reStudy:
e7f38d0f 4679 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4680 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4681
07be1b83 4682#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4683 if (!restudied) {
4684 StructCopy(&zero_scan_data, &data, scan_data_t);
4685 copyRExC_state = RExC_state;
4686 } else {
5d458dd8 4687 U32 seen=RExC_seen;
07be1b83 4688 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4689
4690 RExC_state = copyRExC_state;
4691 if (seen & REG_TOP_LEVEL_BRANCHES)
4692 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4693 else
4694 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4695 if (data.last_found) {
07be1b83 4696 SvREFCNT_dec(data.longest_fixed);
07be1b83 4697 SvREFCNT_dec(data.longest_float);
07be1b83 4698 SvREFCNT_dec(data.last_found);
1de06328 4699 }
40d049e4 4700 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4701 }
40d049e4
YO
4702#else
4703 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4704#endif
fc8cd66c 4705
a0d0e21e 4706 /* Dig out information for optimizations. */
f7819f85 4707 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4708 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4709
a0ed51b3 4710 if (UTF)
8f6ae13c 4711 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4712 ri->regstclass = NULL;
830247a4 4713 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4714 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4715 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4716
1de06328
YO
4717 /* testing for BRANCH here tells us whether there is "must appear"
4718 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4719 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4720 I32 fake;
c5254dd6 4721 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4722 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4723 int stclass_flag;
07be1b83 4724 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4725 regnode *first= scan;
4726 regnode *first_next= regnext(first);
639081d6
YO
4727 /*
4728 * Skip introductions and multiplicators >= 1
4729 * so that we can extract the 'meat' of the pattern that must
4730 * match in the large if() sequence following.
4731 * NOTE that EXACT is NOT covered here, as it is normally
4732 * picked up by the optimiser separately.
4733 *
4734 * This is unfortunate as the optimiser isnt handling lookahead
4735 * properly currently.
4736 *
4737 */
a0d0e21e 4738 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4739 /* An OR of *one* alternative - should not happen now. */
5339e136 4740 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4741 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4742 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4743 (OP(first) == PLUS) ||
4744 (OP(first) == MINMOD) ||
653099ff 4745 /* An {n,m} with n>0 */
5339e136
YO
4746 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4747 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4748 {
639081d6
YO
4749 /*
4750 * the only op that could be a regnode is PLUS, all the rest
4751 * will be regnode_1 or regnode_2.
4752 *
4753 */
a0d0e21e
LW
4754 if (OP(first) == PLUS)
4755 sawplus = 1;
4756 else
3dab1dad 4757 first += regarglen[OP(first)];
639081d6
YO
4758
4759 first = NEXTOPER(first);
5339e136 4760 first_next= regnext(first);
a687059c
LW
4761 }
4762
a0d0e21e
LW
4763 /* Starting-point info. */
4764 again:
786e8c11 4765 DEBUG_PEEP("first:",first,0);
07be1b83 4766 /* Ignore EXACT as we deal with it later. */
3dab1dad 4767 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4768 if (OP(first) == EXACT)
6f207bd3 4769 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 4770 else
f8fc2ecf 4771 ri->regstclass = first;
b3c9acc1 4772 }
07be1b83 4773#ifdef TRIE_STCLASS
786e8c11 4774 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4775 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4776 {
786e8c11 4777 regnode *trie_op;
07be1b83 4778 /* this can happen only on restudy */
786e8c11 4779 if ( OP(first) == TRIE ) {
c944940b 4780 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4781 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4782 StructCopy(first,trieop,struct regnode_1);
4783 trie_op=(regnode *)trieop;
4784 } else {
c944940b 4785 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4786 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4787 StructCopy(first,trieop,struct regnode_charclass);
4788 trie_op=(regnode *)trieop;
4789 }
1de06328 4790 OP(trie_op)+=2;
786e8c11 4791 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4792 ri->regstclass = trie_op;
07be1b83
YO
4793 }
4794#endif
e52fc539 4795 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4796 ri->regstclass = first;
3dab1dad
YO
4797 else if (PL_regkind[OP(first)] == BOUND ||
4798 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4799 ri->regstclass = first;
3dab1dad 4800 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4801 r->extflags |= (OP(first) == MBOL
4802 ? RXf_ANCH_MBOL
cad2e5aa 4803 : (OP(first) == SBOL
bbe252da
YO
4804 ? RXf_ANCH_SBOL
4805 : RXf_ANCH_BOL));
a0d0e21e 4806 first = NEXTOPER(first);
774d564b 4807 goto again;
4808 }
4809 else if (OP(first) == GPOS) {
bbe252da 4810 r->extflags |= RXf_ANCH_GPOS;
774d564b 4811 first = NEXTOPER(first);
4812 goto again;
a0d0e21e 4813 }
cf2a2b69
YO
4814 else if ((!sawopen || !RExC_sawback) &&
4815 (OP(first) == STAR &&
3dab1dad 4816 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4817 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4818 {
4819 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4820 const int type =
4821 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4822 ? RXf_ANCH_MBOL
4823 : RXf_ANCH_SBOL;
4824 r->extflags |= type;
4825 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4826 first = NEXTOPER(first);
774d564b 4827 goto again;
a0d0e21e 4828 }
e7f38d0f 4829 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4830 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4831 /* x+ must match at the 1st pos of run of x's */
bbe252da 4832 r->intflags |= PREGf_SKIP;
a0d0e21e 4833
c277df42 4834 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4835#ifdef TRIE_STUDY_OPT
81714fb9 4836 DEBUG_PARSE_r(
be8e71aa
YO
4837 if (!restudied)
4838 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4839 (IV)(first - scan + 1))
4840 );
4841#else
81714fb9 4842 DEBUG_PARSE_r(
be8e71aa
YO
4843 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4844 (IV)(first - scan + 1))
4845 );
4846#endif
4847
4848
a0d0e21e
LW
4849 /*
4850 * If there's something expensive in the r.e., find the
4851 * longest literal string that must appear and make it the
4852 * regmust. Resolve ties in favor of later strings, since
4853 * the regstart check works with the beginning of the r.e.
4854 * and avoiding duplication strengthens checking. Not a
4855 * strong reason, but sufficient in the absence of others.
4856 * [Now we resolve ties in favor of the earlier string if
c277df42 4857 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4858 * earlier string may buy us something the later one won't.]
4859 */
de8c5301 4860
396482e1
GA
4861 data.longest_fixed = newSVpvs("");
4862 data.longest_float = newSVpvs("");
4863 data.last_found = newSVpvs("");
c277df42
IZ
4864 data.longest = &(data.longest_fixed);
4865 first = scan;
f8fc2ecf 4866 if (!ri->regstclass) {
830247a4 4867 cl_init(pRExC_state, &ch_class);
653099ff
GS
4868 data.start_class = &ch_class;
4869 stclass_flag = SCF_DO_STCLASS_AND;
4870 } else /* XXXX Check for BOUND? */
4871 stclass_flag = 0;
cb434fcc 4872 data.last_closep = &last_close;
de8c5301 4873
1de06328 4874 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4875 &data, -1, NULL, NULL,
4876 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4877
07be1b83 4878
786e8c11
YO
4879 CHECK_RESTUDY_GOTO;
4880
4881
830247a4 4882 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4883 && data.last_start_min == 0 && data.last_end > 0
830247a4 4884 && !RExC_seen_zerolen
2bf803e2 4885 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4886 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4887 r->extflags |= RXf_CHECK_ALL;
304ee84b 4888 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4889 SvREFCNT_dec(data.last_found);
4890
1de06328
YO
4891 /* Note that code very similar to this but for anchored string
4892 follows immediately below, changes may need to be made to both.
4893 Be careful.
4894 */
a0ed51b3 4895 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4896 if (longest_float_length
c277df42
IZ
4897 || (data.flags & SF_FL_BEFORE_EOL
4898 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4899 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4900 {
1182767e 4901 I32 t,ml;
cf93c79d 4902
1de06328 4903 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4904 && data.offset_fixed == data.offset_float_min
4905 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4906 goto remove_float; /* As in (a)+. */
4907
1de06328
YO
4908 /* copy the information about the longest float from the reg_scan_data
4909 over to the program. */
33b8afdf
JH
4910 if (SvUTF8(data.longest_float)) {
4911 r->float_utf8 = data.longest_float;
c445ea15 4912 r->float_substr = NULL;
33b8afdf
JH
4913 } else {
4914 r->float_substr = data.longest_float;
c445ea15 4915 r->float_utf8 = NULL;
33b8afdf 4916 }
1de06328
YO
4917 /* float_end_shift is how many chars that must be matched that
4918 follow this item. We calculate it ahead of time as once the
4919 lookbehind offset is added in we lose the ability to correctly
4920 calculate it.*/
4921 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4922 : (I32)longest_float_length;
1de06328
YO
4923 r->float_end_shift = ml - data.offset_float_min
4924 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4925 + data.lookbehind_float;
4926 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4927 r->float_max_offset = data.offset_float_max;
1182767e 4928 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4929 r->float_max_offset -= data.lookbehind_float;
4930
cf93c79d
IZ
4931 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4932 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4933 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4934 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4935 }
4936 else {
aca2d497 4937 remove_float:
c445ea15 4938 r->float_substr = r->float_utf8 = NULL;
c277df42 4939 SvREFCNT_dec(data.longest_float);
c5254dd6 4940 longest_float_length = 0;
a0d0e21e 4941 }
c277df42 4942
1de06328
YO
4943 /* Note that code very similar to this but for floating string
4944 is immediately above, changes may need to be made to both.
4945 Be careful.
4946 */
a0ed51b3 4947 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4948 if (longest_fixed_length
c277df42
IZ
4949 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4950 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4951 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4952 {
1182767e 4953 I32 t,ml;
cf93c79d 4954
1de06328
YO
4955 /* copy the information about the longest fixed
4956 from the reg_scan_data over to the program. */
33b8afdf
JH
4957 if (SvUTF8(data.longest_fixed)) {
4958 r->anchored_utf8 = data.longest_fixed;
c445ea15 4959 r->anchored_substr = NULL;
33b8afdf
JH
4960 } else {
4961 r->anchored_substr = data.longest_fixed;
c445ea15 4962 r->anchored_utf8 = NULL;
33b8afdf 4963 }
1de06328
YO
4964 /* fixed_end_shift is how many chars that must be matched that
4965 follow this item. We calculate it ahead of time as once the
4966 lookbehind offset is added in we lose the ability to correctly
4967 calculate it.*/
4968 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4969 : (I32)longest_fixed_length;
1de06328
YO
4970 r->anchored_end_shift = ml - data.offset_fixed
4971 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4972 + data.lookbehind_fixed;
4973 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4974
cf93c79d
IZ
4975 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4976 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4977 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4978 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4979 }
4980 else {
c445ea15 4981 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4982 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4983 longest_fixed_length = 0;
a0d0e21e 4984 }
f8fc2ecf
YO
4985 if (ri->regstclass
4986 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4987 ri->regstclass = NULL;
33b8afdf
JH
4988 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4989 && stclass_flag
653099ff 4990 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4991 && !cl_is_anything(data.start_class))
4992 {
2eccd3b2 4993 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4994
f8fc2ecf 4995 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4996 struct regnode_charclass_class);
4997 StructCopy(data.start_class,
f8fc2ecf 4998 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4999 struct regnode_charclass_class);
f8fc2ecf 5000 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5001 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5002 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5003 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5004 PerlIO_printf(Perl_debug_log,
a0288114 5005 "synthetic stclass \"%s\".\n",
3f7c398e 5006 SvPVX_const(sv));});
653099ff 5007 }
c277df42
IZ
5008
5009 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5010 if (longest_fixed_length > longest_float_length) {
1de06328 5011 r->check_end_shift = r->anchored_end_shift;
c277df42 5012 r->check_substr = r->anchored_substr;
33b8afdf 5013 r->check_utf8 = r->anchored_utf8;
c277df42 5014 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5015 if (r->extflags & RXf_ANCH_SINGLE)
5016 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5017 }
5018 else {
1de06328 5019 r->check_end_shift = r->float_end_shift;
c277df42 5020 r->check_substr = r->float_substr;
33b8afdf 5021 r->check_utf8 = r->float_utf8;
1de06328
YO
5022 r->check_offset_min = r->float_min_offset;
5023 r->check_offset_max = r->float_max_offset;
a0d0e21e 5024 }
30382c73
IZ
5025 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5026 This should be changed ASAP! */
bbe252da
YO
5027 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5028 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5029 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5030 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5031 }
1de06328
YO
5032 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5033 if ( (STRLEN)minlen < longest_float_length )
5034 minlen= longest_float_length;
5035 if ( (STRLEN)minlen < longest_fixed_length )
5036 minlen= longest_fixed_length;
5037 */
a0ed51b3
LW
5038 }
5039 else {
c277df42
IZ
5040 /* Several toplevels. Best we can is to set minlen. */
5041 I32 fake;
653099ff 5042 struct regnode_charclass_class ch_class;
cb434fcc 5043 I32 last_close = 0;
c277df42 5044
5d458dd8 5045 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5046
f8fc2ecf 5047 scan = ri->program + 1;
830247a4 5048 cl_init(pRExC_state, &ch_class);
653099ff 5049 data.start_class = &ch_class;
cb434fcc 5050 data.last_closep = &last_close;
07be1b83 5051
de8c5301 5052
1de06328 5053 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5054 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5055
786e8c11 5056 CHECK_RESTUDY_GOTO;
07be1b83 5057
33b8afdf 5058 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5059 = r->float_substr = r->float_utf8 = NULL;
653099ff 5060 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5061 && !cl_is_anything(data.start_class))
5062 {
2eccd3b2 5063 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5064
f8fc2ecf 5065 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5066 struct regnode_charclass_class);
5067 StructCopy(data.start_class,
f8fc2ecf 5068 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5069 struct regnode_charclass_class);
f8fc2ecf 5070 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5071 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5072 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5073 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5074 PerlIO_printf(Perl_debug_log,
a0288114 5075 "synthetic stclass \"%s\".\n",
3f7c398e 5076 SvPVX_const(sv));});
653099ff 5077 }
a0d0e21e
LW
5078 }
5079
1de06328
YO
5080 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5081 the "real" pattern. */
cf9788e3
RGS
5082 DEBUG_OPTIMISE_r({
5083 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5084 (IV)minlen, (IV)r->minlen);
cf9788e3 5085 });
de8c5301 5086 r->minlenret = minlen;
1de06328
YO
5087 if (r->minlen < minlen)
5088 r->minlen = minlen;
5089
b81d288d 5090 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5091 r->extflags |= RXf_GPOS_SEEN;
830247a4 5092 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5093 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5094 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5095 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5096 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5097 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5098 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5099 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5100 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5101 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5102 if (RExC_paren_names)
85fbaab2 5103 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5104 else
5daac39c 5105 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5106
7bd1e614 5107#ifdef STUPID_PATTERN_CHECKS
5509d87a 5108 if (RX_PRELEN(rx) == 0)
640f820d 5109 r->extflags |= RXf_NULL;
5509d87a 5110 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5111 /* XXX: this should happen BEFORE we compile */
5112 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5113 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5114 r->extflags |= RXf_WHITE;
5509d87a 5115 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5116 r->extflags |= RXf_START_ONLY;
f1b875a0 5117#else
5509d87a 5118 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5119 /* XXX: this should happen BEFORE we compile */
5120 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5121 else {
5122 regnode *first = ri->program + 1;
39aa8307
JH
5123 U8 fop = OP(first);
5124 U8 nop = OP(NEXTOPER(first));
7bd1e614 5125
640f820d
AB
5126 if (PL_regkind[fop] == NOTHING && nop == END)
5127 r->extflags |= RXf_NULL;
5128 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5129 r->extflags |= RXf_START_ONLY;
5130 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5131 r->extflags |= RXf_WHITE;
5132 }
f1b875a0 5133#endif
1f1031fe
YO
5134#ifdef DEBUGGING
5135 if (RExC_paren_names) {
af534a04 5136 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5137 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5138 } else
1f1031fe 5139#endif
cde0cee5 5140 ri->name_list_idx = 0;
1f1031fe 5141
40d049e4
YO
5142 if (RExC_recurse_count) {
5143 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5144 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5145 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5146 }
5147 }
f0ab9afb 5148 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5149 /* assume we don't need to swap parens around before we match */
5150
be8e71aa
YO
5151 DEBUG_DUMP_r({
5152 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5153 regdump(r);
5154 });
7122b237
YO
5155#ifdef RE_TRACK_PATTERN_OFFSETS
5156 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5157 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5158 U32 i;
5159 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5160 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5161 for (i = 1; i <= len; i++) {
7122b237 5162 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5163 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5164 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5165 }
5166 PerlIO_printf(Perl_debug_log, "\n");
5167 });
7122b237 5168#endif
288b8c02 5169 return rx;
a687059c
LW
5170}
5171
f9f4320a 5172#undef RE_ENGINE_PTR
3dab1dad 5173
93b32b6d 5174
81714fb9 5175SV*
192b9cd1
AB
5176Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5177 const U32 flags)
5178{
7918f24d
NC
5179 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5180
192b9cd1
AB
5181 PERL_UNUSED_ARG(value);
5182
f1b875a0 5183 if (flags & RXapif_FETCH) {
192b9cd1 5184 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5185 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5186 Perl_croak_no_modify(aTHX);
192b9cd1 5187 return NULL;
f1b875a0 5188 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5189 return reg_named_buff_exists(rx, key, flags)
5190 ? &PL_sv_yes
5191 : &PL_sv_no;
f1b875a0 5192 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5193 return reg_named_buff_all(rx, flags);
f1b875a0 5194 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5195 return reg_named_buff_scalar(rx, flags);
5196 } else {
5197 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5198 return NULL;
5199 }
5200}
5201
5202SV*
5203Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5204 const U32 flags)
5205{
7918f24d 5206 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5207 PERL_UNUSED_ARG(lastkey);
5208
f1b875a0 5209 if (flags & RXapif_FIRSTKEY)
192b9cd1 5210 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5211 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5212 return reg_named_buff_nextkey(rx, flags);
5213 else {
5214 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5215 return NULL;
5216 }
5217}
5218
5219SV*
288b8c02
NC
5220Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5221 const U32 flags)
81714fb9 5222{
44a2ac75
YO
5223 AV *retarray = NULL;
5224 SV *ret;
288b8c02 5225 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5226
5227 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5228
f1b875a0 5229 if (flags & RXapif_ALL)
44a2ac75 5230 retarray=newAV();
93b32b6d 5231
5daac39c
NC
5232 if (rx && RXp_PAREN_NAMES(rx)) {
5233 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5234 if (he_str) {
5235 IV i;
5236 SV* sv_dat=HeVAL(he_str);
5237 I32 *nums=(I32*)SvPVX(sv_dat);
5238 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5239 if ((I32)(rx->nparens) >= nums[i]
5240 && rx->offs[nums[i]].start != -1
5241 && rx->offs[nums[i]].end != -1)
93b32b6d 5242 {
49d7dfbc 5243 ret = newSVpvs("");
288b8c02 5244 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5245 if (!retarray)
5246 return ret;
5247 } else {
5248 ret = newSVsv(&PL_sv_undef);
5249 }
ec83ea38 5250 if (retarray)
93b32b6d 5251 av_push(retarray, ret);
81714fb9 5252 }
93b32b6d 5253 if (retarray)
ad64d0ec 5254 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5255 }
5256 }
5257 return NULL;
5258}
5259
5260bool
288b8c02 5261Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5262 const U32 flags)
5263{
288b8c02 5264 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5265
5266 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5267
5daac39c 5268 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5269 if (flags & RXapif_ALL) {
5daac39c 5270 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5271 } else {
288b8c02 5272 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5273 if (sv) {
5274 SvREFCNT_dec(sv);
192b9cd1
AB
5275 return TRUE;
5276 } else {
5277 return FALSE;
5278 }
5279 }
5280 } else {
5281 return FALSE;
5282 }
5283}
5284
5285SV*
288b8c02 5286Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5287{
288b8c02 5288 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5289
5290 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5291
5daac39c
NC
5292 if ( rx && RXp_PAREN_NAMES(rx) ) {
5293 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5294
288b8c02 5295 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5296 } else {
5297 return FALSE;
5298 }
192b9cd1
AB
5299}
5300
5301SV*
288b8c02 5302Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5303{
288b8c02 5304 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5305 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5306
5307 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5308
5daac39c
NC
5309 if (rx && RXp_PAREN_NAMES(rx)) {
5310 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5311 HE *temphe;
5312 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5313 IV i;
5314 IV parno = 0;
5315 SV* sv_dat = HeVAL(temphe);
5316 I32 *nums = (I32*)SvPVX(sv_dat);
5317 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5318 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5319 rx->offs[nums[i]].start != -1 &&
5320 rx->offs[nums[i]].end != -1)
5321 {
5322 parno = nums[i];
5323 break;
5324 }
5325 }
f1b875a0 5326 if (parno || flags & RXapif_ALL) {
a663657d 5327 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5328 }
81714fb9
YO
5329 }
5330 }
44a2ac75
YO
5331 return NULL;
5332}
5333
192b9cd1 5334SV*
288b8c02 5335Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5336{
5337 SV *ret;
5338 AV *av;
5339 I32 length;
288b8c02 5340 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5341
7918f24d
NC
5342 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5343
5daac39c 5344 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5345 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5346 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5347 } else if (flags & RXapif_ONE) {
288b8c02 5348 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5349 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5350 length = av_len(av);
ec83ea38 5351 SvREFCNT_dec(ret);
192b9cd1
AB
5352 return newSViv(length + 1);
5353 } else {
5354 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5355 return NULL;
5356 }
5357 }
5358 return &PL_sv_undef;
5359}
5360
5361SV*
288b8c02 5362Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5363{
288b8c02 5364 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5365 AV *av = newAV();
5366
7918f24d
NC
5367 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5368
5daac39c
NC
5369 if (rx && RXp_PAREN_NAMES(rx)) {
5370 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5371 HE *temphe;
5372 (void)hv_iterinit(hv);
5373 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5374 IV i;
5375 IV parno = 0;
5376 SV* sv_dat = HeVAL(temphe);
5377 I32 *nums = (I32*)SvPVX(sv_dat);
5378 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5379 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5380 rx->offs[nums[i]].start != -1 &&
5381 rx->offs[nums[i]].end != -1)
5382 {
5383 parno = nums[i];
5384 break;
5385 }
5386 }
f1b875a0 5387 if (parno || flags & RXapif_ALL) {
a663657d 5388 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5389 }
5390 }
5391 }
5392
ad64d0ec 5393 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5394}
5395
49d7dfbc 5396void
288b8c02
NC
5397Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5398 SV * const sv)
44a2ac75 5399{
288b8c02 5400 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5401 char *s = NULL;
a9d504c3 5402 I32 i = 0;
44a2ac75 5403 I32 s1, t1;
7918f24d
NC
5404
5405 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5406
cde0cee5
YO
5407 if (!rx->subbeg) {
5408 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5409 return;
cde0cee5
YO
5410 }
5411 else
f1b875a0 5412 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5413 /* $` */
f0ab9afb 5414 i = rx->offs[0].start;
cde0cee5 5415 s = rx->subbeg;
44a2ac75
YO
5416 }
5417 else
f1b875a0 5418 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5419 /* $' */
f0ab9afb
NC
5420 s = rx->subbeg + rx->offs[0].end;
5421 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5422 }
5423 else
5424 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5425 (s1 = rx->offs[paren].start) != -1 &&
5426 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5427 {
5428 /* $& $1 ... */
5429 i = t1 - s1;
5430 s = rx->subbeg + s1;
cde0cee5
YO
5431 } else {
5432 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5433 return;
cde0cee5
YO
5434 }
5435 assert(rx->sublen >= (s - rx->subbeg) + i );
5436 if (i >= 0) {
5437 const int oldtainted = PL_tainted;
5438 TAINT_NOT;
5439 sv_setpvn(sv, s, i);
5440 PL_tainted = oldtainted;
5441 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5442 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5443 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5444 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5445 {
5446 SvUTF8_on(sv);
5447 }
5448 else
5449 SvUTF8_off(sv);
5450 if (PL_tainting) {
07bc277f 5451 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5452 if (SvTYPE(sv) >= SVt_PVMG) {
5453 MAGIC* const mg = SvMAGIC(sv);
5454 MAGIC* mgt;
5455 PL_tainted = 1;
5456 SvMAGIC_set(sv, mg->mg_moremagic);
5457 SvTAINT(sv);
5458 if ((mgt = SvMAGIC(sv))) {
5459 mg->mg_moremagic = mgt;
5460 SvMAGIC_set(sv, mg);
44a2ac75 5461 }
cde0cee5
YO
5462 } else {
5463 PL_tainted = 1;
5464 SvTAINT(sv);
5465 }
5466 } else
5467 SvTAINTED_off(sv);
44a2ac75 5468 }
81714fb9 5469 } else {
44a2ac75 5470 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5471 return;
81714fb9
YO
5472 }
5473}
93b32b6d 5474
2fdbfb4d
AB
5475void
5476Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5477 SV const * const value)
5478{
7918f24d
NC
5479 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5480
2fdbfb4d
AB
5481 PERL_UNUSED_ARG(rx);
5482 PERL_UNUSED_ARG(paren);
5483 PERL_UNUSED_ARG(value);
5484
5485 if (!PL_localizing)
6ad8f254 5486 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5487}
5488
5489I32
288b8c02 5490Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5491 const I32 paren)
5492{
288b8c02 5493 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5494 I32 i;
5495 I32 s1, t1;
5496
7918f24d
NC
5497 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5498
2fdbfb4d
AB
5499 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5500 switch (paren) {
192b9cd1 5501 /* $` / ${^PREMATCH} */
f1b875a0 5502 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5503 if (rx->offs[0].start != -1) {
5504 i = rx->offs[0].start;
5505 if (i > 0) {
5506 s1 = 0;
5507 t1 = i;
5508 goto getlen;
5509 }
5510 }
5511 return 0;
192b9cd1 5512 /* $' / ${^POSTMATCH} */
f1b875a0 5513 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5514 if (rx->offs[0].end != -1) {
5515 i = rx->sublen - rx->offs[0].end;
5516 if (i > 0) {
5517 s1 = rx->offs[0].end;
5518 t1 = rx->sublen;
5519 goto getlen;
5520 }
5521 }
5522 return 0;
192b9cd1
AB
5523 /* $& / ${^MATCH}, $1, $2, ... */
5524 default:
2fdbfb4d
AB
5525 if (paren <= (I32)rx->nparens &&
5526 (s1 = rx->offs[paren].start) != -1 &&
5527 (t1 = rx->offs[paren].end) != -1)
5528 {
5529 i = t1 - s1;
5530 goto getlen;
5531 } else {
5532 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5533 report_uninit((const SV *)sv);
2fdbfb4d
AB
5534 return 0;
5535 }
5536 }
5537 getlen:
07bc277f 5538 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5539 const char * const s = rx->subbeg + s1;
5540 const U8 *ep;
5541 STRLEN el;
5542
5543 i = t1 - s1;
5544 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5545 i = el;
5546 }
5547 return i;
5548}
5549
fe578d7f 5550SV*
49d7dfbc 5551Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5552{
7918f24d 5553 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5554 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5555 if (0)
5556 return NULL;
5557 else
5558 return newSVpvs("Regexp");
fe578d7f 5559}
0a4db386 5560
894be9b7 5561/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5562 * If flags is REG_RSN_RETURN_NULL returns null.
5563 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5564 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5565 * to the parsed name as looked up in the RExC_paren_names hash.
5566 * If there is an error throws a vFAIL().. type exception.
894be9b7 5567 */
0a4db386
YO
5568
5569#define REG_RSN_RETURN_NULL 0
5570#define REG_RSN_RETURN_NAME 1
5571#define REG_RSN_RETURN_DATA 2
5572
894be9b7 5573STATIC SV*
7918f24d
NC
5574S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5575{
894be9b7 5576 char *name_start = RExC_parse;
1f1031fe 5577
7918f24d
NC
5578 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5579
1f1031fe
YO
5580 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5581 /* skip IDFIRST by using do...while */
5582 if (UTF)
5583 do {
5584 RExC_parse += UTF8SKIP(RExC_parse);
5585 } while (isALNUM_utf8((U8*)RExC_parse));
5586 else
5587 do {
5588 RExC_parse++;
5589 } while (isALNUM(*RExC_parse));
894be9b7 5590 }
1f1031fe 5591
0a4db386 5592 if ( flags ) {
59cd0e26
NC
5593 SV* sv_name
5594 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5595 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5596 if ( flags == REG_RSN_RETURN_NAME)
5597 return sv_name;
5598 else if (flags==REG_RSN_RETURN_DATA) {
5599 HE *he_str = NULL;
5600 SV *sv_dat = NULL;
5601 if ( ! sv_name ) /* should not happen*/
5602 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5603 if (RExC_paren_names)
5604 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5605 if ( he_str )
5606 sv_dat = HeVAL(he_str);
5607 if ( ! sv_dat )
5608 vFAIL("Reference to nonexistent named group");
5609 return sv_dat;
5610 }
5611 else {
5612 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5613 }
5614 /* NOT REACHED */
894be9b7 5615 }
0a4db386 5616 return NULL;
894be9b7
YO
5617}
5618
3dab1dad
YO
5619#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5620 int rem=(int)(RExC_end - RExC_parse); \
5621 int cut; \
5622 int num; \
5623 int iscut=0; \
5624 if (rem>10) { \
5625 rem=10; \
5626 iscut=1; \
5627 } \
5628 cut=10-rem; \
5629 if (RExC_lastparse!=RExC_parse) \
5630 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5631 rem, RExC_parse, \
5632 cut + 4, \
5633 iscut ? "..." : "<" \
5634 ); \
5635 else \
5636 PerlIO_printf(Perl_debug_log,"%16s",""); \
5637 \
5638 if (SIZE_ONLY) \
3b57cd43 5639 num = RExC_size + 1; \
3dab1dad
YO
5640 else \
5641 num=REG_NODE_NUM(RExC_emit); \
5642 if (RExC_lastnum!=num) \
0a4db386 5643 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5644 else \
0a4db386 5645 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5646 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5647 (int)((depth*2)), "", \
3dab1dad
YO
5648 (funcname) \
5649 ); \
5650 RExC_lastnum=num; \
5651 RExC_lastparse=RExC_parse; \
5652})
5653
07be1b83
YO
5654
5655
3dab1dad
YO
5656#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5657 DEBUG_PARSE_MSG((funcname)); \
5658 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5659})
6bda09f9
YO
5660#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5661 DEBUG_PARSE_MSG((funcname)); \
5662 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5663})
a687059c
LW
5664/*
5665 - reg - regular expression, i.e. main body or parenthesized thing
5666 *
5667 * Caller must absorb opening parenthesis.
5668 *
5669 * Combining parenthesis handling with the base level of regular expression
5670 * is a trifle forced, but the need to tie the tails of the branches to what
5671 * follows makes it hard to avoid.
5672 */
07be1b83
YO
5673#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5674#ifdef DEBUGGING
5675#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5676#else
5677#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5678#endif
3dab1dad 5679
76e3520e 5680STATIC regnode *
3dab1dad 5681S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5682 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5683{
27da23d5 5684 dVAR;
c277df42
IZ
5685 register regnode *ret; /* Will be the head of the group. */
5686 register regnode *br;
5687 register regnode *lastbr;
cbbf8932 5688 register regnode *ender = NULL;
a0d0e21e 5689 register I32 parno = 0;
cbbf8932 5690 I32 flags;
f7819f85 5691 U32 oregflags = RExC_flags;
6136c704
AL
5692 bool have_branch = 0;
5693 bool is_open = 0;
594d7033
YO
5694 I32 freeze_paren = 0;
5695 I32 after_freeze = 0;
9d1d55b5
JP
5696
5697 /* for (?g), (?gc), and (?o) warnings; warning
5698 about (?c) will warn about (?g) -- japhy */
5699
6136c704
AL
5700#define WASTED_O 0x01
5701#define WASTED_G 0x02
5702#define WASTED_C 0x04
5703#define WASTED_GC (0x02|0x04)
cbbf8932 5704 I32 wastedflags = 0x00;
9d1d55b5 5705
fac92740 5706 char * parse_start = RExC_parse; /* MJD */
a28509cc 5707 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5708
3dab1dad 5709 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5710
5711 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5712 DEBUG_PARSE("reg ");
5713
821b33a5 5714 *flagp = 0; /* Tentatively. */
a0d0e21e 5715
9d1d55b5 5716
a0d0e21e
LW
5717 /* Make an OPEN node, if parenthesized. */
5718 if (paren) {
e2e6a0f1
YO
5719 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5720 char *start_verb = RExC_parse;
5721 STRLEN verb_len = 0;
5722 char *start_arg = NULL;
5723 unsigned char op = 0;
5724 int argok = 1;
5725 int internal_argval = 0; /* internal_argval is only useful if !argok */
5726 while ( *RExC_parse && *RExC_parse != ')' ) {
5727 if ( *RExC_parse == ':' ) {
5728 start_arg = RExC_parse + 1;
5729 break;
5730 }
5731 RExC_parse++;
5732 }
5733 ++start_verb;
5734 verb_len = RExC_parse - start_verb;
5735 if ( start_arg ) {
5736 RExC_parse++;
5737 while ( *RExC_parse && *RExC_parse != ')' )
5738 RExC_parse++;
5739 if ( *RExC_parse != ')' )
5740 vFAIL("Unterminated verb pattern argument");
5741 if ( RExC_parse == start_arg )
5742 start_arg = NULL;
5743 } else {
5744 if ( *RExC_parse != ')' )
5745 vFAIL("Unterminated verb pattern");
5746 }
5d458dd8 5747
e2e6a0f1
YO
5748 switch ( *start_verb ) {
5749 case 'A': /* (*ACCEPT) */
568a785a 5750 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5751 op = ACCEPT;
5752 internal_argval = RExC_nestroot;
5753 }
5754 break;
5755 case 'C': /* (*COMMIT) */
568a785a 5756 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5757 op = COMMIT;
e2e6a0f1
YO
5758 break;
5759 case 'F': /* (*FAIL) */
568a785a 5760 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5761 op = OPFAIL;
5762 argok = 0;
5763 }
5764 break;
5d458dd8
YO
5765 case ':': /* (*:NAME) */
5766 case 'M': /* (*MARK:NAME) */
568a785a 5767 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5768 op = MARKPOINT;
5d458dd8
YO
5769 argok = -1;
5770 }
5771 break;
5772 case 'P': /* (*PRUNE) */
568a785a 5773 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5774 op = PRUNE;
e2e6a0f1 5775 break;
5d458dd8 5776 case 'S': /* (*SKIP) */
568a785a 5777 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5778 op = SKIP;
5779 break;
5780 case 'T': /* (*THEN) */
5781 /* [19:06] <TimToady> :: is then */
568a785a 5782 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5783 op = CUTGROUP;
5784 RExC_seen |= REG_SEEN_CUTGROUP;
5785 }
e2e6a0f1
YO
5786 break;
5787 }
5788 if ( ! op ) {
5789 RExC_parse++;
5790 vFAIL3("Unknown verb pattern '%.*s'",
5791 verb_len, start_verb);
5792 }
5793 if ( argok ) {
5794 if ( start_arg && internal_argval ) {
5795 vFAIL3("Verb pattern '%.*s' may not have an argument",
5796 verb_len, start_verb);
5797 } else if ( argok < 0 && !start_arg ) {
5798 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5799 verb_len, start_verb);
5800 } else {
5801 ret = reganode(pRExC_state, op, internal_argval);
5802 if ( ! internal_argval && ! SIZE_ONLY ) {
5803 if (start_arg) {
5804 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5805 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5806 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5807 ret->flags = 0;
5808 } else {
5809 ret->flags = 1;
5810 }
5811 }
5812 }
5813 if (!internal_argval)
5814 RExC_seen |= REG_SEEN_VERBARG;
5815 } else if ( start_arg ) {
5816 vFAIL3("Verb pattern '%.*s' may not have an argument",
5817 verb_len, start_verb);
5818 } else {
5819 ret = reg_node(pRExC_state, op);
5820 }
5821 nextchar(pRExC_state);
5822 return ret;
5823 } else
fac92740 5824 if (*RExC_parse == '?') { /* (?...) */
6136c704 5825 bool is_logical = 0;
a28509cc 5826 const char * const seqstart = RExC_parse;
fb85c044 5827 bool has_use_defaults = FALSE;
ca9dfc88 5828
830247a4
IZ
5829 RExC_parse++;
5830 paren = *RExC_parse++;
c277df42 5831 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5832 switch (paren) {
894be9b7 5833
1f1031fe
YO
5834 case 'P': /* (?P...) variants for those used to PCRE/Python */
5835 paren = *RExC_parse++;
5836 if ( paren == '<') /* (?P<...>) named capture */
5837 goto named_capture;
5838 else if (paren == '>') { /* (?P>name) named recursion */
5839 goto named_recursion;
5840 }
5841 else if (paren == '=') { /* (?P=...) named backref */
5842 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5843 you change this make sure you change that */
5844 char* name_start = RExC_parse;
5845 U32 num = 0;
5846 SV *sv_dat = reg_scan_name(pRExC_state,
5847 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5848 if (RExC_parse == name_start || *RExC_parse != ')')
5849 vFAIL2("Sequence %.3s... not terminated",parse_start);
5850
5851 if (!SIZE_ONLY) {
5852 num = add_data( pRExC_state, 1, "S" );
5853 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5854 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5855 }
5856 RExC_sawback = 1;
4444fd9f
KW
5857 ret = reganode(pRExC_state,
5858 ((! FOLD)
5859 ? NREF
5860 : (UNI_SEMANTICS)
5861 ? NREFFU
5862 : (LOC)
5863 ? NREFFL
5864 : NREFF),
5865 num);
1f1031fe
YO
5866 *flagp |= HASWIDTH;
5867
5868 Set_Node_Offset(ret, parse_start+1);
5869 Set_Node_Cur_Length(ret); /* MJD */
5870
5871 nextchar(pRExC_state);
5872 return ret;
5873 }
57b84237
YO
5874 RExC_parse++;
5875 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876 /*NOTREACHED*/
5877 case '<': /* (?<...) */
b81d288d 5878 if (*RExC_parse == '!')
c277df42 5879 paren = ',';
0a4db386 5880 else if (*RExC_parse != '=')
1f1031fe 5881 named_capture:
0a4db386 5882 { /* (?<...>) */
81714fb9 5883 char *name_start;
894be9b7 5884 SV *svname;
81714fb9
YO
5885 paren= '>';
5886 case '\'': /* (?'...') */
5887 name_start= RExC_parse;
0a4db386
YO
5888 svname = reg_scan_name(pRExC_state,
5889 SIZE_ONLY ? /* reverse test from the others */
5890 REG_RSN_RETURN_NAME :
5891 REG_RSN_RETURN_NULL);
57b84237
YO
5892 if (RExC_parse == name_start) {
5893 RExC_parse++;
5894 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5895 /*NOTREACHED*/
5896 }
81714fb9
YO
5897 if (*RExC_parse != paren)
5898 vFAIL2("Sequence (?%c... not terminated",
5899 paren=='>' ? '<' : paren);
5900 if (SIZE_ONLY) {
e62cc96a
YO
5901 HE *he_str;
5902 SV *sv_dat = NULL;
486ec47a 5903 if (!svname) /* shouldn't happen */
894be9b7
YO
5904 Perl_croak(aTHX_
5905 "panic: reg_scan_name returned NULL");
81714fb9
YO
5906 if (!RExC_paren_names) {
5907 RExC_paren_names= newHV();
ad64d0ec 5908 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5909#ifdef DEBUGGING
5910 RExC_paren_name_list= newAV();
ad64d0ec 5911 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5912#endif
81714fb9
YO
5913 }
5914 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5915 if ( he_str )
81714fb9 5916 sv_dat = HeVAL(he_str);
e62cc96a 5917 if ( ! sv_dat ) {
81714fb9 5918 /* croak baby croak */
e62cc96a
YO
5919 Perl_croak(aTHX_
5920 "panic: paren_name hash element allocation failed");
5921 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5922 /* (?|...) can mean we have dupes so scan to check
5923 its already been stored. Maybe a flag indicating
5924 we are inside such a construct would be useful,
5925 but the arrays are likely to be quite small, so
5926 for now we punt -- dmq */
5927 IV count = SvIV(sv_dat);
5928 I32 *pv = (I32*)SvPVX(sv_dat);
5929 IV i;
5930 for ( i = 0 ; i < count ; i++ ) {
5931 if ( pv[i] == RExC_npar ) {
5932 count = 0;
5933 break;
5934 }
5935 }
5936 if ( count ) {
5937 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5938 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5939 pv[count] = RExC_npar;
3a92e6ae 5940 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5941 }
81714fb9
YO
5942 } else {
5943 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5944 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5945 SvIOK_on(sv_dat);
3ec35e0f 5946 SvIV_set(sv_dat, 1);
e62cc96a 5947 }
1f1031fe
YO
5948#ifdef DEBUGGING
5949 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5950 SvREFCNT_dec(svname);
5951#endif
e62cc96a 5952
81714fb9
YO
5953 /*sv_dump(sv_dat);*/
5954 }
5955 nextchar(pRExC_state);
5956 paren = 1;
5957 goto capturing_parens;
5958 }
5959 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5960 RExC_parse++;
fac92740 5961 case '=': /* (?=...) */
89c6a13e 5962 RExC_seen_zerolen++;
5c3fa2e7 5963 break;
fac92740 5964 case '!': /* (?!...) */
830247a4 5965 RExC_seen_zerolen++;
e2e6a0f1
YO
5966 if (*RExC_parse == ')') {
5967 ret=reg_node(pRExC_state, OPFAIL);
5968 nextchar(pRExC_state);
5969 return ret;
5970 }
594d7033
YO
5971 break;
5972 case '|': /* (?|...) */
5973 /* branch reset, behave like a (?:...) except that
5974 buffers in alternations share the same numbers */
5975 paren = ':';
5976 after_freeze = freeze_paren = RExC_npar;
5977 break;
fac92740
MJD
5978 case ':': /* (?:...) */
5979 case '>': /* (?>...) */
a0d0e21e 5980 break;
fac92740
MJD
5981 case '$': /* (?$...) */
5982 case '@': /* (?@...) */
8615cb43 5983 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5984 break;
fac92740 5985 case '#': /* (?#...) */
830247a4
IZ
5986 while (*RExC_parse && *RExC_parse != ')')
5987 RExC_parse++;
5988 if (*RExC_parse != ')')
c277df42 5989 FAIL("Sequence (?#... not terminated");
830247a4 5990 nextchar(pRExC_state);
a0d0e21e
LW
5991 *flagp = TRYAGAIN;
5992 return NULL;
894be9b7
YO
5993 case '0' : /* (?0) */
5994 case 'R' : /* (?R) */
5995 if (*RExC_parse != ')')
6bda09f9 5996 FAIL("Sequence (?R) not terminated");
1a147d38 5997 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5998 *flagp |= POSTPONED;
7f69552c
YO
5999 nextchar(pRExC_state);
6000 return ret;
6001 /*notreached*/
894be9b7
YO
6002 { /* named and numeric backreferences */
6003 I32 num;
894be9b7
YO
6004 case '&': /* (?&NAME) */
6005 parse_start = RExC_parse - 1;
1f1031fe 6006 named_recursion:
894be9b7 6007 {
0a4db386
YO
6008 SV *sv_dat = reg_scan_name(pRExC_state,
6009 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6010 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
6011 }
6012 goto gen_recurse_regop;
6013 /* NOT REACHED */
542fa716
YO
6014 case '+':
6015 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6016 RExC_parse++;
6017 vFAIL("Illegal pattern");
6018 }
6019 goto parse_recursion;
6020 /* NOT REACHED*/
6021 case '-': /* (?-1) */
6022 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6023 RExC_parse--; /* rewind to let it be handled later */
6024 goto parse_flags;
6025 }
6026 /*FALLTHROUGH */
6bda09f9
YO
6027 case '1': case '2': case '3': case '4': /* (?1) */
6028 case '5': case '6': case '7': case '8': case '9':
6029 RExC_parse--;
542fa716 6030 parse_recursion:
894be9b7
YO
6031 num = atoi(RExC_parse);
6032 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
6033 if (*RExC_parse == '-')
6034 RExC_parse++;
6bda09f9
YO
6035 while (isDIGIT(*RExC_parse))
6036 RExC_parse++;
6037 if (*RExC_parse!=')')
6038 vFAIL("Expecting close bracket");
894be9b7
YO
6039
6040 gen_recurse_regop:
542fa716
YO
6041 if ( paren == '-' ) {
6042 /*
6043 Diagram of capture buffer numbering.
6044 Top line is the normal capture buffer numbers
3b753521 6045 Bottom line is the negative indexing as from
542fa716
YO
6046 the X (the (?-2))
6047
6048 + 1 2 3 4 5 X 6 7
6049 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6050 - 5 4 3 2 1 X x x
6051
6052 */
6053 num = RExC_npar + num;
6054 if (num < 1) {
6055 RExC_parse++;
6056 vFAIL("Reference to nonexistent group");
6057 }
6058 } else if ( paren == '+' ) {
6059 num = RExC_npar + num - 1;
6060 }
6061
1a147d38 6062 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6063 if (!SIZE_ONLY) {
6064 if (num > (I32)RExC_rx->nparens) {
6065 RExC_parse++;
6066 vFAIL("Reference to nonexistent group");
6067 }
40d049e4 6068 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6069 RExC_emit++;
226de585 6070 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6071 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6072 } else {
6bda09f9 6073 RExC_size++;
6bda09f9 6074 }
0a4db386 6075 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6076 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6077 Set_Node_Offset(ret, parse_start); /* MJD */
6078
a3b492c3 6079 *flagp |= POSTPONED;
6bda09f9
YO
6080 nextchar(pRExC_state);
6081 return ret;
894be9b7
YO
6082 } /* named and numeric backreferences */
6083 /* NOT REACHED */
6084
fac92740 6085 case '?': /* (??...) */
6136c704 6086 is_logical = 1;
57b84237
YO
6087 if (*RExC_parse != '{') {
6088 RExC_parse++;
6089 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6090 /*NOTREACHED*/
6091 }
a3b492c3 6092 *flagp |= POSTPONED;
830247a4 6093 paren = *RExC_parse++;
0f5d15d6 6094 /* FALL THROUGH */
fac92740 6095 case '{': /* (?{...}) */
c277df42 6096 {
2eccd3b2
NC
6097 I32 count = 1;
6098 U32 n = 0;
c277df42 6099 char c;
830247a4 6100 char *s = RExC_parse;
c277df42 6101
830247a4
IZ
6102 RExC_seen_zerolen++;
6103 RExC_seen |= REG_SEEN_EVAL;
6104 while (count && (c = *RExC_parse)) {
6136c704
AL
6105 if (c == '\\') {
6106 if (RExC_parse[1])
6107 RExC_parse++;
6108 }
b81d288d 6109 else if (c == '{')
c277df42 6110 count++;
b81d288d 6111 else if (c == '}')
c277df42 6112 count--;
830247a4 6113 RExC_parse++;
c277df42 6114 }
6136c704 6115 if (*RExC_parse != ')') {
b81d288d 6116 RExC_parse = s;
b45f050a
JF
6117 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6118 }
c277df42 6119 if (!SIZE_ONLY) {
f3548bdc 6120 PAD *pad;
6136c704
AL
6121 OP_4tree *sop, *rop;
6122 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6123
569233ed
SB
6124 ENTER;
6125 Perl_save_re_context(aTHX);
d59a8b3e 6126 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
6127 sop->op_private |= OPpREFCOUNTED;
6128 /* re_dup will OpREFCNT_inc */
6129 OpREFCNT_set(sop, 1);
569233ed 6130 LEAVE;
c277df42 6131
830247a4 6132 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6133 RExC_rxi->data->data[n] = (void*)rop;
6134 RExC_rxi->data->data[n+1] = (void*)sop;
6135 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6136 SvREFCNT_dec(sv);
a0ed51b3 6137 }
e24b16f9 6138 else { /* First pass */
830247a4 6139 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6140 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6141 /* No compiled RE interpolated, has runtime
6142 components ===> unsafe. */
6143 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6144 if (PL_tainting && PL_tainted)
cc6b7395 6145 FAIL("Eval-group in insecure regular expression");
54df2634 6146#if PERL_VERSION > 8
923e4eb5 6147 if (IN_PERL_COMPILETIME)
b5c19bd7 6148 PL_cv_has_eval = 1;
54df2634 6149#endif
c277df42 6150 }
b5c19bd7 6151
830247a4 6152 nextchar(pRExC_state);
6136c704 6153 if (is_logical) {
830247a4 6154 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6155 if (!SIZE_ONLY)
6156 ret->flags = 2;
3dab1dad 6157 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6158 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6159 return ret;
6160 }
ccb2c380
MP
6161 ret = reganode(pRExC_state, EVAL, n);
6162 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6163 Set_Node_Offset(ret, parse_start);
6164 return ret;
c277df42 6165 }
fac92740 6166 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6167 {
0a4db386 6168 int is_define= 0;
fac92740 6169 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6170 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6171 || RExC_parse[1] == '<'
830247a4 6172 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6173 I32 flag;
6174
830247a4 6175 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6176 if (!SIZE_ONLY)
6177 ret->flags = 1;
3dab1dad 6178 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6179 goto insert_if;
b81d288d 6180 }
a0ed51b3 6181 }
0a4db386
YO
6182 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6183 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6184 {
6185 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6186 char *name_start= RExC_parse++;
2eccd3b2 6187 U32 num = 0;
0a4db386
YO
6188 SV *sv_dat=reg_scan_name(pRExC_state,
6189 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6190 if (RExC_parse == name_start || *RExC_parse != ch)
6191 vFAIL2("Sequence (?(%c... not terminated",
6192 (ch == '>' ? '<' : ch));
6193 RExC_parse++;
6194 if (!SIZE_ONLY) {
6195 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6196 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6197 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6198 }
6199 ret = reganode(pRExC_state,NGROUPP,num);
6200 goto insert_if_check_paren;
6201 }
6202 else if (RExC_parse[0] == 'D' &&
6203 RExC_parse[1] == 'E' &&
6204 RExC_parse[2] == 'F' &&
6205 RExC_parse[3] == 'I' &&
6206 RExC_parse[4] == 'N' &&
6207 RExC_parse[5] == 'E')
6208 {
6209 ret = reganode(pRExC_state,DEFINEP,0);
6210 RExC_parse +=6 ;
6211 is_define = 1;
6212 goto insert_if_check_paren;
6213 }
6214 else if (RExC_parse[0] == 'R') {
6215 RExC_parse++;
6216 parno = 0;
6217 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6218 parno = atoi(RExC_parse++);
6219 while (isDIGIT(*RExC_parse))
6220 RExC_parse++;
6221 } else if (RExC_parse[0] == '&') {
6222 SV *sv_dat;
6223 RExC_parse++;
6224 sv_dat = reg_scan_name(pRExC_state,
6225 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6226 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6227 }
1a147d38 6228 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6229 goto insert_if_check_paren;
6230 }
830247a4 6231 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6232 /* (?(1)...) */
6136c704 6233 char c;
830247a4 6234 parno = atoi(RExC_parse++);
c277df42 6235
830247a4
IZ
6236 while (isDIGIT(*RExC_parse))
6237 RExC_parse++;
fac92740 6238 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6239
0a4db386 6240 insert_if_check_paren:
830247a4 6241 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6242 vFAIL("Switch condition not recognized");
c277df42 6243 insert_if:
3dab1dad
YO
6244 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6245 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6246 if (br == NULL)
830247a4 6247 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6248 else
3dab1dad 6249 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6250 c = *nextchar(pRExC_state);
d1b80229
IZ
6251 if (flags&HASWIDTH)
6252 *flagp |= HASWIDTH;
c277df42 6253 if (c == '|') {
0a4db386
YO
6254 if (is_define)
6255 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6256 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6257 regbranch(pRExC_state, &flags, 1,depth+1);
6258 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6259 if (flags&HASWIDTH)
6260 *flagp |= HASWIDTH;
830247a4 6261 c = *nextchar(pRExC_state);
a0ed51b3
LW
6262 }
6263 else
c277df42
IZ
6264 lastbr = NULL;
6265 if (c != ')')
8615cb43 6266 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6267 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6268 REGTAIL(pRExC_state, br, ender);
c277df42 6269 if (lastbr) {
3dab1dad
YO
6270 REGTAIL(pRExC_state, lastbr, ender);
6271 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6272 }
6273 else
3dab1dad 6274 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6275 RExC_size++; /* XXX WHY do we need this?!!
6276 For large programs it seems to be required
6277 but I can't figure out why. -- dmq*/
c277df42 6278 return ret;
a0ed51b3
LW
6279 }
6280 else {
830247a4 6281 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6282 }
6283 }
1b1626e4 6284 case 0:
830247a4 6285 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6286 vFAIL("Sequence (? incomplete");
1b1626e4 6287 break;
85508812
KW
6288 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6289 that follow */
fb85c044
KW
6290 has_use_defaults = TRUE;
6291 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7b98bc43
KW
6292 if (RExC_utf8) { /* But the default for a utf8 pattern is
6293 unicode semantics */
6294 RExC_flags |= RXf_PMf_UNICODE;
6295 }
fb85c044 6296 goto parse_flags;
a0d0e21e 6297 default:
cde0cee5
YO
6298 --RExC_parse;
6299 parse_flags: /* (?i) */
6300 {
6301 U32 posflags = 0, negflags = 0;
6302 U32 *flagsp = &posflags;
9de15fec 6303 bool has_charset_modifier = 0;
cde0cee5
YO
6304
6305 while (*RExC_parse) {
6306 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6307 /* (?g), (?gc) and (?o) are useless here
6308 and must be globally applied -- japhy */
cde0cee5
YO
6309 switch (*RExC_parse) {
6310 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6311 case LOCALE_PAT_MOD:
6312 if (has_charset_modifier || flagsp == &negflags) {
6313 goto fail_modifiers;
6314 }
8cc86590
KW
6315 posflags |= RXf_PMf_LOCALE;
6316 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6317 has_charset_modifier = 1;
6318 break;
6319 case UNICODE_PAT_MOD:
6320 if (has_charset_modifier || flagsp == &negflags) {
6321 goto fail_modifiers;
6322 }
8cc86590
KW
6323 posflags |= RXf_PMf_UNICODE;
6324 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6325 has_charset_modifier = 1;
6326 break;
6327 case DUAL_PAT_MOD:
6328 if (has_use_defaults
6329 || has_charset_modifier
6330 || flagsp == &negflags)
6331 {
6332 goto fail_modifiers;
6333 }
7b98bc43
KW
6334
6335 /* The dual charset means unicode semantics if the
6336 * pattern (or target, not known until runtime) are
6337 * utf8 */
6338 if (RExC_utf8) {
6339 posflags |= RXf_PMf_UNICODE;
6340 negflags |= RXf_PMf_LOCALE;
6341 }
6342 else {
6343 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6344 }
9de15fec
KW
6345 has_charset_modifier = 1;
6346 break;
f7819f85
A
6347 case ONCE_PAT_MOD: /* 'o' */
6348 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6349 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6350 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6351 if (! (wastedflags & wflagbit) ) {
6352 wastedflags |= wflagbit;
6353 vWARN5(
6354 RExC_parse + 1,
6355 "Useless (%s%c) - %suse /%c modifier",
6356 flagsp == &negflags ? "?-" : "?",
6357 *RExC_parse,
6358 flagsp == &negflags ? "don't " : "",
6359 *RExC_parse
6360 );
6361 }
6362 }
cde0cee5
YO
6363 break;
6364
f7819f85 6365 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6366 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6367 if (! (wastedflags & WASTED_C) ) {
6368 wastedflags |= WASTED_GC;
9d1d55b5
JP
6369 vWARN3(
6370 RExC_parse + 1,
6371 "Useless (%sc) - %suse /gc modifier",
6372 flagsp == &negflags ? "?-" : "?",
6373 flagsp == &negflags ? "don't " : ""
6374 );
6375 }
6376 }
cde0cee5 6377 break;
f7819f85 6378 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6379 if (flagsp == &negflags) {
668c081a
NC
6380 if (SIZE_ONLY)
6381 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6382 } else {
6383 *flagsp |= RXf_PMf_KEEPCOPY;
6384 }
6385 break;
6386 case '-':
3b753521 6387 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6388 * if there is a minus, it means will be trying to
6389 * re-specify a default which is an error */
6390 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6391 fail_modifiers:
57b84237
YO
6392 RExC_parse++;
6393 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6394 /*NOTREACHED*/
6395 }
cde0cee5
YO
6396 flagsp = &negflags;
6397 wastedflags = 0; /* reset so (?g-c) warns twice */
6398 break;
6399 case ':':
6400 paren = ':';
6401 /*FALLTHROUGH*/
6402 case ')':
6403 RExC_flags |= posflags;
6404 RExC_flags &= ~negflags;
f7819f85
A
6405 if (paren != ':') {
6406 oregflags |= posflags;
6407 oregflags &= ~negflags;
6408 }
cde0cee5
YO
6409 nextchar(pRExC_state);
6410 if (paren != ':') {
6411 *flagp = TRYAGAIN;
6412 return NULL;
6413 } else {
6414 ret = NULL;
6415 goto parse_rest;
6416 }
6417 /*NOTREACHED*/
6418 default:
cde0cee5
YO
6419 RExC_parse++;
6420 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6421 /*NOTREACHED*/
6422 }
830247a4 6423 ++RExC_parse;
48c036b1 6424 }
cde0cee5 6425 }} /* one for the default block, one for the switch */
a0d0e21e 6426 }
fac92740 6427 else { /* (...) */
81714fb9 6428 capturing_parens:
830247a4
IZ
6429 parno = RExC_npar;
6430 RExC_npar++;
e2e6a0f1 6431
830247a4 6432 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6433 if (!SIZE_ONLY ){
6434 if (!RExC_nestroot)
6435 RExC_nestroot = parno;
c009da3d
YO
6436 if (RExC_seen & REG_SEEN_RECURSE
6437 && !RExC_open_parens[parno-1])
6438 {
e2e6a0f1 6439 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6440 "Setting open paren #%"IVdf" to %d\n",
6441 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6442 RExC_open_parens[parno-1]= ret;
6443 }
6bda09f9 6444 }
fac92740
MJD
6445 Set_Node_Length(ret, 1); /* MJD */
6446 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6447 is_open = 1;
a0d0e21e 6448 }
a0ed51b3 6449 }
fac92740 6450 else /* ! paren */
a0d0e21e 6451 ret = NULL;
cde0cee5
YO
6452
6453 parse_rest:
a0d0e21e 6454 /* Pick up the branches, linking them together. */
fac92740 6455 parse_start = RExC_parse; /* MJD */
3dab1dad 6456 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6457
6458 if (freeze_paren) {
6459 if (RExC_npar > after_freeze)
6460 after_freeze = RExC_npar;
6461 RExC_npar = freeze_paren;
6462 }
6463
fac92740 6464 /* branch_len = (paren != 0); */
2af232bd 6465
a0d0e21e
LW
6466 if (br == NULL)
6467 return(NULL);
830247a4
IZ
6468 if (*RExC_parse == '|') {
6469 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6470 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6471 }
fac92740 6472 else { /* MJD */
6bda09f9 6473 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6474 Set_Node_Length(br, paren != 0);
6475 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6476 }
c277df42
IZ
6477 have_branch = 1;
6478 if (SIZE_ONLY)
830247a4 6479 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6480 }
6481 else if (paren == ':') {
c277df42
IZ
6482 *flagp |= flags&SIMPLE;
6483 }
6136c704 6484 if (is_open) { /* Starts with OPEN. */
3dab1dad 6485 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6486 }
6487 else if (paren != '?') /* Not Conditional */
a0d0e21e 6488 ret = br;
8ae10a67 6489 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6490 lastbr = br;
830247a4
IZ
6491 while (*RExC_parse == '|') {
6492 if (!SIZE_ONLY && RExC_extralen) {
6493 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6494 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6495 }
6496 if (SIZE_ONLY)
830247a4
IZ
6497 RExC_extralen += 2; /* Account for LONGJMP. */
6498 nextchar(pRExC_state);
594d7033
YO
6499 if (freeze_paren) {
6500 if (RExC_npar > after_freeze)
6501 after_freeze = RExC_npar;
6502 RExC_npar = freeze_paren;
6503 }
3dab1dad 6504 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6505
a687059c 6506 if (br == NULL)
a0d0e21e 6507 return(NULL);
3dab1dad 6508 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6509 lastbr = br;
8ae10a67 6510 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6511 }
6512
c277df42
IZ
6513 if (have_branch || paren != ':') {
6514 /* Make a closing node, and hook it on the end. */
6515 switch (paren) {
6516 case ':':
830247a4 6517 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6518 break;
6519 case 1:
830247a4 6520 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6521 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6522 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6523 "Setting close paren #%"IVdf" to %d\n",
6524 (IV)parno, REG_NODE_NUM(ender)));
6525 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6526 if (RExC_nestroot == parno)
6527 RExC_nestroot = 0;
40d049e4 6528 }
fac92740
MJD
6529 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6530 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6531 break;
6532 case '<':
c277df42
IZ
6533 case ',':
6534 case '=':
6535 case '!':
c277df42 6536 *flagp &= ~HASWIDTH;
821b33a5
IZ
6537 /* FALL THROUGH */
6538 case '>':
830247a4 6539 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6540 break;
6541 case 0:
830247a4 6542 ender = reg_node(pRExC_state, END);
40d049e4
YO
6543 if (!SIZE_ONLY) {
6544 assert(!RExC_opend); /* there can only be one! */
6545 RExC_opend = ender;
6546 }
c277df42
IZ
6547 break;
6548 }
eaf3ca90 6549 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6550
9674d46a 6551 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6552 if (depth==1)
6553 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6554
c277df42 6555 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6556 for (br = ret; br; br = regnext(br)) {
6557 const U8 op = PL_regkind[OP(br)];
6558 if (op == BRANCH) {
07be1b83 6559 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6560 }
6561 else if (op == BRANCHJ) {
07be1b83 6562 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6563 }
c277df42
IZ
6564 }
6565 }
a0d0e21e 6566 }
c277df42
IZ
6567
6568 {
e1ec3a88
AL
6569 const char *p;
6570 static const char parens[] = "=!<,>";
c277df42
IZ
6571
6572 if (paren && (p = strchr(parens, paren))) {
eb160463 6573 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6574 int flag = (p - parens) > 1;
6575
6576 if (paren == '>')
6577 node = SUSPEND, flag = 0;
6bda09f9 6578 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6579 Set_Node_Cur_Length(ret);
6580 Set_Node_Offset(ret, parse_start + 1);
c277df42 6581 ret->flags = flag;
07be1b83 6582 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6583 }
a0d0e21e
LW
6584 }
6585
6586 /* Check for proper termination. */
ce3e6498 6587 if (paren) {
e2509266 6588 RExC_flags = oregflags;
830247a4
IZ
6589 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6590 RExC_parse = oregcomp_parse;
380a0633 6591 vFAIL("Unmatched (");
ce3e6498 6592 }
a0ed51b3 6593 }
830247a4
IZ
6594 else if (!paren && RExC_parse < RExC_end) {
6595 if (*RExC_parse == ')') {
6596 RExC_parse++;
380a0633 6597 vFAIL("Unmatched )");
a0ed51b3
LW
6598 }
6599 else
b45f050a 6600 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6601 /* NOTREACHED */
6602 }
594d7033
YO
6603 if (after_freeze)
6604 RExC_npar = after_freeze;
a0d0e21e 6605 return(ret);
a687059c
LW
6606}
6607
6608/*
6609 - regbranch - one alternative of an | operator
6610 *
6611 * Implements the concatenation operator.
6612 */
76e3520e 6613STATIC regnode *
3dab1dad 6614S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6615{
97aff369 6616 dVAR;
c277df42
IZ
6617 register regnode *ret;
6618 register regnode *chain = NULL;
6619 register regnode *latest;
6620 I32 flags = 0, c = 0;
3dab1dad 6621 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6622
6623 PERL_ARGS_ASSERT_REGBRANCH;
6624
3dab1dad 6625 DEBUG_PARSE("brnc");
02daf0ab 6626
b81d288d 6627 if (first)
c277df42
IZ
6628 ret = NULL;
6629 else {
b81d288d 6630 if (!SIZE_ONLY && RExC_extralen)
830247a4 6631 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6632 else {
830247a4 6633 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6634 Set_Node_Length(ret, 1);
6635 }
c277df42
IZ
6636 }
6637
b81d288d 6638 if (!first && SIZE_ONLY)
830247a4 6639 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6640
c277df42 6641 *flagp = WORST; /* Tentatively. */
a0d0e21e 6642
830247a4
IZ
6643 RExC_parse--;
6644 nextchar(pRExC_state);
6645 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6646 flags &= ~TRYAGAIN;
3dab1dad 6647 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6648 if (latest == NULL) {
6649 if (flags & TRYAGAIN)
6650 continue;
6651 return(NULL);
a0ed51b3
LW
6652 }
6653 else if (ret == NULL)
c277df42 6654 ret = latest;
8ae10a67 6655 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6656 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6657 *flagp |= flags&SPSTART;
6658 else {
830247a4 6659 RExC_naughty++;
3dab1dad 6660 REGTAIL(pRExC_state, chain, latest);
a687059c 6661 }
a0d0e21e 6662 chain = latest;
c277df42
IZ
6663 c++;
6664 }
6665 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6666 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6667 if (ret == NULL)
6668 ret = chain;
6669 }
6670 if (c == 1) {
6671 *flagp |= flags&SIMPLE;
a0d0e21e 6672 }
a687059c 6673
d4c19fe8 6674 return ret;
a687059c
LW
6675}
6676
6677/*
6678 - regpiece - something followed by possible [*+?]
6679 *
6680 * Note that the branching code sequences used for ? and the general cases
6681 * of * and + are somewhat optimized: they use the same NOTHING node as
6682 * both the endmarker for their branch list and the body of the last branch.
6683 * It might seem that this node could be dispensed with entirely, but the
6684 * endmarker role is not redundant.
6685 */
76e3520e 6686STATIC regnode *
3dab1dad 6687S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6688{
97aff369 6689 dVAR;
c277df42 6690 register regnode *ret;
a0d0e21e
LW
6691 register char op;
6692 register char *next;
6693 I32 flags;
1df70142 6694 const char * const origparse = RExC_parse;
a0d0e21e 6695 I32 min;
c277df42 6696 I32 max = REG_INFTY;
fac92740 6697 char *parse_start;
10edeb5d 6698 const char *maxpos = NULL;
3dab1dad 6699 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6700
6701 PERL_ARGS_ASSERT_REGPIECE;
6702
3dab1dad 6703 DEBUG_PARSE("piec");
a0d0e21e 6704
3dab1dad 6705 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6706 if (ret == NULL) {
6707 if (flags & TRYAGAIN)
6708 *flagp |= TRYAGAIN;
6709 return(NULL);
6710 }
6711
830247a4 6712 op = *RExC_parse;
a0d0e21e 6713
830247a4 6714 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6715 maxpos = NULL;
fac92740 6716 parse_start = RExC_parse; /* MJD */
830247a4 6717 next = RExC_parse + 1;
a0d0e21e
LW
6718 while (isDIGIT(*next) || *next == ',') {
6719 if (*next == ',') {
6720 if (maxpos)
6721 break;
6722 else
6723 maxpos = next;
a687059c 6724 }
a0d0e21e
LW
6725 next++;
6726 }
6727 if (*next == '}') { /* got one */
6728 if (!maxpos)
6729 maxpos = next;
830247a4
IZ
6730 RExC_parse++;
6731 min = atoi(RExC_parse);
a0d0e21e
LW
6732 if (*maxpos == ',')
6733 maxpos++;
6734 else
830247a4 6735 maxpos = RExC_parse;
a0d0e21e
LW
6736 max = atoi(maxpos);
6737 if (!max && *maxpos != '0')
c277df42
IZ
6738 max = REG_INFTY; /* meaning "infinity" */
6739 else if (max >= REG_INFTY)
8615cb43 6740 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6741 RExC_parse = next;
6742 nextchar(pRExC_state);
a0d0e21e
LW
6743
6744 do_curly:
6745 if ((flags&SIMPLE)) {
830247a4 6746 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6747 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6748 Set_Node_Offset(ret, parse_start+1); /* MJD */
6749 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6750 }
6751 else {
3dab1dad 6752 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6753
6754 w->flags = 0;
3dab1dad 6755 REGTAIL(pRExC_state, ret, w);
830247a4 6756 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6757 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6758 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6759 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6760 }
6bda09f9 6761 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6762 /* MJD hk */
6763 Set_Node_Offset(ret, parse_start+1);
2af232bd 6764 Set_Node_Length(ret,
fac92740 6765 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6766
830247a4 6767 if (!SIZE_ONLY && RExC_extralen)
c277df42 6768 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6769 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6770 if (SIZE_ONLY)
830247a4
IZ
6771 RExC_whilem_seen++, RExC_extralen += 3;
6772 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6773 }
c277df42 6774 ret->flags = 0;
a0d0e21e
LW
6775
6776 if (min > 0)
821b33a5
IZ
6777 *flagp = WORST;
6778 if (max > 0)
6779 *flagp |= HASWIDTH;
8fa23287 6780 if (max < min)
8615cb43 6781 vFAIL("Can't do {n,m} with n > m");
c277df42 6782 if (!SIZE_ONLY) {
eb160463
GS
6783 ARG1_SET(ret, (U16)min);
6784 ARG2_SET(ret, (U16)max);
a687059c 6785 }
a687059c 6786
a0d0e21e 6787 goto nest_check;
a687059c 6788 }
a0d0e21e 6789 }
a687059c 6790
a0d0e21e
LW
6791 if (!ISMULT1(op)) {
6792 *flagp = flags;
a687059c 6793 return(ret);
a0d0e21e 6794 }
bb20fd44 6795
c277df42 6796#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6797
6798 /* if this is reinstated, don't forget to put this back into perldiag:
6799
6800 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6801
6802 (F) The part of the regexp subject to either the * or + quantifier
6803 could match an empty string. The {#} shows in the regular
6804 expression about where the problem was discovered.
6805
6806 */
6807
bb20fd44 6808 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6809 vFAIL("Regexp *+ operand could be empty");
b81d288d 6810#endif
bb20fd44 6811
fac92740 6812 parse_start = RExC_parse;
830247a4 6813 nextchar(pRExC_state);
a0d0e21e 6814
821b33a5 6815 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6816
6817 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6818 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6819 ret->flags = 0;
830247a4 6820 RExC_naughty += 4;
a0d0e21e
LW
6821 }
6822 else if (op == '*') {
6823 min = 0;
6824 goto do_curly;
a0ed51b3
LW
6825 }
6826 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6827 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6828 ret->flags = 0;
830247a4 6829 RExC_naughty += 3;
a0d0e21e
LW
6830 }
6831 else if (op == '+') {
6832 min = 1;
6833 goto do_curly;
a0ed51b3
LW
6834 }
6835 else if (op == '?') {
a0d0e21e
LW
6836 min = 0; max = 1;
6837 goto do_curly;
6838 }
6839 nest_check:
668c081a
NC
6840 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6841 ckWARN3reg(RExC_parse,
6842 "%.*s matches null string many times",
6843 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6844 origparse);
a0d0e21e
LW
6845 }
6846
b9b4dddf 6847 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6848 nextchar(pRExC_state);
6bda09f9 6849 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6850 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6851 }
b9b4dddf
YO
6852#ifndef REG_ALLOW_MINMOD_SUSPEND
6853 else
6854#endif
6855 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6856 regnode *ender;
6857 nextchar(pRExC_state);
6858 ender = reg_node(pRExC_state, SUCCEED);
6859 REGTAIL(pRExC_state, ret, ender);
6860 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6861 ret->flags = 0;
6862 ender = reg_node(pRExC_state, TAIL);
6863 REGTAIL(pRExC_state, ret, ender);
6864 /*ret= ender;*/
6865 }
6866
6867 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6868 RExC_parse++;
b45f050a
JF
6869 vFAIL("Nested quantifiers");
6870 }
a0d0e21e
LW
6871
6872 return(ret);
a687059c
LW
6873}
6874
fc8cd66c
YO
6875
6876/* reg_namedseq(pRExC_state,UVp)
6877
6878 This is expected to be called by a parser routine that has
afefe6bf 6879 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6880 expected to point at the first char following the N at the time
6881 of the call.
ff3f963a
KW
6882
6883 The \N may be inside (indicated by valuep not being NULL) or outside a
6884 character class.
6885
6886 \N may begin either a named sequence, or if outside a character class, mean
6887 to match a non-newline. For non single-quoted regexes, the tokenizer has
6888 attempted to decide which, and in the case of a named sequence converted it
6889 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6890 where c1... are the characters in the sequence. For single-quoted regexes,
6891 the tokenizer passes the \N sequence through unchanged; this code will not
6892 attempt to determine this nor expand those. The net effect is that if the
6893 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6894 signals that this \N occurrence means to match a non-newline.
6895
6896 Only the \N{U+...} form should occur in a character class, for the same
6897 reason that '.' inside a character class means to just match a period: it
6898 just doesn't make sense.
fc8cd66c
YO
6899
6900 If valuep is non-null then it is assumed that we are parsing inside
6901 of a charclass definition and the first codepoint in the resolved
6902 string is returned via *valuep and the routine will return NULL.
6903 In this mode if a multichar string is returned from the charnames
ff3f963a 6904 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6905 sequence will be examined. If the string returned is zero length
6906 then the value of *valuep is undefined and NON-NULL will
6907 be returned to indicate failure. (This will NOT be a valid pointer
6908 to a regnode.)
6909
ff3f963a
KW
6910 If valuep is null then it is assumed that we are parsing normal text and a
6911 new EXACT node is inserted into the program containing the resolved string,
6912 and a pointer to the new node is returned. But if the string is zero length
6913 a NOTHING node is emitted instead.
afefe6bf 6914
fc8cd66c 6915 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6916 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6917 */
6918STATIC regnode *
afefe6bf 6919S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6920{
c3c41406 6921 char * endbrace; /* '}' following the name */
fc8cd66c 6922 regnode *ret = NULL;
ff3f963a
KW
6923#ifdef DEBUGGING
6924 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6925#endif
c3c41406 6926 char* p;
ff3f963a
KW
6927
6928 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6929
6930 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6931
6932 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6933
6934 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6935 * modifier. The other meaning does not */
6936 p = (RExC_flags & RXf_PMf_EXTENDED)
6937 ? regwhite( pRExC_state, RExC_parse )
6938 : RExC_parse;
7918f24d 6939
ff3f963a 6940 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6941 * [^\n]. The former is assumed when it can't be the latter. */
6942 if (*p != '{' || regcurly(p)) {
6943 RExC_parse = p;
ff3f963a 6944 if (valuep) {
afefe6bf 6945 /* no bare \N in a charclass */
ff3f963a
KW
6946 vFAIL("\\N in a character class must be a named character: \\N{...}");
6947 }
afefe6bf
RGS
6948 nextchar(pRExC_state);
6949 ret = reg_node(pRExC_state, REG_ANY);
6950 *flagp |= HASWIDTH|SIMPLE;
6951 RExC_naughty++;
6952 RExC_parse--;
6953 Set_Node_Length(ret, 1); /* MJD */
6954 return ret;
fc8cd66c 6955 }
a4893424 6956
c3c41406
KW
6957 /* Here, we have decided it should be a named sequence */
6958
6959 /* The test above made sure that the next real character is a '{', but
6960 * under the /x modifier, it could be separated by space (or a comment and
6961 * \n) and this is not allowed (for consistency with \x{...} and the
6962 * tokenizer handling of \N{NAME}). */
6963 if (*RExC_parse != '{') {
6964 vFAIL("Missing braces on \\N{}");
6965 }
6966
ff3f963a 6967 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6968
6969 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6970 || ! (endbrace == RExC_parse /* nothing between the {} */
6971 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6972 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6973 {
6974 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6975 vFAIL("\\N{NAME} must be resolved by the lexer");
6976 }
6977
ff3f963a
KW
6978 if (endbrace == RExC_parse) { /* empty: \N{} */
6979 if (! valuep) {
6980 RExC_parse = endbrace + 1;
6981 return reg_node(pRExC_state,NOTHING);
a4893424 6982 }
fc8cd66c 6983
ff3f963a
KW
6984 if (SIZE_ONLY) {
6985 ckWARNreg(RExC_parse,
6986 "Ignoring zero length \\N{} in character class"
6987 );
6988 RExC_parse = endbrace + 1;
6989 }
6990 *valuep = 0;
6991 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6992 }
ff3f963a 6993
62fed28b 6994 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6995 RExC_parse += 2; /* Skip past the 'U+' */
6996
6997 if (valuep) { /* In a bracketed char class */
6998 /* We only pay attention to the first char of
6999 multichar strings being returned. I kinda wonder
7000 if this makes sense as it does change the behaviour
7001 from earlier versions, OTOH that behaviour was broken
7002 as well. XXX Solution is to recharacterize as
7003 [rest-of-class]|multi1|multi2... */
7004
7005 STRLEN length_of_hex;
7006 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7007 | PERL_SCAN_DISALLOW_PREFIX
7008 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7009
37820adc
KW
7010 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7011 if (endchar < endbrace) {
ff3f963a
KW
7012 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7013 }
ff3f963a
KW
7014
7015 length_of_hex = (STRLEN)(endchar - RExC_parse);
7016 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7017
7018 /* The tokenizer should have guaranteed validity, but it's possible to
7019 * bypass it by using single quoting, so check */
c3c41406
KW
7020 if (length_of_hex == 0
7021 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7022 {
7023 RExC_parse += length_of_hex; /* Includes all the valid */
7024 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7025 ? UTF8SKIP(RExC_parse)
7026 : 1;
7027 /* Guard against malformed utf8 */
7028 if (RExC_parse >= endchar) RExC_parse = endchar;
7029 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7030 }
7031
7032 RExC_parse = endbrace + 1;
7033 if (endchar == endbrace) return NULL;
7034
7035 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 7036 }
ff3f963a
KW
7037 else { /* Not a char class */
7038 char *s; /* String to put in generated EXACT node */
fda99bee 7039 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
7040 char *endchar; /* Points to '.' or '}' ending cur char in the input
7041 stream */
7042
2c2b7f86
KW
7043 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7044 : (LOC)
7045 ? EXACTFL
7046 : UNI_SEMANTICS
7047 ? EXACTFU
7048 : EXACTF));
ff3f963a
KW
7049 s= STRING(ret);
7050
7051 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7052 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 7053 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
7054 * limit are dropped. The limit could be relaxed should it become
7055 * desirable by reparsing this as (?:\N{NAME}), so could generate
7056 * multiple EXACT nodes, as is done for just regular input. But this
7057 * is primarily a named character, and not intended to be a huge long
7058 * string, so 255 bytes should be good enough */
7059 while (1) {
c3c41406 7060 STRLEN length_of_hex;
ff3f963a
KW
7061 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7062 | PERL_SCAN_DISALLOW_PREFIX
7063 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7064 UV cp; /* Ord of current character */
7065
7066 /* Code points are separated by dots. If none, there is only one
7067 * code point, and is terminated by the brace */
37820adc 7068 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7069
7070 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7071 length_of_hex = (STRLEN)(endchar - RExC_parse);
7072 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7073 if ( length_of_hex == 0
7074 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7075 {
c3c41406
KW
7076 RExC_parse += length_of_hex; /* Includes all the valid */
7077 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7078 ? UTF8SKIP(RExC_parse)
7079 : 1;
7080 /* Guard against malformed utf8 */
7081 if (RExC_parse >= endchar) RExC_parse = endchar;
7082 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7083 }
7084
7085 if (! FOLD) { /* Not folding, just append to the string */
7086 STRLEN unilen;
7087
7088 /* Quit before adding this character if would exceed limit */
7089 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7090
ff3f963a
KW
7091 unilen = reguni(pRExC_state, cp, s);
7092 if (unilen > 0) {
7093 s += unilen;
7094 len += unilen;
7095 }
7096 } else { /* Folding, output the folded equivalent */
7097 STRLEN foldlen,numlen;
7098 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7099 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7100
7101 /* Quit before exceeding size limit */
7102 if (len + foldlen > U8_MAX) break;
7103
7104 for (foldbuf = tmpbuf;
7105 foldlen;
7106 foldlen -= numlen)
7107 {
7108 cp = utf8_to_uvchr(foldbuf, &numlen);
7109 if (numlen > 0) {
7110 const STRLEN unilen = reguni(pRExC_state, cp, s);
7111 s += unilen;
7112 len += unilen;
7113 /* In EBCDIC the numlen and unilen can differ. */
7114 foldbuf += numlen;
7115 if (numlen >= foldlen)
7116 break;
7117 }
7118 else
7119 break; /* "Can't happen." */
7120 }
7121 }
7122
7123 /* Point to the beginning of the next character in the sequence. */
7124 RExC_parse = endchar + 1;
7125
7126 /* Quit if no more characters */
7127 if (RExC_parse >= endbrace) break;
7128 }
7129
7130
7131 if (SIZE_ONLY) {
7132 if (RExC_parse < endbrace) {
7133 ckWARNreg(RExC_parse - 1,
7134 "Using just the first characters returned by \\N{}");
7135 }
7136
7137 RExC_size += STR_SZ(len);
7138 } else {
7139 STR_LEN(ret) = len;
7140 RExC_emit += STR_SZ(len);
7141 }
7142
7143 RExC_parse = endbrace + 1;
7144
7145 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7146 with malformed in t/re/pat_advanced.t */
7147 RExC_parse --;
7148 Set_Node_Cur_Length(ret); /* MJD */
7149 nextchar(pRExC_state);
7150 }
7151
7152 return ret;
fc8cd66c
YO
7153}
7154
7155
9e08bc66
TS
7156/*
7157 * reg_recode
7158 *
7159 * It returns the code point in utf8 for the value in *encp.
7160 * value: a code value in the source encoding
7161 * encp: a pointer to an Encode object
7162 *
7163 * If the result from Encode is not a single character,
7164 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7165 */
7166STATIC UV
7167S_reg_recode(pTHX_ const char value, SV **encp)
7168{
7169 STRLEN numlen = 1;
59cd0e26 7170 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7171 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7172 const STRLEN newlen = SvCUR(sv);
7173 UV uv = UNICODE_REPLACEMENT;
7174
7918f24d
NC
7175 PERL_ARGS_ASSERT_REG_RECODE;
7176
9e08bc66
TS
7177 if (newlen)
7178 uv = SvUTF8(sv)
7179 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7180 : *(U8*)s;
7181
7182 if (!newlen || numlen != newlen) {
7183 uv = UNICODE_REPLACEMENT;
c86f7df5 7184 *encp = NULL;
9e08bc66
TS
7185 }
7186 return uv;
7187}
7188
fc8cd66c 7189
a687059c
LW
7190/*
7191 - regatom - the lowest level
ee9b8eae
YO
7192
7193 Try to identify anything special at the start of the pattern. If there
7194 is, then handle it as required. This may involve generating a single regop,
7195 such as for an assertion; or it may involve recursing, such as to
7196 handle a () structure.
7197
7198 If the string doesn't start with something special then we gobble up
7199 as much literal text as we can.
7200
7201 Once we have been able to handle whatever type of thing started the
7202 sequence, we return.
7203
7204 Note: we have to be careful with escapes, as they can be both literal
7205 and special, and in the case of \10 and friends can either, depending
486ec47a 7206 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
7207 escape sequences, with the one for handling literal escapes requiring
7208 a dummy entry for all of the special escapes that are actually handled
7209 by the other.
7210*/
7211
76e3520e 7212STATIC regnode *
3dab1dad 7213S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7214{
97aff369 7215 dVAR;
cbbf8932 7216 register regnode *ret = NULL;
a0d0e21e 7217 I32 flags;
45948336 7218 char *parse_start = RExC_parse;
3dab1dad
YO
7219 GET_RE_DEBUG_FLAGS_DECL;
7220 DEBUG_PARSE("atom");
a0d0e21e
LW
7221 *flagp = WORST; /* Tentatively. */
7222
7918f24d 7223 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7224
a0d0e21e 7225tryagain:
f9a79580 7226 switch ((U8)*RExC_parse) {
a0d0e21e 7227 case '^':
830247a4
IZ
7228 RExC_seen_zerolen++;
7229 nextchar(pRExC_state);
bbe252da 7230 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7231 ret = reg_node(pRExC_state, MBOL);
bbe252da 7232 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7233 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7234 else
830247a4 7235 ret = reg_node(pRExC_state, BOL);
fac92740 7236 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7237 break;
7238 case '$':
830247a4 7239 nextchar(pRExC_state);
b81d288d 7240 if (*RExC_parse)
830247a4 7241 RExC_seen_zerolen++;
bbe252da 7242 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7243 ret = reg_node(pRExC_state, MEOL);
bbe252da 7244 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7245 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7246 else
830247a4 7247 ret = reg_node(pRExC_state, EOL);
fac92740 7248 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7249 break;
7250 case '.':
830247a4 7251 nextchar(pRExC_state);
bbe252da 7252 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7253 ret = reg_node(pRExC_state, SANY);
7254 else
7255 ret = reg_node(pRExC_state, REG_ANY);
7256 *flagp |= HASWIDTH|SIMPLE;
830247a4 7257 RExC_naughty++;
fac92740 7258 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7259 break;
7260 case '[':
b45f050a 7261 {
3dab1dad
YO
7262 char * const oregcomp_parse = ++RExC_parse;
7263 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7264 if (*RExC_parse != ']') {
7265 RExC_parse = oregcomp_parse;
b45f050a
JF
7266 vFAIL("Unmatched [");
7267 }
830247a4 7268 nextchar(pRExC_state);
a0d0e21e 7269 *flagp |= HASWIDTH|SIMPLE;
fac92740 7270 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7271 break;
b45f050a 7272 }
a0d0e21e 7273 case '(':
830247a4 7274 nextchar(pRExC_state);
3dab1dad 7275 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7276 if (ret == NULL) {
bf93d4cc 7277 if (flags & TRYAGAIN) {
830247a4 7278 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7279 /* Make parent create an empty node if needed. */
7280 *flagp |= TRYAGAIN;
7281 return(NULL);
7282 }
a0d0e21e 7283 goto tryagain;
bf93d4cc 7284 }
a0d0e21e
LW
7285 return(NULL);
7286 }
a3b492c3 7287 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7288 break;
7289 case '|':
7290 case ')':
7291 if (flags & TRYAGAIN) {
7292 *flagp |= TRYAGAIN;
7293 return NULL;
7294 }
b45f050a 7295 vFAIL("Internal urp");
a0d0e21e
LW
7296 /* Supposed to be caught earlier. */
7297 break;
85afd4ae 7298 case '{':
830247a4
IZ
7299 if (!regcurly(RExC_parse)) {
7300 RExC_parse++;
85afd4ae
CS
7301 goto defchar;
7302 }
7303 /* FALL THROUGH */
a0d0e21e
LW
7304 case '?':
7305 case '+':
7306 case '*':
830247a4 7307 RExC_parse++;
b45f050a 7308 vFAIL("Quantifier follows nothing");
a0d0e21e 7309 break;
ced7f090
KW
7310 case LATIN_SMALL_LETTER_SHARP_S:
7311 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7312 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7313#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7314#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7315 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7316#endif
a0a388a1 7317 do_foldchar:
56d400ed 7318 if (!LOC && FOLD) {
e64b1bd1 7319 U32 len,cp;
7cf3a6a3 7320 len=0; /* silence a spurious compiler warning */
56d400ed 7321 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7322 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7323 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7324 ret = reganode(pRExC_state, FOLDCHAR, cp);
7325 Set_Node_Length(ret, 1); /* MJD */
7326 nextchar(pRExC_state); /* kill whitespace under /x */
7327 return ret;
7328 }
7329 }
7330 goto outer_default;
a0d0e21e 7331 case '\\':
ee9b8eae
YO
7332 /* Special Escapes
7333
7334 This switch handles escape sequences that resolve to some kind
7335 of special regop and not to literal text. Escape sequnces that
7336 resolve to literal text are handled below in the switch marked
7337 "Literal Escapes".
7338
7339 Every entry in this switch *must* have a corresponding entry
7340 in the literal escape switch. However, the opposite is not
7341 required, as the default for this switch is to jump to the
7342 literal text handling code.
7343 */
a0a388a1 7344 switch ((U8)*++RExC_parse) {
ced7f090
KW
7345 case LATIN_SMALL_LETTER_SHARP_S:
7346 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7347 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1 7348 goto do_foldchar;
ee9b8eae 7349 /* Special Escapes */
a0d0e21e 7350 case 'A':
830247a4
IZ
7351 RExC_seen_zerolen++;
7352 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7353 *flagp |= SIMPLE;
ee9b8eae 7354 goto finish_meta_pat;
a0d0e21e 7355 case 'G':
830247a4
IZ
7356 ret = reg_node(pRExC_state, GPOS);
7357 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7358 *flagp |= SIMPLE;
ee9b8eae
YO
7359 goto finish_meta_pat;
7360 case 'K':
7361 RExC_seen_zerolen++;
7362 ret = reg_node(pRExC_state, KEEPS);
7363 *flagp |= SIMPLE;
37923168
RGS
7364 /* XXX:dmq : disabling in-place substitution seems to
7365 * be necessary here to avoid cases of memory corruption, as
7366 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7367 */
7368 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7369 goto finish_meta_pat;
a0d0e21e 7370 case 'Z':
830247a4 7371 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7372 *flagp |= SIMPLE;
a1917ab9 7373 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7374 goto finish_meta_pat;
b85d18e9 7375 case 'z':
830247a4 7376 ret = reg_node(pRExC_state, EOS);
b85d18e9 7377 *flagp |= SIMPLE;
830247a4 7378 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7379 goto finish_meta_pat;
4a2d328f 7380 case 'C':
f33976b4
DB
7381 ret = reg_node(pRExC_state, CANY);
7382 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7383 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7384 goto finish_meta_pat;
a0ed51b3 7385 case 'X':
830247a4 7386 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7387 *flagp |= HASWIDTH;
ee9b8eae 7388 goto finish_meta_pat;
a0d0e21e 7389 case 'w':
a12cf05f
KW
7390 if (LOC) {
7391 ret = reg_node(pRExC_state, (U8)(ALNUML));
7392 } else {
7393 ret = reg_node(pRExC_state, (U8)(ALNUM));
7394 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7395 }
a0d0e21e 7396 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7397 goto finish_meta_pat;
a0d0e21e 7398 case 'W':
a12cf05f
KW
7399 if (LOC) {
7400 ret = reg_node(pRExC_state, (U8)(NALNUML));
7401 } else {
7402 ret = reg_node(pRExC_state, (U8)(NALNUM));
7403 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7404 }
a0d0e21e 7405 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7406 goto finish_meta_pat;
a0d0e21e 7407 case 'b':
830247a4
IZ
7408 RExC_seen_zerolen++;
7409 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7410 if (LOC) {
7411 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7412 } else {
7413 ret = reg_node(pRExC_state, (U8)(BOUND));
7414 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7415 }
a0d0e21e 7416 *flagp |= SIMPLE;
ee9b8eae 7417 goto finish_meta_pat;
a0d0e21e 7418 case 'B':
830247a4
IZ
7419 RExC_seen_zerolen++;
7420 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7421 if (LOC) {
7422 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7423 } else {
7424 ret = reg_node(pRExC_state, (U8)(NBOUND));
7425 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7426 }
a0d0e21e 7427 *flagp |= SIMPLE;
ee9b8eae 7428 goto finish_meta_pat;
a0d0e21e 7429 case 's':
a12cf05f
KW
7430 if (LOC) {
7431 ret = reg_node(pRExC_state, (U8)(SPACEL));
7432 } else {
7433 ret = reg_node(pRExC_state, (U8)(SPACE));
7434 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7435 }
a0d0e21e 7436 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7437 goto finish_meta_pat;
a0d0e21e 7438 case 'S':
a12cf05f
KW
7439 if (LOC) {
7440 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7441 } else {
7442 ret = reg_node(pRExC_state, (U8)(NSPACE));
7443 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7444 }
a0d0e21e 7445 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7446 goto finish_meta_pat;
a0d0e21e 7447 case 'd':
6ab9ea91
KW
7448 if (LOC) {
7449 ret = reg_node(pRExC_state, (U8)(DIGITL));
7450 } else {
7451 ret = reg_node(pRExC_state, (U8)(DIGIT));
7452 }
a0d0e21e 7453 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7454 goto finish_meta_pat;
a0d0e21e 7455 case 'D':
6ab9ea91
KW
7456 if (LOC) {
7457 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7458 } else {
7459 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7460 }
a0d0e21e 7461 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7462 goto finish_meta_pat;
e1d1eefb
YO
7463 case 'R':
7464 ret = reg_node(pRExC_state, LNBREAK);
7465 *flagp |= HASWIDTH|SIMPLE;
7466 goto finish_meta_pat;
7467 case 'h':
7468 ret = reg_node(pRExC_state, HORIZWS);
7469 *flagp |= HASWIDTH|SIMPLE;
7470 goto finish_meta_pat;
7471 case 'H':
7472 ret = reg_node(pRExC_state, NHORIZWS);
7473 *flagp |= HASWIDTH|SIMPLE;
7474 goto finish_meta_pat;
ee9b8eae 7475 case 'v':
e1d1eefb
YO
7476 ret = reg_node(pRExC_state, VERTWS);
7477 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7478 goto finish_meta_pat;
7479 case 'V':
e1d1eefb
YO
7480 ret = reg_node(pRExC_state, NVERTWS);
7481 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7482 finish_meta_pat:
830247a4 7483 nextchar(pRExC_state);
fac92740 7484 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7485 break;
a14b48bc
LW
7486 case 'p':
7487 case 'P':
3568d838 7488 {
3dab1dad 7489 char* const oldregxend = RExC_end;
d008bc60 7490#ifdef DEBUGGING
ccb2c380 7491 char* parse_start = RExC_parse - 2;
d008bc60 7492#endif
a14b48bc 7493
830247a4 7494 if (RExC_parse[1] == '{') {
3568d838 7495 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7496 RExC_end = strchr(RExC_parse, '}');
7497 if (!RExC_end) {
3dab1dad 7498 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7499 RExC_parse += 2;
7500 RExC_end = oldregxend;
0da60cf5 7501 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7502 }
830247a4 7503 RExC_end++;
a14b48bc 7504 }
af6f566e 7505 else {
830247a4 7506 RExC_end = RExC_parse + 2;
af6f566e
HS
7507 if (RExC_end > oldregxend)
7508 RExC_end = oldregxend;
7509 }
830247a4 7510 RExC_parse--;
a14b48bc 7511
3dab1dad 7512 ret = regclass(pRExC_state,depth+1);
a14b48bc 7513
830247a4
IZ
7514 RExC_end = oldregxend;
7515 RExC_parse--;
ccb2c380
MP
7516
7517 Set_Node_Offset(ret, parse_start + 2);
7518 Set_Node_Cur_Length(ret);
830247a4 7519 nextchar(pRExC_state);
a14b48bc
LW
7520 *flagp |= HASWIDTH|SIMPLE;
7521 }
7522 break;
fc8cd66c 7523 case 'N':
afefe6bf 7524 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7525 multicharacter. join_exact() will join them up later on.
7526 Also this makes sure that things like /\N{BLAH}+/ and
7527 \N{BLAH} being multi char Just Happen. dmq*/
7528 ++RExC_parse;
afefe6bf 7529 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7530 break;
0a4db386 7531 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7532 parse_named_seq:
81714fb9
YO
7533 {
7534 char ch= RExC_parse[1];
1f1031fe
YO
7535 if (ch != '<' && ch != '\'' && ch != '{') {
7536 RExC_parse++;
7537 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7538 } else {
1f1031fe
YO
7539 /* this pretty much dupes the code for (?P=...) in reg(), if
7540 you change this make sure you change that */
81714fb9 7541 char* name_start = (RExC_parse += 2);
2eccd3b2 7542 U32 num = 0;
0a4db386
YO
7543 SV *sv_dat = reg_scan_name(pRExC_state,
7544 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7545 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7546 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7547 vFAIL2("Sequence %.3s... not terminated",parse_start);
7548
7549 if (!SIZE_ONLY) {
7550 num = add_data( pRExC_state, 1, "S" );
7551 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7552 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7553 }
7554
81714fb9
YO
7555 RExC_sawback = 1;
7556 ret = reganode(pRExC_state,
4444fd9f
KW
7557 ((! FOLD)
7558 ? NREF
7559 : (UNI_SEMANTICS)
7560 ? NREFFU
7561 : (LOC)
7562 ? NREFFL
7563 : NREFF),
7564 num);
81714fb9 7565 *flagp |= HASWIDTH;
1f1031fe 7566
81714fb9
YO
7567 /* override incorrect value set in reganode MJD */
7568 Set_Node_Offset(ret, parse_start+1);
7569 Set_Node_Cur_Length(ret); /* MJD */
7570 nextchar(pRExC_state);
1f1031fe 7571
81714fb9
YO
7572 }
7573 break;
1f1031fe 7574 }
2bf803e2 7575 case 'g':
a0d0e21e
LW
7576 case '1': case '2': case '3': case '4':
7577 case '5': case '6': case '7': case '8': case '9':
7578 {
c74340f9 7579 I32 num;
2bf803e2
YO
7580 bool isg = *RExC_parse == 'g';
7581 bool isrel = 0;
7582 bool hasbrace = 0;
7583 if (isg) {
c74340f9 7584 RExC_parse++;
2bf803e2
YO
7585 if (*RExC_parse == '{') {
7586 RExC_parse++;
7587 hasbrace = 1;
7588 }
7589 if (*RExC_parse == '-') {
7590 RExC_parse++;
7591 isrel = 1;
7592 }
1f1031fe
YO
7593 if (hasbrace && !isDIGIT(*RExC_parse)) {
7594 if (isrel) RExC_parse--;
7595 RExC_parse -= 2;
7596 goto parse_named_seq;
7597 } }
c74340f9 7598 num = atoi(RExC_parse);
b72d83b2
RGS
7599 if (isg && num == 0)
7600 vFAIL("Reference to invalid group 0");
c74340f9 7601 if (isrel) {
5624f11d 7602 num = RExC_npar - num;
c74340f9
YO
7603 if (num < 1)
7604 vFAIL("Reference to nonexistent or unclosed group");
7605 }
2bf803e2 7606 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7607 goto defchar;
7608 else {
3dab1dad 7609 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7610 while (isDIGIT(*RExC_parse))
7611 RExC_parse++;
1f1031fe
YO
7612 if (parse_start == RExC_parse - 1)
7613 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7614 if (hasbrace) {
7615 if (*RExC_parse != '}')
7616 vFAIL("Unterminated \\g{...} pattern");
7617 RExC_parse++;
7618 }
c74340f9
YO
7619 if (!SIZE_ONLY) {
7620 if (num > (I32)RExC_rx->nparens)
7621 vFAIL("Reference to nonexistent group");
c74340f9 7622 }
830247a4 7623 RExC_sawback = 1;
eb160463 7624 ret = reganode(pRExC_state,
4444fd9f
KW
7625 ((! FOLD)
7626 ? REF
7627 : (UNI_SEMANTICS)
7628 ? REFFU
7629 : (LOC)
7630 ? REFFL
7631 : REFF),
7632 num);
a0d0e21e 7633 *flagp |= HASWIDTH;
2af232bd 7634
fac92740 7635 /* override incorrect value set in reganode MJD */
2af232bd 7636 Set_Node_Offset(ret, parse_start+1);
fac92740 7637 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7638 RExC_parse--;
7639 nextchar(pRExC_state);
a0d0e21e
LW
7640 }
7641 }
7642 break;
7643 case '\0':
830247a4 7644 if (RExC_parse >= RExC_end)
b45f050a 7645 FAIL("Trailing \\");
a0d0e21e
LW
7646 /* FALL THROUGH */
7647 default:
a0288114 7648 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7649 back into the quick-grab loop below */
45948336 7650 parse_start--;
a0d0e21e
LW
7651 goto defchar;
7652 }
7653 break;
4633a7c4
LW
7654
7655 case '#':
bbe252da 7656 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7657 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7658 goto tryagain;
7659 }
7660 /* FALL THROUGH */
7661
f9a79580
RGS
7662 default:
7663 outer_default:{
ba210ebe 7664 register STRLEN len;
58ae7d3f 7665 register UV ender;
a0d0e21e 7666 register char *p;
3dab1dad 7667 char *s;
80aecb99 7668 STRLEN foldlen;
89ebb4a3 7669 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7670
7671 parse_start = RExC_parse - 1;
a0d0e21e 7672
830247a4 7673 RExC_parse++;
a0d0e21e
LW
7674
7675 defchar:
58ae7d3f 7676 ender = 0;
eb160463 7677 ret = reg_node(pRExC_state,
2c2b7f86
KW
7678 (U8) ((! FOLD) ? EXACT
7679 : (LOC)
7680 ? EXACTFL
7681 : (UNI_SEMANTICS)
7682 ? EXACTFU
7683 : EXACTF)
7684 );
cd439c50 7685 s = STRING(ret);
830247a4
IZ
7686 for (len = 0, p = RExC_parse - 1;
7687 len < 127 && p < RExC_end;
a0d0e21e
LW
7688 len++)
7689 {
3dab1dad 7690 char * const oldp = p;
5b5a24f7 7691
bbe252da 7692 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7693 p = regwhite( pRExC_state, p );
f9a79580 7694 switch ((U8)*p) {
ced7f090
KW
7695 case LATIN_SMALL_LETTER_SHARP_S:
7696 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7697 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
56d400ed 7698 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7699 goto normal_default;
a0d0e21e
LW
7700 case '^':
7701 case '$':
7702 case '.':
7703 case '[':
7704 case '(':
7705 case ')':
7706 case '|':
7707 goto loopdone;
7708 case '\\':
ee9b8eae
YO
7709 /* Literal Escapes Switch
7710
7711 This switch is meant to handle escape sequences that
7712 resolve to a literal character.
7713
7714 Every escape sequence that represents something
7715 else, like an assertion or a char class, is handled
7716 in the switch marked 'Special Escapes' above in this
7717 routine, but also has an entry here as anything that
7718 isn't explicitly mentioned here will be treated as
7719 an unescaped equivalent literal.
7720 */
7721
a0a388a1 7722 switch ((U8)*++p) {
ee9b8eae 7723 /* These are all the special escapes. */
ced7f090
KW
7724 case LATIN_SMALL_LETTER_SHARP_S:
7725 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7726 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
a0a388a1
YO
7727 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7728 goto normal_default;
ee9b8eae
YO
7729 case 'A': /* Start assertion */
7730 case 'b': case 'B': /* Word-boundary assertion*/
7731 case 'C': /* Single char !DANGEROUS! */
7732 case 'd': case 'D': /* digit class */
7733 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7734 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7735 case 'k': case 'K': /* named backref, keep marker */
7736 case 'N': /* named char sequence */
38a44b82 7737 case 'p': case 'P': /* Unicode property */
e1d1eefb 7738 case 'R': /* LNBREAK */
ee9b8eae 7739 case 's': case 'S': /* space class */
e1d1eefb 7740 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7741 case 'w': case 'W': /* word class */
7742 case 'X': /* eXtended Unicode "combining character sequence" */
7743 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7744 --p;
7745 goto loopdone;
ee9b8eae
YO
7746
7747 /* Anything after here is an escape that resolves to a
7748 literal. (Except digits, which may or may not)
7749 */
a0d0e21e
LW
7750 case 'n':
7751 ender = '\n';
7752 p++;
a687059c 7753 break;
a0d0e21e
LW
7754 case 'r':
7755 ender = '\r';
7756 p++;
a687059c 7757 break;
a0d0e21e
LW
7758 case 't':
7759 ender = '\t';
7760 p++;
a687059c 7761 break;
a0d0e21e
LW
7762 case 'f':
7763 ender = '\f';
7764 p++;
a687059c 7765 break;
a0d0e21e 7766 case 'e':
c7f1f016 7767 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7768 p++;
a687059c 7769 break;
a0d0e21e 7770 case 'a':
c7f1f016 7771 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7772 p++;
a687059c 7773 break;
f0a2b745
KW
7774 case 'o':
7775 {
7776 STRLEN brace_len = len;
00c0cb6d 7777 UV result;
454155d9
KW
7778 const char* error_msg;
7779
7780 bool valid = grok_bslash_o(p,
7781 &result,
7782 &brace_len,
7783 &error_msg,
7784 1);
7785 p += brace_len;
7786 if (! valid) {
7787 RExC_parse = p; /* going to die anyway; point
7788 to exact spot of failure */
f0a2b745
KW
7789 vFAIL(error_msg);
7790 }
00c0cb6d
DG
7791 else
7792 {
7793 ender = result;
7794 }
f0a2b745
KW
7795 if (PL_encoding && ender < 0x100) {
7796 goto recode_encoding;
7797 }
7798 if (ender > 0xff) {
62fed28b 7799 REQUIRE_UTF8;
f0a2b745
KW
7800 }
7801 break;
7802 }
a0d0e21e 7803 case 'x':
a0ed51b3 7804 if (*++p == '{') {
1df70142 7805 char* const e = strchr(p, '}');
b81d288d 7806
b45f050a 7807 if (!e) {
830247a4 7808 RExC_parse = p + 1;
b45f050a
JF
7809 vFAIL("Missing right brace on \\x{}");
7810 }
de5f0749 7811 else {
a4c04bdc
NC
7812 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7813 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7814 STRLEN numlen = e - p - 1;
53305cf1 7815 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7816 if (ender > 0xff)
62fed28b 7817 REQUIRE_UTF8;
a0ed51b3
LW
7818 p = e + 1;
7819 }
a0ed51b3
LW
7820 }
7821 else {
a4c04bdc 7822 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7823 STRLEN numlen = 2;
53305cf1 7824 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7825 p += numlen;
7826 }
9e08bc66
TS
7827 if (PL_encoding && ender < 0x100)
7828 goto recode_encoding;
a687059c 7829 break;
a0d0e21e
LW
7830 case 'c':
7831 p++;
f9d13529 7832 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7833 break;
a0d0e21e
LW
7834 case '0': case '1': case '2': case '3':case '4':
7835 case '5': case '6': case '7': case '8':case '9':
7836 if (*p == '0' ||
ca67da41 7837 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7838 {
7839 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7840 STRLEN numlen = 3;
53305cf1 7841 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7842 if (ender > 0xff) {
62fed28b 7843 REQUIRE_UTF8;
609122bd 7844 }
a0d0e21e
LW
7845 p += numlen;
7846 }
7847 else {
7848 --p;
7849 goto loopdone;
a687059c 7850 }
9e08bc66
TS
7851 if (PL_encoding && ender < 0x100)
7852 goto recode_encoding;
7853 break;
7854 recode_encoding:
7855 {
7856 SV* enc = PL_encoding;
7857 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7858 if (!enc && SIZE_ONLY)
7859 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7860 REQUIRE_UTF8;
9e08bc66 7861 }
a687059c 7862 break;
a0d0e21e 7863 case '\0':
830247a4 7864 if (p >= RExC_end)
b45f050a 7865 FAIL("Trailing \\");
a687059c 7866 /* FALL THROUGH */
a0d0e21e 7867 default:
668c081a
NC
7868 if (!SIZE_ONLY&& isALPHA(*p))
7869 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7870 goto normal_default;
a0d0e21e
LW
7871 }
7872 break;
a687059c 7873 default:
a0ed51b3 7874 normal_default:
fd400ab9 7875 if (UTF8_IS_START(*p) && UTF) {
1df70142 7876 STRLEN numlen;
5e12f4fb 7877 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7878 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7879 p += numlen;
7880 }
7881 else
7882 ender = *p++;
a0d0e21e 7883 break;
a687059c 7884 }
bcdf7404
YO
7885 if ( RExC_flags & RXf_PMf_EXTENDED)
7886 p = regwhite( pRExC_state, p );
60a8b682
JH
7887 if (UTF && FOLD) {
7888 /* Prime the casefolded buffer. */
ac7e0132 7889 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7890 }
bcdf7404 7891 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7892 if (len)
7893 p = oldp;
16ea2a2e 7894 else if (UTF) {
80aecb99 7895 if (FOLD) {
60a8b682 7896 /* Emit all the Unicode characters. */
1df70142 7897 STRLEN numlen;
80aecb99
JH
7898 for (foldbuf = tmpbuf;
7899 foldlen;
7900 foldlen -= numlen) {
7901 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7902 if (numlen > 0) {
71207a34 7903 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7904 s += unilen;
7905 len += unilen;
7906 /* In EBCDIC the numlen
7907 * and unilen can differ. */
9dc45d57 7908 foldbuf += numlen;
47654450
JH
7909 if (numlen >= foldlen)
7910 break;
9dc45d57
JH
7911 }
7912 else
7913 break; /* "Can't happen." */
80aecb99
JH
7914 }
7915 }
7916 else {
71207a34 7917 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7918 if (unilen > 0) {
0ebc6274
JH
7919 s += unilen;
7920 len += unilen;
9dc45d57 7921 }
80aecb99 7922 }
a0ed51b3 7923 }
a0d0e21e
LW
7924 else {
7925 len++;
eb160463 7926 REGC((char)ender, s++);
a0d0e21e
LW
7927 }
7928 break;
a687059c 7929 }
16ea2a2e 7930 if (UTF) {
80aecb99 7931 if (FOLD) {
60a8b682 7932 /* Emit all the Unicode characters. */
1df70142 7933 STRLEN numlen;
80aecb99
JH
7934 for (foldbuf = tmpbuf;
7935 foldlen;
7936 foldlen -= numlen) {
7937 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7938 if (numlen > 0) {
71207a34 7939 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7940 len += unilen;
7941 s += unilen;
7942 /* In EBCDIC the numlen
7943 * and unilen can differ. */
9dc45d57 7944 foldbuf += numlen;
47654450
JH
7945 if (numlen >= foldlen)
7946 break;
9dc45d57
JH
7947 }
7948 else
7949 break;
80aecb99
JH
7950 }
7951 }
7952 else {
71207a34 7953 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7954 if (unilen > 0) {
0ebc6274
JH
7955 s += unilen;
7956 len += unilen;
9dc45d57 7957 }
80aecb99
JH
7958 }
7959 len--;
a0ed51b3
LW
7960 }
7961 else
eb160463 7962 REGC((char)ender, s++);
a0d0e21e
LW
7963 }
7964 loopdone:
830247a4 7965 RExC_parse = p - 1;
fac92740 7966 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7967 nextchar(pRExC_state);
793db0cb
JH
7968 {
7969 /* len is STRLEN which is unsigned, need to copy to signed */
7970 IV iv = len;
7971 if (iv < 0)
7972 vFAIL("Internal disaster");
7973 }
a0d0e21e
LW
7974 if (len > 0)
7975 *flagp |= HASWIDTH;
090f7165 7976 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7977 *flagp |= SIMPLE;
3dab1dad 7978
cd439c50 7979 if (SIZE_ONLY)
830247a4 7980 RExC_size += STR_SZ(len);
3dab1dad
YO
7981 else {
7982 STR_LEN(ret) = len;
830247a4 7983 RExC_emit += STR_SZ(len);
07be1b83 7984 }
3dab1dad 7985 }
a0d0e21e
LW
7986 break;
7987 }
a687059c 7988
a0d0e21e 7989 return(ret);
a687059c
LW
7990}
7991
873ef191 7992STATIC char *
bcdf7404 7993S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7994{
bcdf7404 7995 const char *e = RExC_end;
7918f24d
NC
7996
7997 PERL_ARGS_ASSERT_REGWHITE;
7998
5b5a24f7
CS
7999 while (p < e) {
8000 if (isSPACE(*p))
8001 ++p;
8002 else if (*p == '#') {
bcdf7404 8003 bool ended = 0;
5b5a24f7 8004 do {
bcdf7404
YO
8005 if (*p++ == '\n') {
8006 ended = 1;
8007 break;
8008 }
8009 } while (p < e);
8010 if (!ended)
8011 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
8012 }
8013 else
8014 break;
8015 }
8016 return p;
8017}
8018
b8c5462f
JH
8019/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8020 Character classes ([:foo:]) can also be negated ([:^foo:]).
8021 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8022 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 8023 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
8024
8025#define POSIXCC_DONE(c) ((c) == ':')
8026#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8027#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8028
b8c5462f 8029STATIC I32
830247a4 8030S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 8031{
97aff369 8032 dVAR;
936ed897 8033 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 8034
7918f24d
NC
8035 PERL_ARGS_ASSERT_REGPPOSIXCC;
8036
830247a4 8037 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 8038 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 8039 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 8040 const char c = UCHARAT(RExC_parse);
097eb12c 8041 char* const s = RExC_parse++;
b81d288d 8042
9a86a77b 8043 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
8044 RExC_parse++;
8045 if (RExC_parse == RExC_end)
620e46c5 8046 /* Grandfather lone [:, [=, [. */
830247a4 8047 RExC_parse = s;
620e46c5 8048 else {
3dab1dad 8049 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
8050 assert(*t == c);
8051
9a86a77b 8052 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 8053 const char *posixcc = s + 1;
830247a4 8054 RExC_parse++; /* skip over the ending ] */
3dab1dad 8055
b8c5462f 8056 if (*s == ':') {
1df70142
AL
8057 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8058 const I32 skip = t - posixcc;
80916619
NC
8059
8060 /* Initially switch on the length of the name. */
8061 switch (skip) {
8062 case 4:
3dab1dad
YO
8063 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8064 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 8065 break;
80916619
NC
8066 case 5:
8067 /* Names all of length 5. */
8068 /* alnum alpha ascii blank cntrl digit graph lower
8069 print punct space upper */
8070 /* Offset 4 gives the best switch position. */
8071 switch (posixcc[4]) {
8072 case 'a':
3dab1dad
YO
8073 if (memEQ(posixcc, "alph", 4)) /* alpha */
8074 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
8075 break;
8076 case 'e':
3dab1dad
YO
8077 if (memEQ(posixcc, "spac", 4)) /* space */
8078 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
8079 break;
8080 case 'h':
3dab1dad
YO
8081 if (memEQ(posixcc, "grap", 4)) /* graph */
8082 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
8083 break;
8084 case 'i':
3dab1dad
YO
8085 if (memEQ(posixcc, "asci", 4)) /* ascii */
8086 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
8087 break;
8088 case 'k':
3dab1dad
YO
8089 if (memEQ(posixcc, "blan", 4)) /* blank */
8090 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
8091 break;
8092 case 'l':
3dab1dad
YO
8093 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8094 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
8095 break;
8096 case 'm':
3dab1dad
YO
8097 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8098 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8099 break;
8100 case 'r':
3dab1dad
YO
8101 if (memEQ(posixcc, "lowe", 4)) /* lower */
8102 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8103 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8104 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8105 break;
8106 case 't':
3dab1dad
YO
8107 if (memEQ(posixcc, "digi", 4)) /* digit */
8108 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8109 else if (memEQ(posixcc, "prin", 4)) /* print */
8110 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8111 else if (memEQ(posixcc, "punc", 4)) /* punct */
8112 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8113 break;
b8c5462f
JH
8114 }
8115 break;
80916619 8116 case 6:
3dab1dad
YO
8117 if (memEQ(posixcc, "xdigit", 6))
8118 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8119 break;
8120 }
80916619
NC
8121
8122 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8123 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8124 t - s - 1, s + 1);
80916619
NC
8125 assert (posixcc[skip] == ':');
8126 assert (posixcc[skip+1] == ']');
b45f050a 8127 } else if (!SIZE_ONLY) {
b8c5462f 8128 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8129
830247a4 8130 /* adjust RExC_parse so the warning shows after
b45f050a 8131 the class closes */
9a86a77b 8132 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8133 RExC_parse++;
b45f050a
JF
8134 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8135 }
b8c5462f
JH
8136 } else {
8137 /* Maternal grandfather:
8138 * "[:" ending in ":" but not in ":]" */
830247a4 8139 RExC_parse = s;
767d463e 8140 }
620e46c5
JH
8141 }
8142 }
8143
b8c5462f
JH
8144 return namedclass;
8145}
8146
8147STATIC void
830247a4 8148S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8149{
97aff369 8150 dVAR;
7918f24d
NC
8151
8152 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8153
3dab1dad 8154 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8155 const char *s = RExC_parse;
8156 const char c = *s++;
b8c5462f 8157
3dab1dad 8158 while (isALNUM(*s))
b8c5462f
JH
8159 s++;
8160 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8161 ckWARN3reg(s+2,
8162 "POSIX syntax [%c %c] belongs inside character classes",
8163 c, c);
b45f050a
JF
8164
8165 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8166 if (POSIXCC_NOTYET(c)) {
830247a4 8167 /* adjust RExC_parse so the error shows after
b45f050a 8168 the class closes */
9a86a77b 8169 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8170 NOOP;
b45f050a
JF
8171 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8172 }
b8c5462f
JH
8173 }
8174 }
620e46c5
JH
8175}
8176
0399b215 8177/* No locale test */
e1d1eefb
YO
8178#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8179ANYOF_##NAME: \
8180 for (value = 0; value < 256; value++) \
8181 if (TEST) \
2dcac756 8182 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
e1d1eefb
YO
8183 yesno = '+'; \
8184 what = WORD; \
8185 break; \
8186case ANYOF_N##NAME: \
8187 for (value = 0; value < 256; value++) \
8188 if (!TEST) \
2dcac756 8189 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
e1d1eefb
YO
8190 yesno = '!'; \
8191 what = WORD; \
8192 break
89836f1f 8193
a12cf05f
KW
8194/* Like the above, but there are differences if we are in uni-8-bit or not, so
8195 * there are two tests passed in, to use depending on that. There aren't any
8196 * cases where the label is different from the name, so no need for that
8197 * parameter */
0399b215 8198#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
a12cf05f
KW
8199ANYOF_##NAME: \
8200 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8201 else if (UNI_SEMANTICS) { \
8202 for (value = 0; value < 256; value++) { \
2283d326 8203 if (TEST_8) stored += \
62e21d5f 8204 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
a12cf05f
KW
8205 } \
8206 } \
8207 else { \
3c8eea2b 8208 for (value = 0; value < 128; value++) { \
2283d326 8209 if (TEST_7) stored += \
62e21d5f 8210 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
a12cf05f
KW
8211 } \
8212 } \
8213 yesno = '+'; \
8214 what = WORD; \
8215 break; \
8216case ANYOF_N##NAME: \
8217 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8218 else if (UNI_SEMANTICS) { \
8219 for (value = 0; value < 256; value++) { \
2283d326 8220 if (! TEST_8) stored += \
62e21d5f 8221 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
a12cf05f
KW
8222 } \
8223 } \
8224 else { \
3c8eea2b 8225 for (value = 0; value < 128; value++) { \
2283d326 8226 if (! TEST_7) stored += \
62e21d5f 8227 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
a12cf05f 8228 } \
3c8eea2b 8229 for (value = 128; value < 256; value++) { \
62e21d5f 8230 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
3c8eea2b 8231 } \
a12cf05f
KW
8232 } \
8233 yesno = '!'; \
8234 what = WORD; \
8235 break
8236
da7fcca4
YO
8237/*
8238 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8239 so that it is possible to override the option here without having to
8240 rebuild the entire core. as we are required to do if we change regcomp.h
8241 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8242*/
8243#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8244#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8245#endif
8246
8247#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8248#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8249#else
8250#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8251#endif
8252
2283d326
KW
8253STATIC U8
8254S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8255{
8256
8257 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8258 * Locale folding is done at run-time, so this function should not be
8259 * called for nodes that are for locales.
8260 *
8261 * This function simply sets the bit corresponding to the fold of the input
8262 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8263 * 'F' is 'f'.
8264 *
8265 * It also sets any necessary flags, and returns the number of bits that
8266 * actually changed from 0 to 1 */
8267
8268 U8 stored = 0;
2283d326
KW
8269 U8 fold;
8270
8271 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8272 : PL_fold[value];
8273
8274 /* It assumes the bit for 'value' has already been set */
8275 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8276 ANYOF_BITMAP_SET(node, fold);
8277 stored++;
8278 }
8279
8280 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8281 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8282 * semantics are called for */
8283 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8284 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8285 }
e2e75538 8286 else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
2283d326
KW
8287 || (! UNI_SEMANTICS
8288 && ! isASCII(value)
8289 && PL_fold_latin1[value] != value))
8290 { /* A character that has a fold outside of Latin1 matches outside the
8291 bitmap, but only when the target string is utf8. Similarly when we
8292 don't have unicode semantics for the above ASCII Latin-1 characters,
8293 and they have a fold, they should match if the target is utf8, and
8294 not otherwise */
8295 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8296 }
8297
8298 return stored;
8299}
8300
8301
8302PERL_STATIC_INLINE U8
41b0f1c1 8303S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
2283d326
KW
8304{
8305 /* This inline function sets a bit in the bitmap if not already set, and if
8306 * appropriate, its fold, returning the number of bits that actually
8307 * changed from 0 to 1 */
8308
8309 U8 stored;
8310
8311 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8312 return 0;
8313 }
8314
8315 ANYOF_BITMAP_SET(node, value);
8316 stored = 1;
8317
8318 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8319 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8320 }
8321
8322 return stored;
8323}
8324
7f6f358c
YO
8325/*
8326 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8327 matches the pattern or if the pattern matches a single char only and
8328 that char is < 256 and we are case insensitive then we produce an
8329 EXACT node instead.
7f6f358c 8330*/
89836f1f 8331
76e3520e 8332STATIC regnode *
3dab1dad 8333S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8334{
97aff369 8335 dVAR;
9a86a77b 8336 register UV nextvalue;
3568d838 8337 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8338 register IV range = 0;
e1d1eefb 8339 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8340 register regnode *ret;
ba210ebe 8341 STRLEN numlen;
ffc61ed2 8342 IV namedclass;
cbbf8932 8343 char *rangebegin = NULL;
936ed897 8344 bool need_class = 0;
c445ea15 8345 SV *listsv = NULL;
ffc61ed2 8346 UV n;
cbbf8932 8347 AV* unicode_alternate = NULL;
1b2d223b
JH
8348#ifdef EBCDIC
8349 UV literal_endpoint = 0;
8350#endif
ffc130aa 8351 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 8352
3dab1dad 8353 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8354 case we need to change the emitted regop to an EXACT. */
07be1b83 8355 const char * orig_parse = RExC_parse;
72f13be8 8356 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8357
8358 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8359#ifndef DEBUGGING
8360 PERL_UNUSED_ARG(depth);
8361#endif
72f13be8 8362
3dab1dad 8363 DEBUG_PARSE("clas");
7f6f358c
YO
8364
8365 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8366 ret = reganode(pRExC_state, ANYOF, 0);
8367
8368 if (!SIZE_ONLY)
8369 ANYOF_FLAGS(ret) = 0;
8370
9a86a77b 8371 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8372 RExC_naughty++;
8373 RExC_parse++;
8374 if (!SIZE_ONLY)
8375 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8376 }
a0d0e21e 8377
73060fc4 8378 if (SIZE_ONLY) {
830247a4 8379 RExC_size += ANYOF_SKIP;
3a15e693
KW
8380#ifdef ANYOF_ADD_LOC_SKIP
8381 if (LOC) {
8382 RExC_size += ANYOF_ADD_LOC_SKIP;
8383 }
8384#endif
73060fc4
JH
8385 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8386 }
936ed897 8387 else {
830247a4 8388 RExC_emit += ANYOF_SKIP;
3a15e693 8389 if (LOC) {
936ed897 8390 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693
KW
8391#ifdef ANYOF_ADD_LOC_SKIP
8392 RExC_emit += ANYOF_ADD_LOC_SKIP;
8393#endif
8394 }
ffc61ed2 8395 ANYOF_BITMAP_ZERO(ret);
396482e1 8396 listsv = newSVpvs("# comment\n");
a0d0e21e 8397 }
b8c5462f 8398
9a86a77b
JH
8399 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8400
b938889d 8401 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8402 checkposixcc(pRExC_state);
b8c5462f 8403
f064b6ad
HS
8404 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8405 if (UCHARAT(RExC_parse) == ']')
8406 goto charclassloop;
ffc61ed2 8407
fc8cd66c 8408parseit:
9a86a77b 8409 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8410
8411 charclassloop:
8412
8413 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8414
73b437c8 8415 if (!range)
830247a4 8416 rangebegin = RExC_parse;
ffc61ed2 8417 if (UTF) {
5e12f4fb 8418 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8419 RExC_end - RExC_parse,
9f7f3913 8420 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8421 RExC_parse += numlen;
8422 }
8423 else
8424 value = UCHARAT(RExC_parse++);
7f6f358c 8425
9a86a77b
JH
8426 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8427 if (value == '[' && POSIXCC(nextvalue))
830247a4 8428 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8429 else if (value == '\\') {
ffc61ed2 8430 if (UTF) {
5e12f4fb 8431 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8432 RExC_end - RExC_parse,
9f7f3913 8433 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8434 RExC_parse += numlen;
8435 }
8436 else
8437 value = UCHARAT(RExC_parse++);
470c3474 8438 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8439 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8440 * be a problem later if we want switch on Unicode.
8441 * A similar issue a little bit later when switching on
8442 * namedclass. --jhi */
ffc61ed2 8443 switch ((I32)value) {
b8c5462f
JH
8444 case 'w': namedclass = ANYOF_ALNUM; break;
8445 case 'W': namedclass = ANYOF_NALNUM; break;
8446 case 's': namedclass = ANYOF_SPACE; break;
8447 case 'S': namedclass = ANYOF_NSPACE; break;
8448 case 'd': namedclass = ANYOF_DIGIT; break;
8449 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8450 case 'v': namedclass = ANYOF_VERTWS; break;
8451 case 'V': namedclass = ANYOF_NVERTWS; break;
8452 case 'h': namedclass = ANYOF_HORIZWS; break;
8453 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8454 case 'N': /* Handle \N{NAME} in class */
8455 {
8456 /* We only pay attention to the first char of
8457 multichar strings being returned. I kinda wonder
8458 if this makes sense as it does change the behaviour
8459 from earlier versions, OTOH that behaviour was broken
8460 as well. */
8461 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8462 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8463 goto parseit;
8464 }
8465 value= v;
8466 }
8467 break;
ffc61ed2
JH
8468 case 'p':
8469 case 'P':
3dab1dad
YO
8470 {
8471 char *e;
af6f566e 8472 if (RExC_parse >= RExC_end)
2a4859cd 8473 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8474 if (*RExC_parse == '{') {
1df70142 8475 const U8 c = (U8)value;
ffc61ed2
JH
8476 e = strchr(RExC_parse++, '}');
8477 if (!e)
0da60cf5 8478 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8479 while (isSPACE(UCHARAT(RExC_parse)))
8480 RExC_parse++;
8481 if (e == RExC_parse)
0da60cf5 8482 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8483 n = e - RExC_parse;
ab13f0c7
JH
8484 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8485 n--;
ffc61ed2
JH
8486 }
8487 else {
8488 e = RExC_parse;
8489 n = 1;
8490 }
8491 if (!SIZE_ONLY) {
ab13f0c7
JH
8492 if (UCHARAT(RExC_parse) == '^') {
8493 RExC_parse++;
8494 n--;
8495 value = value == 'p' ? 'P' : 'p'; /* toggle */
8496 while (isSPACE(UCHARAT(RExC_parse))) {
8497 RExC_parse++;
8498 n--;
8499 }
8500 }
097eb12c
AL
8501 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8502 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8503 }
8504 RExC_parse = e + 1;
08fc12dd
KW
8505
8506 /* The \p could match something in the Latin1 range, hence
8507 * something that isn't utf8 */
3ff7ceb3 8508 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f81125e2 8509 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8510 }
f81125e2 8511 break;
b8c5462f
JH
8512 case 'n': value = '\n'; break;
8513 case 'r': value = '\r'; break;
8514 case 't': value = '\t'; break;
8515 case 'f': value = '\f'; break;
8516 case 'b': value = '\b'; break;
c7f1f016
NIS
8517 case 'e': value = ASCII_TO_NATIVE('\033');break;
8518 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8519 case 'o':
8520 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8521 {
8522 const char* error_msg;
8523 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8524 &value,
8525 &numlen,
454155d9
KW
8526 &error_msg,
8527 SIZE_ONLY);
8528 RExC_parse += numlen;
8529 if (! valid) {
8530 vFAIL(error_msg);
8531 }
f0a2b745 8532 }
f0a2b745
KW
8533 if (PL_encoding && value < 0x100) {
8534 goto recode_encoding;
8535 }
8536 break;
b8c5462f 8537 case 'x':
ffc61ed2 8538 if (*RExC_parse == '{') {
a4c04bdc
NC
8539 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8540 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8541 char * const e = strchr(RExC_parse++, '}');
b81d288d 8542 if (!e)
ffc61ed2 8543 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8544
8545 numlen = e - RExC_parse;
8546 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8547 RExC_parse = e + 1;
8548 }
8549 else {
a4c04bdc 8550 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8551 numlen = 2;
8552 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8553 RExC_parse += numlen;
8554 }
9e08bc66
TS
8555 if (PL_encoding && value < 0x100)
8556 goto recode_encoding;
b8c5462f
JH
8557 break;
8558 case 'c':
f9d13529 8559 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8560 break;
8561 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8562 case '5': case '6': case '7':
9e08bc66 8563 {
c99e91e9
KW
8564 /* Take 1-3 octal digits */
8565 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8566 numlen = 3;
8567 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8568 RExC_parse += numlen;
8569 if (PL_encoding && value < 0x100)
8570 goto recode_encoding;
8571 break;
8572 }
8573 recode_encoding:
8574 {
8575 SV* enc = PL_encoding;
8576 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8577 if (!enc && SIZE_ONLY)
8578 ckWARNreg(RExC_parse,
8579 "Invalid escape in the specified encoding");
9e08bc66
TS
8580 break;
8581 }
1028017a 8582 default:
c99e91e9
KW
8583 /* Allow \_ to not give an error */
8584 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8585 ckWARN2reg(RExC_parse,
8586 "Unrecognized escape \\%c in character class passed through",
8587 (int)value);
c99e91e9 8588 }
1028017a 8589 break;
b8c5462f 8590 }
ffc61ed2 8591 } /* end of \blah */
1b2d223b
JH
8592#ifdef EBCDIC
8593 else
8594 literal_endpoint++;
8595#endif
ffc61ed2
JH
8596
8597 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8598
2c63ecad
KW
8599 /* What matches in a locale is not known until runtime, so need to
8600 * (one time per class) allocate extra space to pass to regexec.
8601 * The space will contain a bit for each named class that is to be
8602 * matched against. This isn't needed for \p{} and pseudo-classes,
8603 * as they are not affected by locale, and hence are dealt with
8604 * separately */
8605 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8606 need_class = 1;
8607 if (SIZE_ONLY) {
3a15e693 8608#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8609 RExC_size += ANYOF_CLASS_ADD_SKIP;
3a15e693 8610#endif
2c63ecad
KW
8611 }
8612 else {
3a15e693 8613#ifdef ANYOF_CLASS_ADD_SKIP
2c63ecad 8614 RExC_emit += ANYOF_CLASS_ADD_SKIP;
3a15e693 8615#endif
2c63ecad
KW
8616 ANYOF_CLASS_ZERO(ret);
8617 }
9051cfd9 8618 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 8619 }
ffc61ed2
JH
8620
8621 /* a bad range like a-\d, a-[:digit:] ? */
8622 if (range) {
73b437c8 8623 if (!SIZE_ONLY) {
668c081a
NC
8624 const int w =
8625 RExC_parse >= rangebegin ?
8626 RExC_parse - rangebegin : 0;
8627 ckWARN4reg(RExC_parse,
b45f050a 8628 "False [] range \"%*.*s\"",
097eb12c 8629 w, w, rangebegin);
668c081a 8630
3568d838 8631 if (prevvalue < 256) {
2283d326 8632 stored +=
62e21d5f 8633 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
2283d326
KW
8634 stored +=
8635 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
ffc61ed2
JH
8636 }
8637 else {
ef87b810 8638 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 8639 Perl_sv_catpvf(aTHX_ listsv,
a1f3213b 8640 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8641 }
b8c5462f 8642 }
ffc61ed2
JH
8643
8644 range = 0; /* this was not a true range */
73b437c8 8645 }
ffc61ed2 8646
89836f1f
YO
8647
8648
73b437c8 8649 if (!SIZE_ONLY) {
c49a72a9
NC
8650 const char *what = NULL;
8651 char yesno = 0;
8652
e2962f66
JH
8653 /* Possible truncation here but in some 64-bit environments
8654 * the compiler gets heartburn about switch on 64-bit values.
8655 * A similar issue a little earlier when switching on value.
98f323fa 8656 * --jhi */
e2962f66 8657 switch ((I32)namedclass) {
da7fcca4 8658
0399b215
KW
8659 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8660 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8661 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8662 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8663 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8664 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8665 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8666 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8667 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8668 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8669#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8670 /* \s, \w match all unicode if utf8. */
0399b215
KW
8671 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8672 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8673#else
a12cf05f 8674 /* \s, \w match ascii and locale only */
0399b215
KW
8675 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8676 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8677#endif
0399b215 8678 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8679 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8680 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8681 case ANYOF_ASCII:
8682 if (LOC)
936ed897 8683 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8684 else {
1ba5c669 8685 for (value = 0; value < 128; value++)
2283d326 8686 stored +=
62e21d5f 8687 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
73b437c8 8688 }
c49a72a9 8689 yesno = '+';
ce1c68b2
KW
8690 what = NULL; /* Doesn't match outside ascii, so
8691 don't want to add +utf8:: */
73b437c8
JH
8692 break;
8693 case ANYOF_NASCII:
8694 if (LOC)
936ed897 8695 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8696 else {
1ba5c669 8697 for (value = 128; value < 256; value++)
2283d326 8698 stored +=
62e21d5f 8699 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
73b437c8 8700 }
c49a72a9
NC
8701 yesno = '!';
8702 what = "ASCII";
89836f1f 8703 break;
ffc61ed2
JH
8704 case ANYOF_DIGIT:
8705 if (LOC)
8706 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8707 else {
8708 /* consecutive digits assumed */
8709 for (value = '0'; value <= '9'; value++)
2283d326 8710 stored +=
62e21d5f 8711 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8712 }
c49a72a9 8713 yesno = '+';
da7fcca4 8714 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8715 break;
8716 case ANYOF_NDIGIT:
8717 if (LOC)
8718 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8719 else {
8720 /* consecutive digits assumed */
8721 for (value = 0; value < '0'; value++)
2283d326 8722 stored +=
62e21d5f 8723 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8724 for (value = '9' + 1; value < 256; value++)
2283d326 8725 stored +=
62e21d5f 8726 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
ffc61ed2 8727 }
c49a72a9 8728 yesno = '!';
da7fcca4 8729 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8730 break;
f81125e2
JP
8731 case ANYOF_MAX:
8732 /* this is to handle \p and \P */
8733 break;
73b437c8 8734 default:
b45f050a 8735 vFAIL("Invalid [::] class");
73b437c8 8736 break;
b8c5462f 8737 }
c49a72a9
NC
8738 if (what) {
8739 /* Strings such as "+utf8::isWord\n" */
8740 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ef87b810
KW
8741 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8742 }
ce1c68b2 8743
73b437c8 8744 continue;
a0d0e21e 8745 }
ffc61ed2
JH
8746 } /* end of namedclass \blah */
8747
a0d0e21e 8748 if (range) {
eb160463 8749 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8750 const int w = RExC_parse - rangebegin;
8751 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8752 range = 0; /* not a valid range */
73b437c8 8753 }
a0d0e21e
LW
8754 }
8755 else {
3568d838 8756 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8757 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8758 RExC_parse[1] != ']') {
8759 RExC_parse++;
ffc61ed2
JH
8760
8761 /* a bad range like \w-, [:word:]- ? */
8762 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8763 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8764 const int w =
afd78fd5
JH
8765 RExC_parse >= rangebegin ?
8766 RExC_parse - rangebegin : 0;
830247a4 8767 vWARN4(RExC_parse,
b45f050a 8768 "False [] range \"%*.*s\"",
097eb12c 8769 w, w, rangebegin);
afd78fd5 8770 }
73b437c8 8771 if (!SIZE_ONLY)
2283d326
KW
8772 stored +=
8773 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
73b437c8 8774 } else
ffc61ed2
JH
8775 range = 1; /* yeah, it's a range! */
8776 continue; /* but do it the next time */
a0d0e21e 8777 }
a687059c 8778 }
ffc61ed2 8779
93733859 8780 /* now is the next time */
ae5c130c 8781 if (!SIZE_ONLY) {
3568d838 8782 if (prevvalue < 256) {
1df70142 8783 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8784 IV i;
3568d838 8785#ifdef EBCDIC
1b2d223b
JH
8786 /* In EBCDIC [\x89-\x91] should include
8787 * the \x8e but [i-j] should not. */
8788 if (literal_endpoint == 2 &&
8789 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8790 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8791 {
3568d838
JH
8792 if (isLOWER(prevvalue)) {
8793 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8794 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 8795 stored +=
62e21d5f 8796 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
2670d666 8797 }
ffc61ed2 8798 } else {
3568d838 8799 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8800 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 8801 stored +=
62e21d5f 8802 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
2670d666 8803 }
ffc61ed2 8804 }
8ada0baa 8805 }
ffc61ed2 8806 else
8ada0baa 8807#endif
07be1b83 8808 for (i = prevvalue; i <= ceilvalue; i++) {
62e21d5f 8809 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
07be1b83 8810 }
3568d838 8811 }
a5961de5 8812 if (value > 255 || UTF) {
1df70142
AL
8813 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8814 const UV natvalue = NATIVE_TO_UNI(value);
ef87b810
KW
8815
8816 /* If the code point requires utf8 to represent, and we are not
8817 * folding, it can't match unless the target is in utf8. Only
8818 * a few code points above 255 fold to below it, so XXX an
8819 * optimization would be to know which ones and set the flag
8820 * appropriately. */
8821 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8822 ? ANYOF_NONBITMAP
8823 : ANYOF_UTF8;
b08decb7 8824 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8825 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8826 prevnatvalue, natvalue);
8827 }
8828 else if (prevnatvalue == natvalue) {
8829 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8830 if (FOLD) {
89ebb4a3 8831 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8832 STRLEN foldlen;
1df70142 8833 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8834
e294cc5d
JH
8835#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8836 if (RExC_precomp[0] == ':' &&
8837 RExC_precomp[1] == '[' &&
8838 (f == 0xDF || f == 0x92)) {
8839 f = NATIVE_TO_UNI(f);
8840 }
8841#endif
c840d2a2
JH
8842 /* If folding and foldable and a single
8843 * character, insert also the folded version
8844 * to the charclass. */
9e55ce06 8845 if (f != value) {
e294cc5d
JH
8846#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8847 if ((RExC_precomp[0] == ':' &&
8848 RExC_precomp[1] == '[' &&
8849 (f == 0xA2 &&
8850 (value == 0xFB05 || value == 0xFB06))) ?
8851 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8852 foldlen == (STRLEN)UNISKIP(f) )
8853#else
eb160463 8854 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8855#endif
9e55ce06
JH
8856 Perl_sv_catpvf(aTHX_ listsv,
8857 "%04"UVxf"\n", f);
8858 else {
8859 /* Any multicharacter foldings
8860 * require the following transform:
8861 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8862 * where E folds into "pq" and F folds
8863 * into "rst", all other characters
8864 * fold to single characters. We save
8865 * away these multicharacter foldings,
8866 * to be later saved as part of the
8867 * additional "s" data. */
8868 SV *sv;
8869
8870 if (!unicode_alternate)
8871 unicode_alternate = newAV();
740cce10
NC
8872 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8873 TRUE);
9e55ce06
JH
8874 av_push(unicode_alternate, sv);
8875 }
8876 }
254ba52a 8877
60a8b682
JH
8878 /* If folding and the value is one of the Greek
8879 * sigmas insert a few more sigmas to make the
8880 * folding rules of the sigmas to work right.
8881 * Note that not all the possible combinations
8882 * are handled here: some of them are handled
9e55ce06
JH
8883 * by the standard folding rules, and some of
8884 * them (literal or EXACTF cases) are handled
8885 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8886 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8887 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8888 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8889 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8890 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8891 }
8892 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8893 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8894 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8895 }
8896 }
ffc61ed2 8897 }
1b2d223b
JH
8898#ifdef EBCDIC
8899 literal_endpoint = 0;
8900#endif
8ada0baa 8901 }
ffc61ed2
JH
8902
8903 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8904 }
ffc61ed2 8905
ffc61ed2 8906
7f6f358c
YO
8907
8908 if (SIZE_ONLY)
8909 return ret;
8910 /****** !SIZE_ONLY AFTER HERE *********/
8911
b8e7c664
KW
8912 /* Folding in the bitmap is taken care of above, but not for locale, for
8913 * which we have to wait to see what folding is in effect at runtime, and
8914 * for things not in the bitmap */
8915 if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
39065660 8916 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
b8e7c664 8917 }
40c78556
KW
8918
8919 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't
8920 * optimize locale. Doing so perhaps could be done as long as there is
8921 * nothing like \w in it; some thought also would have to be given to the
8922 * interaction with above 0x100 chars */
8923 if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8924 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8925 ANYOF_BITMAP(ret)[value] ^= 0xFF;
8926 stored = 256 - stored;
8927
8928 /* The inversion means that everything above 255 is matched */
8929 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8930 }
8931
2786be71
KW
8932 /* A single character class can be "optimized" into an EXACTish node.
8933 * Note that since we don't currently count how many characters there are
8934 * outside the bitmap, we are XXX missing optimization possibilities for
8935 * them. This optimization can't happen unless this is a truly single
8936 * character class, which means that it can't be an inversion into a
8937 * many-character class, and there must be no possibility of there being
8938 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
8939 * \w, etc, so have to make a special test that they aren't present
8940 *
8941 * Similarly A 2-character class of the very special form like [bB] can be
8942 * optimized into an EXACTFish node, but only for non-locales, and for
8943 * characters which only have the two folds; so things like 'fF' and 'Ii'
8944 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
8945 * FI'. */
2786be71 8946 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
8947 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8948 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
8949 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8950 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
8951 /* If the latest code point has a fold whose
8952 * bit is set, it must be the only other one */
2dcac756 8953 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 8954 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
8955 {
8956 /* Note that the information needed to decide to do this optimization
8957 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
8958 * used EXACTish node takes less space than the calculated ANYOF node,
8959 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
8960 * than actually used, so this optimization doesn't gain us any space.
8961 * But an EXACT node is faster than an ANYOF node, and can be combined
8962 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
8963 * gains. The speed of executing an EXACTF is similar to an ANYOF
8964 * node, so the optimization advantage comes from the ability to join
8965 * it to adjacent EXACT nodes */
2786be71 8966
07be1b83 8967 const char * cur_parse= RExC_parse;
6da63e10 8968 U8 op;
07be1b83
YO
8969 RExC_emit = (regnode *)orig_emit;
8970 RExC_parse = (char *)orig_parse;
2786be71 8971
6da63e10
KW
8972 if (stored == 1) {
8973
8974 /* A locale node with one point can be folded; all the other cases
8975 * with folding will have two points, since we calculate them above
8976 */
39065660 8977 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
8978 op = EXACTFL;
8979 }
8980 else {
8981 op = EXACT;
8982 }
8983 } /* else 2 chars in the bit map: the folds of each other */
8984 else if (UNI_SEMANTICS || !isASCII(value)) {
8985
8986 /* To join adjacent nodes, they must be the exact EXACTish type.
8987 * Try to use the most likely type, by using EXACTFU if the regex
8988 * calls for them, or is required because the character is
8989 * non-ASCII */
8990 op = EXACTFU;
8991 }
8992 else { /* Otherwise, more likely to be EXACTF type */
8993 op = EXACTF;
8994 }
8995
8996 ret = reg_node(pRExC_state, op);
07be1b83 8997 RExC_parse = (char *)cur_parse;
2786be71
KW
8998 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
8999 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9000 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9001 STR_LEN(ret)= 2;
9002 RExC_emit += STR_SZ(2);
9003 }
9004 else {
9005 *STRING(ret)= (char)value;
9006 STR_LEN(ret)= 1;
9007 RExC_emit += STR_SZ(1);
9008 }
ef8d46e8 9009 SvREFCNT_dec(listsv);
7f6f358c
YO
9010 return ret;
9011 }
ffc61ed2 9012
7f6f358c 9013 {
097eb12c 9014 AV * const av = newAV();
ffc61ed2 9015 SV *rv;
9e55ce06 9016 /* The 0th element stores the character class description
6a0407ee 9017 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
9018 * to initialize the appropriate swash (which gets stored in
9019 * the 1st element), and also useful for dumping the regnode.
9020 * The 2nd element stores the multicharacter foldings,
6a0407ee 9021 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
9022 av_store(av, 0, listsv);
9023 av_store(av, 1, NULL);
ad64d0ec
NC
9024 av_store(av, 2, MUTABLE_SV(unicode_alternate));
9025 rv = newRV_noinc(MUTABLE_SV(av));
19860706 9026 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 9027 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 9028 ARG_SET(ret, n);
a0ed51b3 9029 }
a0ed51b3
LW
9030 return ret;
9031}
89836f1f
YO
9032#undef _C_C_T_
9033
a0ed51b3 9034
bcdf7404
YO
9035/* reg_skipcomment()
9036
9037 Absorbs an /x style # comments from the input stream.
9038 Returns true if there is more text remaining in the stream.
9039 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9040 terminates the pattern without including a newline.
9041
9042 Note its the callers responsibility to ensure that we are
9043 actually in /x mode
9044
9045*/
9046
9047STATIC bool
9048S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9049{
9050 bool ended = 0;
7918f24d
NC
9051
9052 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9053
bcdf7404
YO
9054 while (RExC_parse < RExC_end)
9055 if (*RExC_parse++ == '\n') {
9056 ended = 1;
9057 break;
9058 }
9059 if (!ended) {
9060 /* we ran off the end of the pattern without ending
9061 the comment, so we have to add an \n when wrapping */
9062 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9063 return 0;
9064 } else
9065 return 1;
9066}
9067
9068/* nextchar()
9069
3b753521 9070 Advances the parse position, and optionally absorbs
bcdf7404
YO
9071 "whitespace" from the inputstream.
9072
9073 Without /x "whitespace" means (?#...) style comments only,
9074 with /x this means (?#...) and # comments and whitespace proper.
9075
9076 Returns the RExC_parse point from BEFORE the scan occurs.
9077
9078 This is the /x friendly way of saying RExC_parse++.
9079*/
9080
76e3520e 9081STATIC char*
830247a4 9082S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 9083{
097eb12c 9084 char* const retval = RExC_parse++;
a0d0e21e 9085
7918f24d
NC
9086 PERL_ARGS_ASSERT_NEXTCHAR;
9087
4633a7c4 9088 for (;;) {
830247a4
IZ
9089 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9090 RExC_parse[2] == '#') {
e994fd66
AE
9091 while (*RExC_parse != ')') {
9092 if (RExC_parse == RExC_end)
9093 FAIL("Sequence (?#... not terminated");
830247a4 9094 RExC_parse++;
e994fd66 9095 }
830247a4 9096 RExC_parse++;
4633a7c4
LW
9097 continue;
9098 }
bbe252da 9099 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
9100 if (isSPACE(*RExC_parse)) {
9101 RExC_parse++;
748a9306
LW
9102 continue;
9103 }
830247a4 9104 else if (*RExC_parse == '#') {
bcdf7404
YO
9105 if ( reg_skipcomment( pRExC_state ) )
9106 continue;
748a9306 9107 }
748a9306 9108 }
4633a7c4 9109 return retval;
a0d0e21e 9110 }
a687059c
LW
9111}
9112
9113/*
c277df42 9114- reg_node - emit a node
a0d0e21e 9115*/
76e3520e 9116STATIC regnode * /* Location. */
830247a4 9117S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 9118{
97aff369 9119 dVAR;
c277df42 9120 register regnode *ptr;
504618e9 9121 regnode * const ret = RExC_emit;
07be1b83 9122 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9123
7918f24d
NC
9124 PERL_ARGS_ASSERT_REG_NODE;
9125
c277df42 9126 if (SIZE_ONLY) {
830247a4
IZ
9127 SIZE_ALIGN(RExC_size);
9128 RExC_size += 1;
a0d0e21e
LW
9129 return(ret);
9130 }
3b57cd43
YO
9131 if (RExC_emit >= RExC_emit_bound)
9132 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9133
c277df42 9134 NODE_ALIGN_FILL(ret);
a0d0e21e 9135 ptr = ret;
c277df42 9136 FILL_ADVANCE_NODE(ptr, op);
7122b237 9137#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9138 if (RExC_offsets) { /* MJD */
07be1b83 9139 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 9140 "reg_node", __LINE__,
13d6edb4 9141 PL_reg_name[op],
07be1b83
YO
9142 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9143 ? "Overwriting end of array!\n" : "OK",
9144 (UV)(RExC_emit - RExC_emit_start),
9145 (UV)(RExC_parse - RExC_start),
9146 (UV)RExC_offsets[0]));
ccb2c380 9147 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 9148 }
7122b237 9149#endif
830247a4 9150 RExC_emit = ptr;
a0d0e21e 9151 return(ret);
a687059c
LW
9152}
9153
9154/*
a0d0e21e
LW
9155- reganode - emit a node with an argument
9156*/
76e3520e 9157STATIC regnode * /* Location. */
830247a4 9158S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 9159{
97aff369 9160 dVAR;
c277df42 9161 register regnode *ptr;
504618e9 9162 regnode * const ret = RExC_emit;
07be1b83 9163 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 9164
7918f24d
NC
9165 PERL_ARGS_ASSERT_REGANODE;
9166
c277df42 9167 if (SIZE_ONLY) {
830247a4
IZ
9168 SIZE_ALIGN(RExC_size);
9169 RExC_size += 2;
6bda09f9
YO
9170 /*
9171 We can't do this:
9172
9173 assert(2==regarglen[op]+1);
9174
9175 Anything larger than this has to allocate the extra amount.
9176 If we changed this to be:
9177
9178 RExC_size += (1 + regarglen[op]);
9179
9180 then it wouldn't matter. Its not clear what side effect
9181 might come from that so its not done so far.
9182 -- dmq
9183 */
a0d0e21e
LW
9184 return(ret);
9185 }
3b57cd43
YO
9186 if (RExC_emit >= RExC_emit_bound)
9187 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9188
c277df42 9189 NODE_ALIGN_FILL(ret);
a0d0e21e 9190 ptr = ret;
c277df42 9191 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 9192#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9193 if (RExC_offsets) { /* MJD */
07be1b83 9194 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9195 "reganode",
ccb2c380 9196 __LINE__,
13d6edb4 9197 PL_reg_name[op],
07be1b83 9198 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 9199 "Overwriting end of array!\n" : "OK",
07be1b83
YO
9200 (UV)(RExC_emit - RExC_emit_start),
9201 (UV)(RExC_parse - RExC_start),
9202 (UV)RExC_offsets[0]));
ccb2c380 9203 Set_Cur_Node_Offset;
fac92740 9204 }
7122b237 9205#endif
830247a4 9206 RExC_emit = ptr;
a0d0e21e 9207 return(ret);
fe14fcc3
LW
9208}
9209
9210/*
cd439c50 9211- reguni - emit (if appropriate) a Unicode character
a0ed51b3 9212*/
71207a34
AL
9213STATIC STRLEN
9214S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 9215{
97aff369 9216 dVAR;
7918f24d
NC
9217
9218 PERL_ARGS_ASSERT_REGUNI;
9219
71207a34 9220 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
9221}
9222
9223/*
a0d0e21e
LW
9224- reginsert - insert an operator in front of already-emitted operand
9225*
9226* Means relocating the operand.
9227*/
76e3520e 9228STATIC void
6bda09f9 9229S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 9230{
97aff369 9231 dVAR;
c277df42
IZ
9232 register regnode *src;
9233 register regnode *dst;
9234 register regnode *place;
504618e9 9235 const int offset = regarglen[(U8)op];
6bda09f9 9236 const int size = NODE_STEP_REGNODE + offset;
07be1b83 9237 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9238
9239 PERL_ARGS_ASSERT_REGINSERT;
def51078 9240 PERL_UNUSED_ARG(depth);
22c35a8c 9241/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 9242 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9243 if (SIZE_ONLY) {
6bda09f9 9244 RExC_size += size;
a0d0e21e
LW
9245 return;
9246 }
a687059c 9247
830247a4 9248 src = RExC_emit;
6bda09f9 9249 RExC_emit += size;
830247a4 9250 dst = RExC_emit;
40d049e4 9251 if (RExC_open_parens) {
6bda09f9 9252 int paren;
3b57cd43 9253 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9254 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9255 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9256 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9257 RExC_open_parens[paren] += size;
9258 } else {
3b57cd43 9259 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9260 }
9261 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9262 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9263 RExC_close_parens[paren] += size;
9264 } else {
3b57cd43 9265 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9266 }
9267 }
6bda09f9 9268 }
40d049e4 9269
fac92740 9270 while (src > opnd) {
c277df42 9271 StructCopy(--src, --dst, regnode);
7122b237 9272#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9273 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9274 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9275 "reg_insert",
ccb2c380 9276 __LINE__,
13d6edb4 9277 PL_reg_name[op],
07be1b83
YO
9278 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9279 ? "Overwriting end of array!\n" : "OK",
9280 (UV)(src - RExC_emit_start),
9281 (UV)(dst - RExC_emit_start),
9282 (UV)RExC_offsets[0]));
ccb2c380
MP
9283 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9284 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9285 }
7122b237 9286#endif
fac92740
MJD
9287 }
9288
a0d0e21e
LW
9289
9290 place = opnd; /* Op node, where operand used to be. */
7122b237 9291#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9292 if (RExC_offsets) { /* MJD */
07be1b83 9293 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9294 "reginsert",
ccb2c380 9295 __LINE__,
13d6edb4 9296 PL_reg_name[op],
07be1b83 9297 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9298 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9299 (UV)(place - RExC_emit_start),
9300 (UV)(RExC_parse - RExC_start),
786e8c11 9301 (UV)RExC_offsets[0]));
ccb2c380 9302 Set_Node_Offset(place, RExC_parse);
45948336 9303 Set_Node_Length(place, 1);
fac92740 9304 }
7122b237 9305#endif
c277df42
IZ
9306 src = NEXTOPER(place);
9307 FILL_ADVANCE_NODE(place, op);
9308 Zero(src, offset, regnode);
a687059c
LW
9309}
9310
9311/*
c277df42 9312- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9313- SEE ALSO: regtail_study
a0d0e21e 9314*/
097eb12c 9315/* TODO: All three parms should be const */
76e3520e 9316STATIC void
3dab1dad 9317S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9318{
97aff369 9319 dVAR;
c277df42 9320 register regnode *scan;
72f13be8 9321 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9322
9323 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9324#ifndef DEBUGGING
9325 PERL_UNUSED_ARG(depth);
9326#endif
a0d0e21e 9327
c277df42 9328 if (SIZE_ONLY)
a0d0e21e
LW
9329 return;
9330
9331 /* Find last node. */
9332 scan = p;
9333 for (;;) {
504618e9 9334 regnode * const temp = regnext(scan);
3dab1dad
YO
9335 DEBUG_PARSE_r({
9336 SV * const mysv=sv_newmortal();
9337 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9338 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9339 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9340 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9341 (temp == NULL ? "->" : ""),
13d6edb4 9342 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9343 );
3dab1dad
YO
9344 });
9345 if (temp == NULL)
9346 break;
9347 scan = temp;
9348 }
9349
9350 if (reg_off_by_arg[OP(scan)]) {
9351 ARG_SET(scan, val - scan);
9352 }
9353 else {
9354 NEXT_OFF(scan) = val - scan;
9355 }
9356}
9357
07be1b83 9358#ifdef DEBUGGING
3dab1dad
YO
9359/*
9360- regtail_study - set the next-pointer at the end of a node chain of p to val.
9361- Look for optimizable sequences at the same time.
9362- currently only looks for EXACT chains.
07be1b83 9363
486ec47a 9364This is experimental code. The idea is to use this routine to perform
07be1b83
YO
9365in place optimizations on branches and groups as they are constructed,
9366with the long term intention of removing optimization from study_chunk so
9367that it is purely analytical.
9368
9369Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9370to control which is which.
9371
3dab1dad
YO
9372*/
9373/* TODO: All four parms should be const */
07be1b83 9374
3dab1dad
YO
9375STATIC U8
9376S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9377{
9378 dVAR;
9379 register regnode *scan;
07be1b83
YO
9380 U8 exact = PSEUDO;
9381#ifdef EXPERIMENTAL_INPLACESCAN
9382 I32 min = 0;
9383#endif
3dab1dad
YO
9384 GET_RE_DEBUG_FLAGS_DECL;
9385
7918f24d
NC
9386 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9387
07be1b83 9388
3dab1dad
YO
9389 if (SIZE_ONLY)
9390 return exact;
9391
9392 /* Find last node. */
9393
9394 scan = p;
9395 for (;;) {
9396 regnode * const temp = regnext(scan);
07be1b83
YO
9397#ifdef EXPERIMENTAL_INPLACESCAN
9398 if (PL_regkind[OP(scan)] == EXACT)
9399 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9400 return EXACT;
9401#endif
3dab1dad
YO
9402 if ( exact ) {
9403 switch (OP(scan)) {
9404 case EXACT:
9405 case EXACTF:
2c2b7f86 9406 case EXACTFU:
3dab1dad
YO
9407 case EXACTFL:
9408 if( exact == PSEUDO )
9409 exact= OP(scan);
07be1b83
YO
9410 else if ( exact != OP(scan) )
9411 exact= 0;
3dab1dad
YO
9412 case NOTHING:
9413 break;
9414 default:
9415 exact= 0;
9416 }
9417 }
9418 DEBUG_PARSE_r({
9419 SV * const mysv=sv_newmortal();
9420 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9421 regprop(RExC_rx, mysv, scan);
eaf3ca90 9422 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9423 SvPV_nolen_const(mysv),
eaf3ca90 9424 REG_NODE_NUM(scan),
13d6edb4 9425 PL_reg_name[exact]);
3dab1dad 9426 });
a0d0e21e
LW
9427 if (temp == NULL)
9428 break;
9429 scan = temp;
9430 }
07be1b83
YO
9431 DEBUG_PARSE_r({
9432 SV * const mysv_val=sv_newmortal();
9433 DEBUG_PARSE_MSG("");
9434 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9435 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9436 SvPV_nolen_const(mysv_val),
9437 (IV)REG_NODE_NUM(val),
9438 (IV)(val - scan)
07be1b83
YO
9439 );
9440 });
c277df42
IZ
9441 if (reg_off_by_arg[OP(scan)]) {
9442 ARG_SET(scan, val - scan);
a0ed51b3
LW
9443 }
9444 else {
c277df42
IZ
9445 NEXT_OFF(scan) = val - scan;
9446 }
3dab1dad
YO
9447
9448 return exact;
a687059c 9449}
07be1b83 9450#endif
a687059c
LW
9451
9452/*
fd181c75 9453 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9454 */
f7819f85 9455#ifdef DEBUGGING
c33269f7 9456static void
7918f24d
NC
9457S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9458{
f7819f85
A
9459 int bit;
9460 int set=0;
7918f24d 9461
f7819f85
A
9462 for (bit=0; bit<32; bit++) {
9463 if (flags & (1<<bit)) {
9464 if (!set++ && lead)
9465 PerlIO_printf(Perl_debug_log, "%s",lead);
9466 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9467 }
9468 }
9469 if (lead) {
9470 if (set)
9471 PerlIO_printf(Perl_debug_log, "\n");
9472 else
9473 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9474 }
9475}
9476#endif
9477
a687059c 9478void
097eb12c 9479Perl_regdump(pTHX_ const regexp *r)
a687059c 9480{
35ff7856 9481#ifdef DEBUGGING
97aff369 9482 dVAR;
c445ea15 9483 SV * const sv = sv_newmortal();
ab3bbdeb 9484 SV *dsv= sv_newmortal();
f8fc2ecf 9485 RXi_GET_DECL(r,ri);
f7819f85 9486 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9487
7918f24d
NC
9488 PERL_ARGS_ASSERT_REGDUMP;
9489
f8fc2ecf 9490 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9491
9492 /* Header fields of interest. */
ab3bbdeb
YO
9493 if (r->anchored_substr) {
9494 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9495 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9496 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9497 "anchored %s%s at %"IVdf" ",
9498 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9499 (IV)r->anchored_offset);
ab3bbdeb
YO
9500 } else if (r->anchored_utf8) {
9501 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9502 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9503 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9504 "anchored utf8 %s%s at %"IVdf" ",
9505 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9506 (IV)r->anchored_offset);
ab3bbdeb
YO
9507 }
9508 if (r->float_substr) {
9509 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9510 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9511 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9512 "floating %s%s at %"IVdf"..%"UVuf" ",
9513 s, RE_SV_TAIL(r->float_substr),
7b0972df 9514 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9515 } else if (r->float_utf8) {
9516 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9517 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9518 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9519 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9520 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9521 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9522 }
33b8afdf 9523 if (r->check_substr || r->check_utf8)
b81d288d 9524 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9525 (const char *)
9526 (r->check_substr == r->float_substr
9527 && r->check_utf8 == r->float_utf8
9528 ? "(checking floating" : "(checking anchored"));
bbe252da 9529 if (r->extflags & RXf_NOSCAN)
c277df42 9530 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9531 if (r->extflags & RXf_CHECK_ALL)
c277df42 9532 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9533 if (r->check_substr || r->check_utf8)
c277df42
IZ
9534 PerlIO_printf(Perl_debug_log, ") ");
9535
f8fc2ecf
YO
9536 if (ri->regstclass) {
9537 regprop(r, sv, ri->regstclass);
1de06328 9538 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9539 }
bbe252da 9540 if (r->extflags & RXf_ANCH) {
774d564b 9541 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9542 if (r->extflags & RXf_ANCH_BOL)
774d564b 9543 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9544 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9545 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9546 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9547 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9548 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9549 PerlIO_printf(Perl_debug_log, "(GPOS)");
9550 PerlIO_putc(Perl_debug_log, ' ');
9551 }
bbe252da 9552 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9553 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9554 if (r->intflags & PREGf_SKIP)
760ac839 9555 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9556 if (r->intflags & PREGf_IMPLICIT)
760ac839 9557 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9558 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9559 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9560 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9561 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9562 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9563#else
7918f24d 9564 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9565 PERL_UNUSED_CONTEXT;
65e66c80 9566 PERL_UNUSED_ARG(r);
17c3b450 9567#endif /* DEBUGGING */
a687059c
LW
9568}
9569
9570/*
a0d0e21e
LW
9571- regprop - printable representation of opcode
9572*/
3339dfd8
YO
9573#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9574STMT_START { \
9575 if (do_sep) { \
9576 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9577 if (flags & ANYOF_INVERT) \
9578 /*make sure the invert info is in each */ \
9579 sv_catpvs(sv, "^"); \
9580 do_sep = 0; \
9581 } \
9582} STMT_END
9583
46fc3d4c 9584void
32fc9b6a 9585Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9586{
35ff7856 9587#ifdef DEBUGGING
97aff369 9588 dVAR;
9b155405 9589 register int k;
f8fc2ecf 9590 RXi_GET_DECL(prog,progi);
1de06328 9591 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9592
7918f24d 9593 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9594
76f68e9b 9595 sv_setpvs(sv, "");
8aa23a47 9596
03363afd 9597 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9598 /* It would be nice to FAIL() here, but this may be called from
9599 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9600 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9601 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9602
3dab1dad 9603 k = PL_regkind[OP(o)];
9b155405 9604
2a782b5b 9605 if (k == EXACT) {
f92a2122 9606 sv_catpvs(sv, " ");
ab3bbdeb
YO
9607 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9608 * is a crude hack but it may be the best for now since
9609 * we have no flag "this EXACTish node was UTF-8"
9610 * --jhi */
f92a2122
NC
9611 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9612 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 9613 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
9614 PERL_PV_PRETTY_ELLIPSES |
9615 PERL_PV_PRETTY_LTGT |
9616 PERL_PV_PRETTY_NOCLEAR
9617 );
bb263b4e 9618 } else if (k == TRIE) {
3dab1dad 9619 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9620 * progi->data isn't available here */
1de06328 9621 const char op = OP(o);
647f639f 9622 const U32 n = ARG(o);
1de06328 9623 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9624 (reg_ac_data *)progi->data->data[n] :
1de06328 9625 NULL;
3251b653
NC
9626 const reg_trie_data * const trie
9627 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9628
13d6edb4 9629 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9630 DEBUG_TRIE_COMPILE_r(
9631 Perl_sv_catpvf(aTHX_ sv,
9632 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9633 (UV)trie->startstate,
1e2e3d02 9634 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9635 (UV)trie->wordcount,
9636 (UV)trie->minlen,
9637 (UV)trie->maxlen,
9638 (UV)TRIE_CHARCOUNT(trie),
9639 (UV)trie->uniquecharcount
9640 )
9641 );
9642 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9643 int i;
9644 int rangestart = -1;
f46cb337 9645 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9646 sv_catpvs(sv, "[");
1de06328
YO
9647 for (i = 0; i <= 256; i++) {
9648 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9649 if (rangestart == -1)
9650 rangestart = i;
9651 } else if (rangestart != -1) {
9652 if (i <= rangestart + 3)
9653 for (; rangestart < i; rangestart++)
9654 put_byte(sv, rangestart);
9655 else {
9656 put_byte(sv, rangestart);
9657 sv_catpvs(sv, "-");
9658 put_byte(sv, i - 1);
9659 }
9660 rangestart = -1;
9661 }
9662 }
f3a2811a 9663 sv_catpvs(sv, "]");
1de06328
YO
9664 }
9665
a3621e74 9666 } else if (k == CURLY) {
cb434fcc 9667 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9668 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9669 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9670 }
2c2d71f5
JH
9671 else if (k == WHILEM && o->flags) /* Ordinal/of */
9672 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9673 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9674 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9675 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 9676 if ( k != REF || (OP(o) < NREF)) {
502c6561 9677 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9678 SV **name= av_fetch(list, ARG(o), 0 );
9679 if (name)
9680 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9681 }
9682 else {
502c6561 9683 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9684 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9685 I32 *nums=(I32*)SvPVX(sv_dat);
9686 SV **name= av_fetch(list, nums[0], 0 );
9687 I32 n;
9688 if (name) {
9689 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9690 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9691 (n ? "," : ""), (IV)nums[n]);
9692 }
9693 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9694 }
1f1031fe 9695 }
ee9b8eae 9696 }
1f1031fe 9697 } else if (k == GOSUB)
6bda09f9 9698 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9699 else if (k == VERB) {
9700 if (!o->flags)
9701 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9702 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9703 } else if (k == LOGICAL)
04ebc1ab 9704 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9705 else if (k == FOLDCHAR)
df44d732 9706 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9707 else if (k == ANYOF) {
9708 int i, rangestart = -1;
2d03de9c 9709 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9710 int do_sep = 0;
0bd48802
AL
9711
9712 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9713 static const char * const anyofs[] = {
653099ff
GS
9714 "\\w",
9715 "\\W",
9716 "\\s",
9717 "\\S",
9718 "\\d",
9719 "\\D",
9720 "[:alnum:]",
9721 "[:^alnum:]",
9722 "[:alpha:]",
9723 "[:^alpha:]",
9724 "[:ascii:]",
9725 "[:^ascii:]",
24d786f4
YO
9726 "[:cntrl:]",
9727 "[:^cntrl:]",
653099ff
GS
9728 "[:graph:]",
9729 "[:^graph:]",
9730 "[:lower:]",
9731 "[:^lower:]",
9732 "[:print:]",
9733 "[:^print:]",
9734 "[:punct:]",
9735 "[:^punct:]",
9736 "[:upper:]",
aaa51d5e 9737 "[:^upper:]",
653099ff 9738 "[:xdigit:]",
aaa51d5e
JF
9739 "[:^xdigit:]",
9740 "[:space:]",
9741 "[:^space:]",
9742 "[:blank:]",
9743 "[:^blank:]"
653099ff
GS
9744 };
9745
19860706 9746 if (flags & ANYOF_LOCALE)
396482e1 9747 sv_catpvs(sv, "{loc}");
39065660 9748 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 9749 sv_catpvs(sv, "{i}");
653099ff 9750 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9751 if (flags & ANYOF_INVERT)
396482e1 9752 sv_catpvs(sv, "^");
3339dfd8
YO
9753
9754 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9755 for (i = 0; i <= 256; i++) {
9756 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9757 if (rangestart == -1)
9758 rangestart = i;
9759 } else if (rangestart != -1) {
9760 if (i <= rangestart + 3)
9761 for (; rangestart < i; rangestart++)
653099ff 9762 put_byte(sv, rangestart);
ffc61ed2
JH
9763 else {
9764 put_byte(sv, rangestart);
396482e1 9765 sv_catpvs(sv, "-");
ffc61ed2 9766 put_byte(sv, i - 1);
653099ff 9767 }
24d786f4 9768 do_sep = 1;
ffc61ed2 9769 rangestart = -1;
653099ff 9770 }
847a199f 9771 }
3339dfd8
YO
9772
9773 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
9774 /* output any special charclass tests (used entirely under use locale) */
9775 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 9776 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9777 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9778 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9779 do_sep = 1;
9780 }
9781
3339dfd8
YO
9782 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9783
9784 /* output information about the unicode matching */
ef87b810 9785 if (flags & ANYOF_UNICODE_ALL)
396482e1 9786 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
9787 else if (flags & ANYOF_UTF8)
9788 sv_catpvs(sv, "{unicode}");
f5ecd18d 9789 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 9790 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
9791
9792 {
9793 SV *lv;
32fc9b6a 9794 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9795
ffc61ed2
JH
9796 if (lv) {
9797 if (sw) {
89ebb4a3 9798 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9799
ffc61ed2 9800 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9801 uvchr_to_utf8(s, i);
ffc61ed2 9802
3568d838 9803 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9804 if (rangestart == -1)
9805 rangestart = i;
9806 } else if (rangestart != -1) {
ffc61ed2
JH
9807 if (i <= rangestart + 3)
9808 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9809 const U8 * const e = uvchr_to_utf8(s,rangestart);
9810 U8 *p;
9811 for(p = s; p < e; p++)
ffc61ed2
JH
9812 put_byte(sv, *p);
9813 }
9814 else {
2d03de9c
AL
9815 const U8 *e = uvchr_to_utf8(s,rangestart);
9816 U8 *p;
9817 for (p = s; p < e; p++)
ffc61ed2 9818 put_byte(sv, *p);
396482e1 9819 sv_catpvs(sv, "-");
2d03de9c
AL
9820 e = uvchr_to_utf8(s, i-1);
9821 for (p = s; p < e; p++)
1df70142 9822 put_byte(sv, *p);
ffc61ed2
JH
9823 }
9824 rangestart = -1;
9825 }
19860706 9826 }
ffc61ed2 9827
396482e1 9828 sv_catpvs(sv, "..."); /* et cetera */
19860706 9829 }
fde631ed 9830
ffc61ed2 9831 {
2e0de35c 9832 char *s = savesvpv(lv);
c445ea15 9833 char * const origs = s;
b81d288d 9834
3dab1dad
YO
9835 while (*s && *s != '\n')
9836 s++;
b81d288d 9837
ffc61ed2 9838 if (*s == '\n') {
2d03de9c 9839 const char * const t = ++s;
ffc61ed2
JH
9840
9841 while (*s) {
9842 if (*s == '\n')
9843 *s = ' ';
9844 s++;
9845 }
9846 if (s[-1] == ' ')
9847 s[-1] = 0;
9848
9849 sv_catpv(sv, t);
fde631ed 9850 }
b81d288d 9851
ffc61ed2 9852 Safefree(origs);
fde631ed
JH
9853 }
9854 }
653099ff 9855 }
ffc61ed2 9856
653099ff
GS
9857 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9858 }
9b155405 9859 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9860 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9861#else
96a5add6 9862 PERL_UNUSED_CONTEXT;
65e66c80
SP
9863 PERL_UNUSED_ARG(sv);
9864 PERL_UNUSED_ARG(o);
f9049ba1 9865 PERL_UNUSED_ARG(prog);
17c3b450 9866#endif /* DEBUGGING */
35ff7856 9867}
a687059c 9868
cad2e5aa 9869SV *
288b8c02 9870Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9871{ /* Assume that RE_INTUIT is set */
97aff369 9872 dVAR;
288b8c02 9873 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9874 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9875
9876 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9877 PERL_UNUSED_CONTEXT;
9878
a3621e74 9879 DEBUG_COMPILE_r(
cfd0369c 9880 {
2d03de9c 9881 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9882 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9883
9884 if (!PL_colorset) reginitcolors();
9885 PerlIO_printf(Perl_debug_log,
a0288114 9886 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9887 PL_colors[4],
9888 prog->check_substr ? "" : "utf8 ",
9889 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9890 s,
9891 PL_colors[1],
9892 (strlen(s) > 60 ? "..." : ""));
9893 } );
9894
33b8afdf 9895 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9896}
9897
84da74a7 9898/*
f8149455 9899 pregfree()
84da74a7 9900
f8149455
YO
9901 handles refcounting and freeing the perl core regexp structure. When
9902 it is necessary to actually free the structure the first thing it
3b753521 9903 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
9904 the regexp, allowing the handling of the void *pprivate; member
9905 first. (This routine is not overridable by extensions, which is why
9906 the extensions free is called first.)
9907
9908 See regdupe and regdupe_internal if you change anything here.
84da74a7 9909*/
f8149455 9910#ifndef PERL_IN_XSUB_RE
2b69d0c2 9911void
84679df5 9912Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9913{
288b8c02
NC
9914 SvREFCNT_dec(r);
9915}
9916
9917void
9918Perl_pregfree2(pTHX_ REGEXP *rx)
9919{
27da23d5 9920 dVAR;
288b8c02 9921 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9922 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9923
7918f24d
NC
9924 PERL_ARGS_ASSERT_PREGFREE2;
9925
28d8d7f4
YO
9926 if (r->mother_re) {
9927 ReREFCNT_dec(r->mother_re);
9928 } else {
288b8c02 9929 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9930 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9931 }
9932 if (r->substrs) {
ef8d46e8
VP
9933 SvREFCNT_dec(r->anchored_substr);
9934 SvREFCNT_dec(r->anchored_utf8);
9935 SvREFCNT_dec(r->float_substr);
9936 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9937 Safefree(r->substrs);
9938 }
288b8c02 9939 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9940#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9941 SvREFCNT_dec(r->saved_copy);
ed252734 9942#endif
f0ab9afb 9943 Safefree(r->offs);
f8149455 9944}
28d8d7f4
YO
9945
9946/* reg_temp_copy()
9947
9948 This is a hacky workaround to the structural issue of match results
9949 being stored in the regexp structure which is in turn stored in
9950 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9951 could be PL_curpm in multiple contexts, and could require multiple
9952 result sets being associated with the pattern simultaneously, such
9953 as when doing a recursive match with (??{$qr})
9954
9955 The solution is to make a lightweight copy of the regexp structure
9956 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 9957 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
9958 the starp/end and the actual regexp structure itself.
9959
9960*/
9961
9962
84679df5 9963REGEXP *
f0826785 9964Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9965{
f0826785 9966 struct regexp *ret;
288b8c02 9967 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9968 register const I32 npar = r->nparens+1;
7918f24d
NC
9969
9970 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9971
f0826785
BM
9972 if (!ret_x)
9973 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9974 ret = (struct regexp *)SvANY(ret_x);
9975
288b8c02 9976 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9977 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9978 by pointing directly at the buffer, but flagging that the allocated
9979 space in the copy is zero. As we've just done a struct copy, it's now
9980 a case of zero-ing that, rather than copying the current length. */
9981 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9982 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9983 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9984 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9985 SvLEN_set(ret_x, 0);
b9ad13ac 9986 SvSTASH_set(ret_x, NULL);
703c388d 9987 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9988 Newx(ret->offs, npar, regexp_paren_pair);
9989 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9990 if (r->substrs) {
28d8d7f4 9991 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9992 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9993
9994 SvREFCNT_inc_void(ret->anchored_substr);
9995 SvREFCNT_inc_void(ret->anchored_utf8);
9996 SvREFCNT_inc_void(ret->float_substr);
9997 SvREFCNT_inc_void(ret->float_utf8);
9998
9999 /* check_substr and check_utf8, if non-NULL, point to either their
10000 anchored or float namesakes, and don't hold a second reference. */
486913e4 10001 }
288b8c02 10002 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 10003#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 10004 ret->saved_copy = NULL;
28d8d7f4 10005#endif
288b8c02 10006 ret->mother_re = rx;
28d8d7f4 10007
288b8c02 10008 return ret_x;
28d8d7f4 10009}
f8149455
YO
10010#endif
10011
10012/* regfree_internal()
10013
10014 Free the private data in a regexp. This is overloadable by
10015 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 10016 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
10017 know about, however of course we have to handle the
10018 regexp_internal structure when no extension is in use.
10019
10020 Note this is called before freeing anything in the regexp
10021 structure.
10022 */
10023
10024void
288b8c02 10025Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
10026{
10027 dVAR;
288b8c02 10028 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10029 RXi_GET_DECL(r,ri);
10030 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10031
10032 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10033
f8149455
YO
10034 DEBUG_COMPILE_r({
10035 if (!PL_colorset)
10036 reginitcolors();
10037 {
10038 SV *dsv= sv_newmortal();
3c8556c3 10039 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 10040 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
10041 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10042 PL_colors[4],PL_colors[5],s);
10043 }
10044 });
7122b237
YO
10045#ifdef RE_TRACK_PATTERN_OFFSETS
10046 if (ri->u.offsets)
10047 Safefree(ri->u.offsets); /* 20010421 MJD */
10048#endif
f8fc2ecf
YO
10049 if (ri->data) {
10050 int n = ri->data->count;
f3548bdc
DM
10051 PAD* new_comppad = NULL;
10052 PAD* old_comppad;
4026c95a 10053 PADOFFSET refcnt;
dfad63ad 10054
c277df42 10055 while (--n >= 0) {
261faec3 10056 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 10057 switch (ri->data->what[n]) {
af534a04 10058 case 'a':
c277df42 10059 case 's':
81714fb9 10060 case 'S':
55eed653 10061 case 'u':
ad64d0ec 10062 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 10063 break;
653099ff 10064 case 'f':
f8fc2ecf 10065 Safefree(ri->data->data[n]);
653099ff 10066 break;
dfad63ad 10067 case 'p':
502c6561 10068 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 10069 break;
c277df42 10070 case 'o':
dfad63ad 10071 if (new_comppad == NULL)
cea2e8a9 10072 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
10073 PAD_SAVE_LOCAL(old_comppad,
10074 /* Watch out for global destruction's random ordering. */
c445ea15 10075 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 10076 );
b34c0dd4 10077 OP_REFCNT_LOCK;
f8fc2ecf 10078 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
10079 OP_REFCNT_UNLOCK;
10080 if (!refcnt)
f8fc2ecf 10081 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 10082
f3548bdc 10083 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 10084 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 10085 new_comppad = NULL;
c277df42
IZ
10086 break;
10087 case 'n':
9e55ce06 10088 break;
07be1b83 10089 case 'T':
be8e71aa
YO
10090 { /* Aho Corasick add-on structure for a trie node.
10091 Used in stclass optimization only */
07be1b83 10092 U32 refcount;
f8fc2ecf 10093 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
10094 OP_REFCNT_LOCK;
10095 refcount = --aho->refcount;
10096 OP_REFCNT_UNLOCK;
10097 if ( !refcount ) {
446bd890
NC
10098 PerlMemShared_free(aho->states);
10099 PerlMemShared_free(aho->fail);
446bd890
NC
10100 /* do this last!!!! */
10101 PerlMemShared_free(ri->data->data[n]);
10102 PerlMemShared_free(ri->regstclass);
07be1b83
YO
10103 }
10104 }
10105 break;
a3621e74 10106 case 't':
07be1b83 10107 {
be8e71aa 10108 /* trie structure. */
07be1b83 10109 U32 refcount;
f8fc2ecf 10110 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
10111 OP_REFCNT_LOCK;
10112 refcount = --trie->refcount;
10113 OP_REFCNT_UNLOCK;
10114 if ( !refcount ) {
446bd890 10115 PerlMemShared_free(trie->charmap);
446bd890
NC
10116 PerlMemShared_free(trie->states);
10117 PerlMemShared_free(trie->trans);
07be1b83 10118 if (trie->bitmap)
446bd890 10119 PerlMemShared_free(trie->bitmap);
786e8c11 10120 if (trie->jump)
446bd890 10121 PerlMemShared_free(trie->jump);
2e64971a 10122 PerlMemShared_free(trie->wordinfo);
446bd890
NC
10123 /* do this last!!!! */
10124 PerlMemShared_free(ri->data->data[n]);
a3621e74 10125 }
07be1b83
YO
10126 }
10127 break;
c277df42 10128 default:
f8fc2ecf 10129 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
10130 }
10131 }
f8fc2ecf
YO
10132 Safefree(ri->data->what);
10133 Safefree(ri->data);
a0d0e21e 10134 }
28d8d7f4 10135
f8fc2ecf 10136 Safefree(ri);
a687059c 10137}
c277df42 10138
a09252eb
NC
10139#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10140#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
10141#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10142
10143/*
32cd70f6 10144 re_dup - duplicate a regexp.
84da74a7 10145
8233f606
DM
10146 This routine is expected to clone a given regexp structure. It is only
10147 compiled under USE_ITHREADS.
32cd70f6 10148
f8149455
YO
10149 After all of the core data stored in struct regexp is duplicated
10150 the regexp_engine.dupe method is used to copy any private data
10151 stored in the *pprivate pointer. This allows extensions to handle
10152 any duplication it needs to do.
10153
10154 See pregfree() and regfree_internal() if you change anything here.
84da74a7 10155*/
a3c0e9ca 10156#if defined(USE_ITHREADS)
f8149455 10157#ifndef PERL_IN_XSUB_RE
288b8c02
NC
10158void
10159Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 10160{
84da74a7 10161 dVAR;
a86a1ca7 10162 I32 npar;
288b8c02
NC
10163 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10164 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 10165
7918f24d
NC
10166 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10167
84da74a7 10168 npar = r->nparens+1;
f0ab9afb
NC
10169 Newx(ret->offs, npar, regexp_paren_pair);
10170 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 10171 if(ret->swap) {
28d8d7f4 10172 /* no need to copy these */
f0ab9afb 10173 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 10174 }
84da74a7 10175
6057429f 10176 if (ret->substrs) {
32cd70f6
NC
10177 /* Do it this way to avoid reading from *r after the StructCopy().
10178 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10179 cache, it doesn't matter. */
66b1de87
NC
10180 const bool anchored = r->check_substr
10181 ? r->check_substr == r->anchored_substr
10182 : r->check_utf8 == r->anchored_utf8;
785a26d5 10183 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
10184 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10185
32cd70f6
NC
10186 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10187 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10188 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10189 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 10190
32cd70f6
NC
10191 /* check_substr and check_utf8, if non-NULL, point to either their
10192 anchored or float namesakes, and don't hold a second reference. */
10193
10194 if (ret->check_substr) {
10195 if (anchored) {
10196 assert(r->check_utf8 == r->anchored_utf8);
10197 ret->check_substr = ret->anchored_substr;
10198 ret->check_utf8 = ret->anchored_utf8;
10199 } else {
10200 assert(r->check_substr == r->float_substr);
10201 assert(r->check_utf8 == r->float_utf8);
10202 ret->check_substr = ret->float_substr;
10203 ret->check_utf8 = ret->float_utf8;
10204 }
66b1de87
NC
10205 } else if (ret->check_utf8) {
10206 if (anchored) {
10207 ret->check_utf8 = ret->anchored_utf8;
10208 } else {
10209 ret->check_utf8 = ret->float_utf8;
10210 }
32cd70f6 10211 }
6057429f 10212 }
f8149455 10213
5daac39c 10214 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 10215
6057429f 10216 if (ret->pprivate)
288b8c02 10217 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 10218
288b8c02 10219 if (RX_MATCH_COPIED(dstr))
6057429f 10220 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
10221 else
10222 ret->subbeg = NULL;
10223#ifdef PERL_OLD_COPY_ON_WRITE
10224 ret->saved_copy = NULL;
10225#endif
6057429f 10226
c2123ae3
NC
10227 if (ret->mother_re) {
10228 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10229 /* Our storage points directly to our mother regexp, but that's
10230 1: a buffer in a different thread
10231 2: something we no longer hold a reference on
10232 so we need to copy it locally. */
10233 /* Note we need to sue SvCUR() on our mother_re, because it, in
10234 turn, may well be pointing to its own mother_re. */
10235 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10236 SvCUR(ret->mother_re)+1));
10237 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10238 }
10239 ret->mother_re = NULL;
10240 }
6057429f 10241 ret->gofs = 0;
f8149455
YO
10242}
10243#endif /* PERL_IN_XSUB_RE */
10244
10245/*
10246 regdupe_internal()
10247
10248 This is the internal complement to regdupe() which is used to copy
10249 the structure pointed to by the *pprivate pointer in the regexp.
10250 This is the core version of the extension overridable cloning hook.
10251 The regexp structure being duplicated will be copied by perl prior
10252 to this and will be provided as the regexp *r argument, however
10253 with the /old/ structures pprivate pointer value. Thus this routine
10254 may override any copying normally done by perl.
10255
10256 It returns a pointer to the new regexp_internal structure.
10257*/
10258
10259void *
288b8c02 10260Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10261{
10262 dVAR;
288b8c02 10263 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10264 regexp_internal *reti;
10265 int len, npar;
10266 RXi_GET_DECL(r,ri);
7918f24d
NC
10267
10268 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10269
10270 npar = r->nparens+1;
7122b237 10271 len = ProgLen(ri);
f8149455 10272
45cf4570 10273 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10274 Copy(ri->program, reti->program, len+1, regnode);
10275
f8149455 10276
f8fc2ecf 10277 reti->regstclass = NULL;
bcdf7404 10278
f8fc2ecf 10279 if (ri->data) {
84da74a7 10280 struct reg_data *d;
f8fc2ecf 10281 const int count = ri->data->count;
84da74a7
YO
10282 int i;
10283
10284 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10285 char, struct reg_data);
10286 Newx(d->what, count, U8);
10287
10288 d->count = count;
10289 for (i = 0; i < count; i++) {
f8fc2ecf 10290 d->what[i] = ri->data->what[i];
84da74a7 10291 switch (d->what[i]) {
af534a04 10292 /* legal options are one of: sSfpontTua
84da74a7 10293 see also regcomp.h and pregfree() */
af534a04 10294 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10295 case 's':
81714fb9 10296 case 'S':
0536c0a7 10297 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10298 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10299 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10300 break;
84da74a7
YO
10301 case 'f':
10302 /* This is cheating. */
10303 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10304 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10305 struct regnode_charclass_class);
f8fc2ecf 10306 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10307 break;
10308 case 'o':
bbe252da
YO
10309 /* Compiled op trees are readonly and in shared memory,
10310 and can thus be shared without duplication. */
84da74a7 10311 OP_REFCNT_LOCK;
f8fc2ecf 10312 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10313 OP_REFCNT_UNLOCK;
10314 break;
23eab42c
NC
10315 case 'T':
10316 /* Trie stclasses are readonly and can thus be shared
10317 * without duplication. We free the stclass in pregfree
10318 * when the corresponding reg_ac_data struct is freed.
10319 */
10320 reti->regstclass= ri->regstclass;
10321 /* Fall through */
84da74a7 10322 case 't':
84da74a7 10323 OP_REFCNT_LOCK;
0536c0a7 10324 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10325 OP_REFCNT_UNLOCK;
0536c0a7
NC
10326 /* Fall through */
10327 case 'n':
10328 d->data[i] = ri->data->data[i];
84da74a7 10329 break;
84da74a7 10330 default:
f8fc2ecf 10331 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10332 }
10333 }
10334
f8fc2ecf 10335 reti->data = d;
84da74a7
YO
10336 }
10337 else
f8fc2ecf 10338 reti->data = NULL;
84da74a7 10339
cde0cee5
YO
10340 reti->name_list_idx = ri->name_list_idx;
10341
7122b237
YO
10342#ifdef RE_TRACK_PATTERN_OFFSETS
10343 if (ri->u.offsets) {
10344 Newx(reti->u.offsets, 2*len+1, U32);
10345 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10346 }
10347#else
10348 SetProgLen(reti,len);
10349#endif
10350
f8149455 10351 return (void*)reti;
84da74a7 10352}
f8149455
YO
10353
10354#endif /* USE_ITHREADS */
84da74a7 10355
f8149455 10356#ifndef PERL_IN_XSUB_RE
bcdf7404 10357
c277df42
IZ
10358/*
10359 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10360 */
10361regnode *
864dbfa3 10362Perl_regnext(pTHX_ register regnode *p)
c277df42 10363{
97aff369 10364 dVAR;
c277df42
IZ
10365 register I32 offset;
10366
f8fc2ecf 10367 if (!p)
c277df42
IZ
10368 return(NULL);
10369
35db910f
KW
10370 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10371 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10372 }
10373
c277df42
IZ
10374 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10375 if (offset == 0)
10376 return(NULL);
10377
c277df42 10378 return(p+offset);
c277df42 10379}
76234dfb 10380#endif
c277df42 10381
01f988be 10382STATIC void
cea2e8a9 10383S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10384{
10385 va_list args;
10386 STRLEN l1 = strlen(pat1);
10387 STRLEN l2 = strlen(pat2);
10388 char buf[512];
06bf62c7 10389 SV *msv;
73d840c0 10390 const char *message;
c277df42 10391
7918f24d
NC
10392 PERL_ARGS_ASSERT_RE_CROAK2;
10393
c277df42
IZ
10394 if (l1 > 510)
10395 l1 = 510;
10396 if (l1 + l2 > 510)
10397 l2 = 510 - l1;
10398 Copy(pat1, buf, l1 , char);
10399 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10400 buf[l1 + l2] = '\n';
10401 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10402#ifdef I_STDARG
10403 /* ANSI variant takes additional second argument */
c277df42 10404 va_start(args, pat2);
8736538c
AS
10405#else
10406 va_start(args);
10407#endif
5a844595 10408 msv = vmess(buf, &args);
c277df42 10409 va_end(args);
cfd0369c 10410 message = SvPV_const(msv,l1);
c277df42
IZ
10411 if (l1 > 512)
10412 l1 = 512;
10413 Copy(message, buf, l1 , char);
197cf9b9 10414 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10415 Perl_croak(aTHX_ "%s", buf);
c277df42 10416}
a0ed51b3
LW
10417
10418/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10419
76234dfb 10420#ifndef PERL_IN_XSUB_RE
a0ed51b3 10421void
864dbfa3 10422Perl_save_re_context(pTHX)
b81d288d 10423{
97aff369 10424 dVAR;
1ade1aa1
NC
10425
10426 struct re_save_state *state;
10427
10428 SAVEVPTR(PL_curcop);
10429 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10430
10431 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10432 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10433 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10434
46ab3289 10435 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10436
a0ed51b3 10437 PL_reg_start_tmp = 0;
a0ed51b3 10438 PL_reg_start_tmpl = 0;
c445ea15 10439 PL_reg_oldsaved = NULL;
a5db57d6 10440 PL_reg_oldsavedlen = 0;
a5db57d6 10441 PL_reg_maxiter = 0;
a5db57d6 10442 PL_reg_leftiter = 0;
c445ea15 10443 PL_reg_poscache = NULL;
a5db57d6 10444 PL_reg_poscache_size = 0;
1ade1aa1
NC
10445#ifdef PERL_OLD_COPY_ON_WRITE
10446 PL_nrs = NULL;
10447#endif
ada6e8a9 10448
c445ea15
AL
10449 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10450 if (PL_curpm) {
10451 const REGEXP * const rx = PM_GETRE(PL_curpm);
10452 if (rx) {
1df70142 10453 U32 i;
07bc277f 10454 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10455 char digits[TYPE_CHARS(long)];
d9fad198 10456 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10457 GV *const *const gvp
10458 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10459
b37c2d43
AL
10460 if (gvp) {
10461 GV * const gv = *gvp;
10462 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10463 save_scalar(gv);
49f27e4b 10464 }
ada6e8a9
AMS
10465 }
10466 }
10467 }
a0ed51b3 10468}
76234dfb 10469#endif
51371543 10470
51371543 10471static void
acfe0abc 10472clear_re(pTHX_ void *r)
51371543 10473{
97aff369 10474 dVAR;
84679df5 10475 ReREFCNT_dec((REGEXP *)r);
51371543 10476}
ffbc6a93 10477
a28509cc
AL
10478#ifdef DEBUGGING
10479
10480STATIC void
10481S_put_byte(pTHX_ SV *sv, int c)
10482{
7918f24d
NC
10483 PERL_ARGS_ASSERT_PUT_BYTE;
10484
7fddd944
NC
10485 /* Our definition of isPRINT() ignores locales, so only bytes that are
10486 not part of UTF-8 are considered printable. I assume that the same
10487 holds for UTF-EBCDIC.
10488 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10489 which Wikipedia says:
10490
10491 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10492 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10493 identical, to the ASCII delete (DEL) or rubout control character.
10494 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
10495 if (!isPRINT(c)) {
10496 if (c < 256) {
10497 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10498 }
10499 else {
10500 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10501 }
10502 }
5e7aa789 10503 else {
88c9ea1e 10504 const char string = c;
5e7aa789
NC
10505 if (c == '-' || c == ']' || c == '\\' || c == '^')
10506 sv_catpvs(sv, "\\");
10507 sv_catpvn(sv, &string, 1);
10508 }
a28509cc
AL
10509}
10510
786e8c11 10511
3dab1dad
YO
10512#define CLEAR_OPTSTART \
10513 if (optstart) STMT_START { \
70685ca0 10514 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10515 optstart=NULL; \
10516 } STMT_END
10517
786e8c11 10518#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10519
b5a2f8d8
NC
10520STATIC const regnode *
10521S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10522 const regnode *last, const regnode *plast,
10523 SV* sv, I32 indent, U32 depth)
a28509cc 10524{
97aff369 10525 dVAR;
786e8c11 10526 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10527 register const regnode *next;
3dab1dad 10528 const regnode *optstart= NULL;
1f1031fe 10529
f8fc2ecf 10530 RXi_GET_DECL(r,ri);
3dab1dad 10531 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10532
10533 PERL_ARGS_ASSERT_DUMPUNTIL;
10534
786e8c11
YO
10535#ifdef DEBUG_DUMPUNTIL
10536 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10537 last ? last-start : 0,plast ? plast-start : 0);
10538#endif
10539
10540 if (plast && plast < last)
10541 last= plast;
10542
10543 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10544 /* While that wasn't END last time... */
a28509cc
AL
10545 NODE_ALIGN(node);
10546 op = OP(node);
de734bd5 10547 if (op == CLOSE || op == WHILEM)
786e8c11 10548 indent--;
b5a2f8d8 10549 next = regnext((regnode *)node);
1f1031fe 10550
a28509cc 10551 /* Where, what. */
8e11feef 10552 if (OP(node) == OPTIMIZED) {
e68ec53f 10553 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10554 optstart = node;
3dab1dad 10555 else
8e11feef 10556 goto after_print;
3dab1dad
YO
10557 } else
10558 CLEAR_OPTSTART;
1f1031fe 10559
32fc9b6a 10560 regprop(r, sv, node);
a28509cc 10561 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10562 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10563
10564 if (OP(node) != OPTIMIZED) {
10565 if (next == NULL) /* Next ptr. */
10566 PerlIO_printf(Perl_debug_log, " (0)");
10567 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10568 PerlIO_printf(Perl_debug_log, " (FAIL)");
10569 else
10570 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10571 (void)PerlIO_putc(Perl_debug_log, '\n');
10572 }
10573
a28509cc
AL
10574 after_print:
10575 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10576 assert(next);
10577 {
10578 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10579 ? regnext((regnode *)next)
10580 : next);
be8e71aa
YO
10581 if (last && nnode > last)
10582 nnode = last;
786e8c11 10583 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10584 }
a28509cc
AL
10585 }
10586 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10587 assert(next);
786e8c11 10588 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10589 }
10590 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10591 const regnode *this_trie = node;
1de06328 10592 const char op = OP(node);
647f639f 10593 const U32 n = ARG(node);
1de06328 10594 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10595 (reg_ac_data *)ri->data->data[n] :
1de06328 10596 NULL;
3251b653
NC
10597 const reg_trie_data * const trie =
10598 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10599#ifdef DEBUGGING
502c6561 10600 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10601#endif
786e8c11 10602 const regnode *nextbranch= NULL;
a28509cc 10603 I32 word_idx;
76f68e9b 10604 sv_setpvs(sv, "");
786e8c11 10605 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10606 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10607
10608 PerlIO_printf(Perl_debug_log, "%*s%s ",
10609 (int)(2*(indent+3)), "",
10610 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10611 PL_colors[0], PL_colors[1],
10612 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10613 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10614 PERL_PV_PRETTY_LTGT
786e8c11
YO
10615 )
10616 : "???"
10617 );
10618 if (trie->jump) {
40d049e4 10619 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10620 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10621 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10622 if (dist) {
10623 if (!nextbranch)
24b23f37 10624 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10625 DUMPUNTIL(this_trie + dist, nextbranch);
10626 }
786e8c11
YO
10627 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10628 nextbranch= regnext((regnode *)nextbranch);
10629 } else {
10630 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10631 }
786e8c11
YO
10632 }
10633 if (last && next > last)
10634 node= last;
10635 else
10636 node= next;
a28509cc 10637 }
786e8c11
YO
10638 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10639 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10640 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10641 }
10642 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10643 assert(next);
786e8c11 10644 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10645 }
10646 else if ( op == PLUS || op == STAR) {
786e8c11 10647 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10648 }
10649 else if (op == ANYOF) {
10650 /* arglen 1 + class block */
4a3ee7a8 10651 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
10652 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10653 node = NEXTOPER(node);
10654 }
10655 else if (PL_regkind[(U8)op] == EXACT) {
10656 /* Literal string, where present. */
10657 node += NODE_SZ_STR(node) - 1;
10658 node = NEXTOPER(node);
10659 }
10660 else {
10661 node = NEXTOPER(node);
10662 node += regarglen[(U8)op];
10663 }
10664 if (op == CURLYX || op == OPEN)
786e8c11 10665 indent++;
a28509cc 10666 }
3dab1dad 10667 CLEAR_OPTSTART;
786e8c11 10668#ifdef DEBUG_DUMPUNTIL
70685ca0 10669 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10670#endif
1de06328 10671 return node;
a28509cc
AL
10672}
10673
10674#endif /* DEBUGGING */
10675
241d1a3b
NC
10676/*
10677 * Local variables:
10678 * c-indentation-style: bsd
10679 * c-basic-offset: 4
10680 * indent-tabs-mode: t
10681 * End:
10682 *
37442d52
RGS
10683 * ex: set ts=8 sts=4 sw=4 noet:
10684 */