This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Only call mro_package_moved on new substashes
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97
KW
371#define UTF cBOOL(RExC_utf8)
372#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 375
ffc61ed2 376#define OOB_UNICODE 12345678
93733859 377#define OOB_NAMEDCLASS -1
b8c5462f 378
a0ed51b3
LW
379#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
381
8615cb43 382
b45f050a
JF
383/* length of regex to show in messages that don't mark a position within */
384#define RegexLengthToShowInErrorMessages 127
385
386/*
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
390 */
7253e4e3
RK
391#define MARKER1 "<-- HERE" /* marker as it appears in the description */
392#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 393
7253e4e3 394#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
395
396/*
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
399 * "...".
400 */
58e23c8d 401#define _FAIL(code) STMT_START { \
bfed75c6 402 const char *ellipses = ""; \
ccb2c380
MP
403 IV len = RExC_end - RExC_precomp; \
404 \
405 if (!SIZE_ONLY) \
288b8c02 406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
410 ellipses = "..."; \
411 } \
58e23c8d 412 code; \
ccb2c380 413} STMT_END
8615cb43 414
58e23c8d
YO
415#define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
418
419#define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
422
b45f050a 423/*
b45f050a
JF
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
425 */
ccb2c380 426#define Simple_vFAIL(m) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
434 */
ccb2c380
MP
435#define vFAIL(m) STMT_START { \
436 if (!SIZE_ONLY) \
288b8c02 437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
438 Simple_vFAIL(m); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts two arguments.
443 */
ccb2c380 444#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
452 */
ccb2c380
MP
453#define vFAIL2(m,a1) STMT_START { \
454 if (!SIZE_ONLY) \
288b8c02 455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
456 Simple_vFAIL2(m, a1); \
457} STMT_END
b45f050a
JF
458
459
460/*
461 * Like Simple_vFAIL(), but accepts three arguments.
462 */
ccb2c380 463#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 464 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
471 */
ccb2c380
MP
472#define vFAIL3(m,a1,a2) STMT_START { \
473 if (!SIZE_ONLY) \
288b8c02 474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
475 Simple_vFAIL3(m, a1, a2); \
476} STMT_END
b45f050a
JF
477
478/*
479 * Like Simple_vFAIL(), but accepts four arguments.
480 */
ccb2c380 481#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 482 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
b45f050a 486
668c081a 487#define ckWARNreg(loc,m) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
491} STMT_END
492
668c081a 493#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
d1d15184 495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
496 m REPORT_LOCATION, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
498} STMT_END
499
668c081a 500#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 501 const IV offset = loc - RExC_precomp; \
668c081a 502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
504} STMT_END
505
506#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 507 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
510} STMT_END
511
668c081a
NC
512#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
ccb2c380 518#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
9d1d55b5 535
8615cb43 536
cd439c50 537/* Allow for side effects in s */
ccb2c380
MP
538#define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
540} STMT_END
cd439c50 541
fac92740
MJD
542/* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
07be1b83 546 * Position is 1 indexed.
fac92740 547 */
7122b237
YO
548#ifndef RE_TRACK_PATTERN_OFFSETS
549#define Set_Node_Offset_To_R(node,byte)
550#define Set_Node_Offset(node,byte)
551#define Set_Cur_Node_Offset
552#define Set_Node_Length_To_R(node,len)
553#define Set_Node_Length(node,len)
554#define Set_Node_Cur_Length(node)
555#define Node_Offset(n)
556#define Node_Length(n)
557#define Set_Node_Offset_Length(node,offset,len)
558#define ProgLen(ri) ri->u.proglen
559#define SetProgLen(ri,x) ri->u.proglen = x
560#else
561#define ProgLen(ri) ri->u.offsets[0]
562#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
563#define Set_Node_Offset_To_R(node,byte) STMT_START { \
564 if (! SIZE_ONLY) { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 566 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 567 if((node) < 0) { \
551405c4 568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
569 } else { \
570 RExC_offsets[2*(node)-1] = (byte); \
571 } \
572 } \
573} STMT_END
574
575#define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
578
579#define Set_Node_Length_To_R(node,len) STMT_START { \
580 if (! SIZE_ONLY) { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 582 __LINE__, (int)(node), (int)(len))); \
ccb2c380 583 if((node) < 0) { \
551405c4 584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
585 } else { \
586 RExC_offsets[2*(node)] = (len); \
587 } \
588 } \
589} STMT_END
590
591#define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594#define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
596
597/* Get offsets and lengths */
598#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
600
07be1b83
YO
601#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
604} STMT_END
7122b237 605#endif
07be1b83
YO
606
607#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608#define EXPERIMENTAL_INPLACESCAN
f427392e 609#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 610
304ee84b
YO
611#define DEBUG_STUDYDATA(str,data,depth) \
612DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 613 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
304ee84b 619 (UV)((data)->flags), \
1de06328 620 (IV)((data)->whilem_c), \
304ee84b
YO
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
1de06328
YO
623 ); \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
641 ); \
642 PerlIO_printf(Perl_debug_log,"\n"); \
643});
644
acfe0abc 645static void clear_re(pTHX_ void *r);
4327152a 646
653099ff 647/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 648 Update the longest found anchored substring and the longest found
653099ff
GS
649 floating substrings if needed. */
650
4327152a 651STATIC void
304ee84b 652S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 653{
e1ec3a88
AL
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 656 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 657
7918f24d
NC
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
659
c277df42 660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 661 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
b81d288d 665 data->flags
c277df42
IZ
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
667 else
668 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
a0ed51b3 671 }
304ee84b 672 else { /* *data->longest == data->longest_float */
c277df42 673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
674 data->offset_float_max = (l
675 ? data->last_start_max
c277df42 676 : data->pos_min + data->pos_delta);
304ee84b 677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 678 data->offset_float_max = I32_MAX;
c277df42 679 if (data->flags & SF_BEFORE_EOL)
b81d288d 680 data->flags
c277df42
IZ
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
682 else
683 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
c277df42
IZ
686 }
687 }
688 SvCUR_set(data->last_found, 0);
0eda9292 689 {
a28509cc 690 SV * const sv = data->last_found;
097eb12c
AL
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
693 if (mg)
694 mg->mg_len = 0;
695 }
0eda9292 696 }
c277df42
IZ
697 data->last_end = -1;
698 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 699 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
700}
701
653099ff
GS
702/* Can match anything (initialization) */
703STATIC void
097eb12c 704S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 705{
7918f24d
NC
706 PERL_ARGS_ASSERT_CL_ANYTHING;
707
653099ff 708 ANYOF_CLASS_ZERO(cl);
f8bef550 709 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
8951c461 713 cl->flags |= ANYOF_FOLD;
653099ff
GS
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
653099ff
GS
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
771 int i;
772
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
776 else
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
1aa99e6b 782
8951c461
KW
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
785
3ff7ceb3 786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 787 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 788 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
789 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
790 only the one(s)
791 actually set */
1aa99e6b
IH
792 ARG_SET(cl, ARG(and_with));
793 }
14ebb1a2
JH
794 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
795 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 796 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 797 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 798 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 799 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
800}
801
802/* 'OR' a given class with another one. Can create false positives */
803/* We assume that cl is not inverted */
804STATIC void
097eb12c 805S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 806{
7918f24d
NC
807 PERL_ARGS_ASSERT_CL_OR;
808
653099ff
GS
809 if (or_with->flags & ANYOF_INVERT) {
810 /* We do not use
811 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
812 * <= (B1 | !B2) | (CL1 | !CL2)
813 * which is wasteful if CL2 is small, but we ignore CL2:
814 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
815 * XXXX Can we handle case-fold? Unclear:
816 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
817 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
818 */
819 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
820 && !(or_with->flags & ANYOF_FOLD)
821 && !(cl->flags & ANYOF_FOLD) ) {
822 int i;
823
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= ~or_with->bitmap[i];
826 } /* XXXX: logic is complicated otherwise */
827 else {
830247a4 828 cl_anything(pRExC_state, cl);
653099ff
GS
829 }
830 } else {
831 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
832 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 833 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
834 || (cl->flags & ANYOF_FOLD)) ) {
835 int i;
836
837 /* OR char bitmap and class bitmap separately */
838 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
839 cl->bitmap[i] |= or_with->bitmap[i];
840 if (or_with->flags & ANYOF_CLASS) {
841 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
842 cl->classflags[i] |= or_with->classflags[i];
843 cl->flags |= ANYOF_CLASS;
844 }
845 }
846 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 847 cl_anything(pRExC_state, cl);
653099ff
GS
848 }
849 }
850 if (or_with->flags & ANYOF_EOS)
851 cl->flags |= ANYOF_EOS;
1aa99e6b 852
8951c461
KW
853 if (or_with->flags & ANYOF_FOLD)
854 cl->flags |= ANYOF_FOLD;
855
9826f543
KW
856 /* If both nodes match something outside the bitmap, but what they match
857 * outside is not the same pointer, and hence not easily compared, give up
858 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 859 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
860 ARG(cl) != ARG(or_with)) {
861 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 862 }
9826f543 863
1aa99e6b
IH
864 if (or_with->flags & ANYOF_UNICODE_ALL) {
865 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 866 }
653099ff
GS
867}
868
a3621e74
YO
869#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
870#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
871#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
872#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
873
3dab1dad
YO
874
875#ifdef DEBUGGING
07be1b83 876/*
2b8b4781
NC
877 dump_trie(trie,widecharmap,revcharmap)
878 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
879 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
880
881 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
882 The _interim_ variants are used for debugging the interim
883 tables that are used to generate the final compressed
884 representation which is what dump_trie expects.
885
3dab1dad
YO
886 Part of the reason for their existance is to provide a form
887 of documentation as to how the different representations function.
07be1b83
YO
888
889*/
3dab1dad
YO
890
891/*
3dab1dad
YO
892 Dumps the final compressed table form of the trie to Perl_debug_log.
893 Used for debugging make_trie().
894*/
b9a59e08 895
3dab1dad 896STATIC void
2b8b4781
NC
897S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
898 AV *revcharmap, U32 depth)
3dab1dad
YO
899{
900 U32 state;
ab3bbdeb 901 SV *sv=sv_newmortal();
55eed653 902 int colwidth= widecharmap ? 6 : 4;
2e64971a 903 U16 word;
3dab1dad
YO
904 GET_RE_DEBUG_FLAGS_DECL;
905
7918f24d 906 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 907
3dab1dad
YO
908 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
909 (int)depth * 2 + 2,"",
910 "Match","Base","Ofs" );
911
912 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 913 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 914 if ( tmp ) {
ab3bbdeb
YO
915 PerlIO_printf( Perl_debug_log, "%*s",
916 colwidth,
ddc5bc0f 917 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
918 PL_colors[0], PL_colors[1],
919 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
920 PERL_PV_ESCAPE_FIRSTCHAR
921 )
922 );
3dab1dad
YO
923 }
924 }
925 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
926 (int)depth * 2 + 2,"");
927
928 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 929 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
930 PerlIO_printf( Perl_debug_log, "\n");
931
1e2e3d02 932 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 933 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
934
935 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
936
937 if ( trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
939 } else {
940 PerlIO_printf( Perl_debug_log, "%6s", "" );
941 }
942
943 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
944
945 if ( base ) {
946 U32 ofs = 0;
947
948 while( ( base + ofs < trie->uniquecharcount ) ||
949 ( base + ofs - trie->uniquecharcount < trie->lasttrans
950 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
951 ofs++;
952
953 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
954
955 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
956 if ( ( base + ofs >= trie->uniquecharcount ) &&
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
958 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
959 {
ab3bbdeb
YO
960 PerlIO_printf( Perl_debug_log, "%*"UVXf,
961 colwidth,
3dab1dad
YO
962 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
963 } else {
ab3bbdeb 964 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
965 }
966 }
967
968 PerlIO_printf( Perl_debug_log, "]");
969
970 }
971 PerlIO_printf( Perl_debug_log, "\n" );
972 }
2e64971a
DM
973 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
974 for (word=1; word <= trie->wordcount; word++) {
975 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
976 (int)word, (int)(trie->wordinfo[word].prev),
977 (int)(trie->wordinfo[word].len));
978 }
979 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
980}
981/*
3dab1dad
YO
982 Dumps a fully constructed but uncompressed trie in list form.
983 List tries normally only are used for construction when the number of
984 possible chars (trie->uniquecharcount) is very high.
985 Used for debugging make_trie().
986*/
987STATIC void
55eed653 988S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
989 HV *widecharmap, AV *revcharmap, U32 next_alloc,
990 U32 depth)
3dab1dad
YO
991{
992 U32 state;
ab3bbdeb 993 SV *sv=sv_newmortal();
55eed653 994 int colwidth= widecharmap ? 6 : 4;
3dab1dad 995 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
996
997 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
998
3dab1dad 999 /* print out the table precompression. */
ab3bbdeb
YO
1000 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002 "------:-----+-----------------\n" );
3dab1dad
YO
1003
1004 for( state=1 ; state < next_alloc ; state ++ ) {
1005 U16 charid;
1006
ab3bbdeb 1007 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1008 (int)depth * 2 + 2,"", (UV)state );
1009 if ( ! trie->states[ state ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, "%5s| ","");
1011 } else {
1012 PerlIO_printf( Perl_debug_log, "W%4x| ",
1013 trie->states[ state ].wordnum
1014 );
1015 }
1016 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1017 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1018 if ( tmp ) {
1019 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1020 colwidth,
ddc5bc0f 1021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1022 PL_colors[0], PL_colors[1],
1023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024 PERL_PV_ESCAPE_FIRSTCHAR
1025 ) ,
1e2e3d02
YO
1026 TRIE_LIST_ITEM(state,charid).forid,
1027 (UV)TRIE_LIST_ITEM(state,charid).newstate
1028 );
1029 if (!(charid % 10))
664e119d
RGS
1030 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031 (int)((depth * 2) + 14), "");
1e2e3d02 1032 }
ab3bbdeb
YO
1033 }
1034 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1035 }
1036}
1037
1038/*
3dab1dad
YO
1039 Dumps a fully constructed but uncompressed trie in table form.
1040 This is the normal DFA style state transition table, with a few
1041 twists to facilitate compression later.
1042 Used for debugging make_trie().
1043*/
1044STATIC void
55eed653 1045S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1046 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1047 U32 depth)
3dab1dad
YO
1048{
1049 U32 state;
1050 U16 charid;
ab3bbdeb 1051 SV *sv=sv_newmortal();
55eed653 1052 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1053 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1054
1055 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1056
1057 /*
1058 print out the table precompression so that we can do a visual check
1059 that they are identical.
1060 */
1061
1062 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1063
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1065 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1066 if ( tmp ) {
ab3bbdeb
YO
1067 PerlIO_printf( Perl_debug_log, "%*s",
1068 colwidth,
ddc5bc0f 1069 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1070 PL_colors[0], PL_colors[1],
1071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072 PERL_PV_ESCAPE_FIRSTCHAR
1073 )
1074 );
3dab1dad
YO
1075 }
1076 }
1077
1078 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1079
1080 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1081 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1082 }
1083
1084 PerlIO_printf( Perl_debug_log, "\n" );
1085
1086 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1087
1088 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089 (int)depth * 2 + 2,"",
1090 (UV)TRIE_NODENUM( state ) );
1091
1092 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1093 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1094 if (v)
1095 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1096 else
1097 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1098 }
1099 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1101 } else {
1102 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1104 }
1105 }
07be1b83 1106}
3dab1dad
YO
1107
1108#endif
1109
2e64971a 1110
786e8c11
YO
1111/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112 startbranch: the first branch in the whole branch sequence
1113 first : start branch of sequence of branch-exact nodes.
1114 May be the same as startbranch
1115 last : Thing following the last branch.
1116 May be the same as tail.
1117 tail : item following the branch sequence
1118 count : words in the sequence
1119 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120 depth : indent depth
3dab1dad 1121
786e8c11 1122Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1123
786e8c11
YO
1124A trie is an N'ary tree where the branches are determined by digital
1125decomposition of the key. IE, at the root node you look up the 1st character and
1126follow that branch repeat until you find the end of the branches. Nodes can be
1127marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1128
786e8c11 1129 /he|she|his|hers/
72f13be8 1130
786e8c11
YO
1131would convert into the following structure. Numbers represent states, letters
1132following numbers represent valid transitions on the letter from that state, if
1133the number is in square brackets it represents an accepting state, otherwise it
1134will be in parenthesis.
07be1b83 1135
786e8c11
YO
1136 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1137 | |
1138 | (2)
1139 | |
1140 (1) +-i->(6)-+-s->[7]
1141 |
1142 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1143
786e8c11
YO
1144 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1145
1146This shows that when matching against the string 'hers' we will begin at state 1
1147read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150single traverse. We store a mapping from accepting to state to which word was
1151matched, and then when we have multiple possibilities we try to complete the
1152rest of the regex in the order in which they occured in the alternation.
1153
1154The only prior NFA like behaviour that would be changed by the TRIE support is
1155the silent ignoring of duplicate alternations which are of the form:
1156
1157 / (DUPE|DUPE) X? (?{ ... }) Y /x
1158
1159Thus EVAL blocks follwing a trie may be called a different number of times with
1160and without the optimisation. With the optimisations dupes will be silently
1161ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162the following demonstrates:
1163
1164 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1165
1166which prints out 'word' three times, but
1167
1168 'words'=~/(word|word|word)(?{ print $1 })S/
1169
1170which doesnt print it out at all. This is due to other optimisations kicking in.
1171
1172Example of what happens on a structural level:
1173
1174The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1175
1176 1: CURLYM[1] {1,32767}(18)
1177 5: BRANCH(8)
1178 6: EXACT <ac>(16)
1179 8: BRANCH(11)
1180 9: EXACT <ad>(16)
1181 11: BRANCH(14)
1182 12: EXACT <ab>(16)
1183 16: SUCCEED(0)
1184 17: NOTHING(18)
1185 18: END(0)
1186
1187This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188and should turn into:
1189
1190 1: CURLYM[1] {1,32767}(18)
1191 5: TRIE(16)
1192 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1193 <ac>
1194 <ad>
1195 <ab>
1196 16: SUCCEED(0)
1197 17: NOTHING(18)
1198 18: END(0)
1199
1200Cases where tail != last would be like /(?foo|bar)baz/:
1201
1202 1: BRANCH(4)
1203 2: EXACT <foo>(8)
1204 4: BRANCH(7)
1205 5: EXACT <bar>(8)
1206 7: TAIL(8)
1207 8: EXACT <baz>(10)
1208 10: END(0)
1209
1210which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211and would end up looking like:
1212
1213 1: TRIE(8)
1214 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1215 <foo>
1216 <bar>
1217 7: TAIL(8)
1218 8: EXACT <baz>(10)
1219 10: END(0)
1220
1221 d = uvuni_to_utf8_flags(d, uv, 0);
1222
1223is the recommended Unicode-aware way of saying
1224
1225 *(d++) = uv;
1226*/
1227
1e2e3d02 1228#define TRIE_STORE_REVCHAR \
786e8c11 1229 STMT_START { \
73031816
NC
1230 if (UTF) { \
1231 SV *zlopp = newSV(2); \
88c9ea1e
CB
1232 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1233 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1234 SvCUR_set(zlopp, kapow - flrbbbbb); \
1235 SvPOK_on(zlopp); \
1236 SvUTF8_on(zlopp); \
1237 av_push(revcharmap, zlopp); \
1238 } else { \
6bdeddd2 1239 char ooooff = (char)uvc; \
73031816
NC
1240 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1241 } \
1242 } STMT_END
786e8c11
YO
1243
1244#define TRIE_READ_CHAR STMT_START { \
1245 wordlen++; \
1246 if ( UTF ) { \
1247 if ( folder ) { \
1248 if ( foldlen > 0 ) { \
1249 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 scan += len; \
1252 len = 0; \
1253 } else { \
1254 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1256 foldlen -= UNISKIP( uvc ); \
1257 scan = foldbuf + UNISKIP( uvc ); \
1258 } \
1259 } else { \
1260 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1261 } \
1262 } else { \
1263 uvc = (U32)*uc; \
1264 len = 1; \
1265 } \
1266} STMT_END
1267
1268
1269
1270#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1271 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1272 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1273 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1274 } \
1275 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1277 TRIE_LIST_CUR( state )++; \
1278} STMT_END
07be1b83 1279
786e8c11
YO
1280#define TRIE_LIST_NEW(state) STMT_START { \
1281 Newxz( trie->states[ state ].trans.list, \
1282 4, reg_trie_trans_le ); \
1283 TRIE_LIST_CUR( state ) = 1; \
1284 TRIE_LIST_LEN( state ) = 4; \
1285} STMT_END
07be1b83 1286
786e8c11
YO
1287#define TRIE_HANDLE_WORD(state) STMT_START { \
1288 U16 dupe= trie->states[ state ].wordnum; \
1289 regnode * const noper_next = regnext( noper ); \
1290 \
786e8c11
YO
1291 DEBUG_r({ \
1292 /* store the word for dumping */ \
1293 SV* tmp; \
1294 if (OP(noper) != NOTHING) \
740cce10 1295 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1296 else \
740cce10 1297 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1298 av_push( trie_words, tmp ); \
786e8c11
YO
1299 }); \
1300 \
1301 curword++; \
2e64971a
DM
1302 trie->wordinfo[curword].prev = 0; \
1303 trie->wordinfo[curword].len = wordlen; \
1304 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1305 \
1306 if ( noper_next < tail ) { \
1307 if (!trie->jump) \
c944940b 1308 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1309 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1310 if (!jumper) \
1311 jumper = noper_next; \
1312 if (!nextbranch) \
1313 nextbranch= regnext(cur); \
1314 } \
1315 \
1316 if ( dupe ) { \
2e64971a
DM
1317 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1318 /* chain, so that when the bits of chain are later */\
1319 /* linked together, the dups appear in the chain */\
1320 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1322 } else { \
1323 /* we haven't inserted this word yet. */ \
1324 trie->states[ state ].wordnum = curword; \
1325 } \
1326} STMT_END
07be1b83 1327
3dab1dad 1328
786e8c11
YO
1329#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1330 ( ( base + charid >= ucharcount \
1331 && base + charid < ubound \
1332 && state == trie->trans[ base - ucharcount + charid ].check \
1333 && trie->trans[ base - ucharcount + charid ].next ) \
1334 ? trie->trans[ base - ucharcount + charid ].next \
1335 : ( state==1 ? special : 0 ) \
1336 )
3dab1dad 1337
786e8c11
YO
1338#define MADE_TRIE 1
1339#define MADE_JUMP_TRIE 2
1340#define MADE_EXACT_TRIE 4
3dab1dad 1341
a3621e74 1342STATIC I32
786e8c11 1343S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1344{
27da23d5 1345 dVAR;
a3621e74
YO
1346 /* first pass, loop through and scan words */
1347 reg_trie_data *trie;
55eed653 1348 HV *widecharmap = NULL;
2b8b4781 1349 AV *revcharmap = newAV();
a3621e74 1350 regnode *cur;
9f7f3913 1351 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1352 STRLEN len = 0;
1353 UV uvc = 0;
1354 U16 curword = 0;
1355 U32 next_alloc = 0;
786e8c11
YO
1356 regnode *jumper = NULL;
1357 regnode *nextbranch = NULL;
7f69552c 1358 regnode *convert = NULL;
2e64971a 1359 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1360 /* we just use folder as a flag in utf8 */
e1ec3a88 1361 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1362 ? PL_fold
1363 : ( flags == EXACTFL
1364 ? PL_fold_locale
1365 : NULL
1366 )
1367 );
1368
2b8b4781
NC
1369#ifdef DEBUGGING
1370 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1371 AV *trie_words = NULL;
1372 /* along with revcharmap, this only used during construction but both are
1373 * useful during debugging so we store them in the struct when debugging.
8e11feef 1374 */
2b8b4781
NC
1375#else
1376 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1377 STRLEN trie_charcount=0;
3dab1dad 1378#endif
2b8b4781 1379 SV *re_trie_maxbuff;
a3621e74 1380 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1381
1382 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1383#ifndef DEBUGGING
1384 PERL_UNUSED_ARG(depth);
1385#endif
a3621e74 1386
c944940b 1387 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1388 trie->refcount = 1;
3dab1dad 1389 trie->startstate = 1;
786e8c11 1390 trie->wordcount = word_count;
f8fc2ecf 1391 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1392 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1393 if (!(UTF && folder))
c944940b 1394 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1395 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1397
a3621e74 1398 DEBUG_r({
2b8b4781 1399 trie_words = newAV();
a3621e74 1400 });
a3621e74 1401
0111c4fd 1402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1403 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1405 }
3dab1dad
YO
1406 DEBUG_OPTIMISE_r({
1407 PerlIO_printf( Perl_debug_log,
786e8c11 1408 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1409 (int)depth * 2 + 2, "",
1410 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1411 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1412 (int)depth);
3dab1dad 1413 });
7f69552c
YO
1414
1415 /* Find the node we are going to overwrite */
1416 if ( first == startbranch && OP( last ) != BRANCH ) {
1417 /* whole branch chain */
1418 convert = first;
1419 } else {
1420 /* branch sub-chain */
1421 convert = NEXTOPER( first );
1422 }
1423
a3621e74
YO
1424 /* -- First loop and Setup --
1425
1426 We first traverse the branches and scan each word to determine if it
1427 contains widechars, and how many unique chars there are, this is
1428 important as we have to build a table with at least as many columns as we
1429 have unique chars.
1430
1431 We use an array of integers to represent the character codes 0..255
38a44b82 1432 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1433 native representation of the character value as the key and IV's for the
1434 coded index.
1435
1436 *TODO* If we keep track of how many times each character is used we can
1437 remap the columns so that the table compression later on is more
3b753521 1438 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1439 middle and the least common are on the outside. IMO this would be better
1440 than a most to least common mapping as theres a decent chance the most
1441 common letter will share a node with the least common, meaning the node
1442 will not be compressable. With a middle is most common approach the worst
1443 case is when we have the least common nodes twice.
1444
1445 */
1446
a3621e74 1447 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1448 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1449 const U8 *uc = (U8*)STRING( noper );
a28509cc 1450 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1451 STRLEN foldlen = 0;
1452 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1453 const U8 *scan = (U8*)NULL;
07be1b83 1454 U32 wordlen = 0; /* required init */
02daf0ab
YO
1455 STRLEN chars = 0;
1456 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1457
3dab1dad
YO
1458 if (OP(noper) == NOTHING) {
1459 trie->minlen= 0;
1460 continue;
1461 }
02daf0ab
YO
1462 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464 regardless of encoding */
1465
a3621e74 1466 for ( ; uc < e ; uc += len ) {
3dab1dad 1467 TRIE_CHARCOUNT(trie)++;
a3621e74 1468 TRIE_READ_CHAR;
3dab1dad 1469 chars++;
a3621e74
YO
1470 if ( uvc < 256 ) {
1471 if ( !trie->charmap[ uvc ] ) {
1472 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1473 if ( folder )
1474 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1475 TRIE_STORE_REVCHAR;
a3621e74 1476 }
02daf0ab
YO
1477 if ( set_bit ) {
1478 /* store the codepoint in the bitmap, and if its ascii
1479 also store its folded equivelent. */
1480 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1481
1482 /* store the folded codepoint */
1483 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1484
1485 if ( !UTF ) {
1486 /* store first byte of utf8 representation of
1487 codepoints in the 127 < uvc < 256 range */
1488 if (127 < uvc && uvc < 192) {
1489 TRIE_BITMAP_SET(trie,194);
1490 } else if (191 < uvc ) {
1491 TRIE_BITMAP_SET(trie,195);
1492 /* && uvc < 256 -- we know uvc is < 256 already */
1493 }
1494 }
02daf0ab
YO
1495 set_bit = 0; /* We've done our bit :-) */
1496 }
a3621e74
YO
1497 } else {
1498 SV** svpp;
55eed653
NC
1499 if ( !widecharmap )
1500 widecharmap = newHV();
a3621e74 1501
55eed653 1502 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1503
1504 if ( !svpp )
e4584336 1505 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1506
1507 if ( !SvTRUE( *svpp ) ) {
1508 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1509 TRIE_STORE_REVCHAR;
a3621e74
YO
1510 }
1511 }
1512 }
3dab1dad
YO
1513 if( cur == first ) {
1514 trie->minlen=chars;
1515 trie->maxlen=chars;
1516 } else if (chars < trie->minlen) {
1517 trie->minlen=chars;
1518 } else if (chars > trie->maxlen) {
1519 trie->maxlen=chars;
1520 }
1521
a3621e74
YO
1522 } /* end first pass */
1523 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1524 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1525 (int)depth * 2 + 2,"",
55eed653 1526 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1527 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1528 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1529 );
a3621e74
YO
1530
1531 /*
1532 We now know what we are dealing with in terms of unique chars and
1533 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1534 representation using a flat table will take. If it's over a reasonable
1535 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1536 conservative but potentially much slower representation using an array
1537 of lists.
1538
1539 At the end we convert both representations into the same compressed
1540 form that will be used in regexec.c for matching with. The latter
1541 is a form that cannot be used to construct with but has memory
1542 properties similar to the list form and access properties similar
1543 to the table form making it both suitable for fast searches and
1544 small enough that its feasable to store for the duration of a program.
1545
1546 See the comment in the code where the compressed table is produced
1547 inplace from the flat tabe representation for an explanation of how
1548 the compression works.
1549
1550 */
1551
1552
2e64971a
DM
1553 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1554 prev_states[1] = 0;
1555
3dab1dad 1556 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1557 /*
1558 Second Pass -- Array Of Lists Representation
1559
1560 Each state will be represented by a list of charid:state records
1561 (reg_trie_trans_le) the first such element holds the CUR and LEN
1562 points of the allocated array. (See defines above).
1563
1564 We build the initial structure using the lists, and then convert
1565 it into the compressed table form which allows faster lookups
1566 (but cant be modified once converted).
a3621e74
YO
1567 */
1568
a3621e74
YO
1569 STRLEN transcount = 1;
1570
1e2e3d02
YO
1571 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1572 "%*sCompiling trie using list compiler\n",
1573 (int)depth * 2 + 2, ""));
446bd890 1574
c944940b
JH
1575 trie->states = (reg_trie_state *)
1576 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1577 sizeof(reg_trie_state) );
a3621e74
YO
1578 TRIE_LIST_NEW(1);
1579 next_alloc = 2;
1580
1581 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1582
c445ea15
AL
1583 regnode * const noper = NEXTOPER( cur );
1584 U8 *uc = (U8*)STRING( noper );
1585 const U8 * const e = uc + STR_LEN( noper );
1586 U32 state = 1; /* required init */
1587 U16 charid = 0; /* sanity init */
1588 U8 *scan = (U8*)NULL; /* sanity init */
1589 STRLEN foldlen = 0; /* required init */
07be1b83 1590 U32 wordlen = 0; /* required init */
c445ea15
AL
1591 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1592
3dab1dad 1593 if (OP(noper) != NOTHING) {
786e8c11 1594 for ( ; uc < e ; uc += len ) {
c445ea15 1595
786e8c11 1596 TRIE_READ_CHAR;
c445ea15 1597
786e8c11
YO
1598 if ( uvc < 256 ) {
1599 charid = trie->charmap[ uvc ];
c445ea15 1600 } else {
55eed653 1601 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1602 if ( !svpp ) {
1603 charid = 0;
1604 } else {
1605 charid=(U16)SvIV( *svpp );
1606 }
c445ea15 1607 }
786e8c11
YO
1608 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1609 if ( charid ) {
a3621e74 1610
786e8c11
YO
1611 U16 check;
1612 U32 newstate = 0;
a3621e74 1613
786e8c11
YO
1614 charid--;
1615 if ( !trie->states[ state ].trans.list ) {
1616 TRIE_LIST_NEW( state );
c445ea15 1617 }
786e8c11
YO
1618 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1619 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1620 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1621 break;
1622 }
1623 }
1624 if ( ! newstate ) {
1625 newstate = next_alloc++;
2e64971a 1626 prev_states[newstate] = state;
786e8c11
YO
1627 TRIE_LIST_PUSH( state, charid, newstate );
1628 transcount++;
1629 }
1630 state = newstate;
1631 } else {
1632 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1633 }
a28509cc 1634 }
c445ea15 1635 }
3dab1dad 1636 TRIE_HANDLE_WORD(state);
a3621e74
YO
1637
1638 } /* end second pass */
1639
1e2e3d02
YO
1640 /* next alloc is the NEXT state to be allocated */
1641 trie->statecount = next_alloc;
c944940b
JH
1642 trie->states = (reg_trie_state *)
1643 PerlMemShared_realloc( trie->states,
1644 next_alloc
1645 * sizeof(reg_trie_state) );
a3621e74 1646
3dab1dad 1647 /* and now dump it out before we compress it */
2b8b4781
NC
1648 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1649 revcharmap, next_alloc,
1650 depth+1)
1e2e3d02 1651 );
a3621e74 1652
c944940b
JH
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1655 {
1656 U32 state;
a3621e74
YO
1657 U32 tp = 0;
1658 U32 zp = 0;
1659
1660
1661 for( state=1 ; state < next_alloc ; state ++ ) {
1662 U32 base=0;
1663
1664 /*
1665 DEBUG_TRIE_COMPILE_MORE_r(
1666 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1667 );
1668 */
1669
1670 if (trie->states[state].trans.list) {
1671 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1672 U16 maxid=minid;
a28509cc 1673 U16 idx;
a3621e74
YO
1674
1675 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1676 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1677 if ( forid < minid ) {
1678 minid=forid;
1679 } else if ( forid > maxid ) {
1680 maxid=forid;
1681 }
a3621e74
YO
1682 }
1683 if ( transcount < tp + maxid - minid + 1) {
1684 transcount *= 2;
c944940b
JH
1685 trie->trans = (reg_trie_trans *)
1686 PerlMemShared_realloc( trie->trans,
446bd890
NC
1687 transcount
1688 * sizeof(reg_trie_trans) );
a3621e74
YO
1689 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1690 }
1691 base = trie->uniquecharcount + tp - minid;
1692 if ( maxid == minid ) {
1693 U32 set = 0;
1694 for ( ; zp < tp ; zp++ ) {
1695 if ( ! trie->trans[ zp ].next ) {
1696 base = trie->uniquecharcount + zp - minid;
1697 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1698 trie->trans[ zp ].check = state;
1699 set = 1;
1700 break;
1701 }
1702 }
1703 if ( !set ) {
1704 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1705 trie->trans[ tp ].check = state;
1706 tp++;
1707 zp = tp;
1708 }
1709 } else {
1710 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1711 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1712 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1713 trie->trans[ tid ].check = state;
1714 }
1715 tp += ( maxid - minid + 1 );
1716 }
1717 Safefree(trie->states[ state ].trans.list);
1718 }
1719 /*
1720 DEBUG_TRIE_COMPILE_MORE_r(
1721 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1722 );
1723 */
1724 trie->states[ state ].trans.base=base;
1725 }
cc601c31 1726 trie->lasttrans = tp + 1;
a3621e74
YO
1727 }
1728 } else {
1729 /*
1730 Second Pass -- Flat Table Representation.
1731
1732 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1733 We know that we will need Charcount+1 trans at most to store the data
1734 (one row per char at worst case) So we preallocate both structures
1735 assuming worst case.
1736
1737 We then construct the trie using only the .next slots of the entry
1738 structs.
1739
3b753521 1740 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1741 make compression both faster and easier by keeping track of how many non
1742 zero fields are in the node.
1743
1744 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1745 transition.
1746
1747 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1748 number representing the first entry of the node, and state as a
1749 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1750 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1751 are 2 entrys per node. eg:
1752
1753 A B A B
1754 1. 2 4 1. 3 7
1755 2. 0 3 3. 0 5
1756 3. 0 0 5. 0 0
1757 4. 0 0 7. 0 0
1758
1759 The table is internally in the right hand, idx form. However as we also
1760 have to deal with the states array which is indexed by nodenum we have to
1761 use TRIE_NODENUM() to convert.
1762
1763 */
1e2e3d02
YO
1764 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1765 "%*sCompiling trie using table compiler\n",
1766 (int)depth * 2 + 2, ""));
3dab1dad 1767
c944940b
JH
1768 trie->trans = (reg_trie_trans *)
1769 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1770 * trie->uniquecharcount + 1,
1771 sizeof(reg_trie_trans) );
1772 trie->states = (reg_trie_state *)
1773 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1774 sizeof(reg_trie_state) );
a3621e74
YO
1775 next_alloc = trie->uniquecharcount + 1;
1776
3dab1dad 1777
a3621e74
YO
1778 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1779
c445ea15 1780 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1781 const U8 *uc = (U8*)STRING( noper );
1782 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1783
1784 U32 state = 1; /* required init */
1785
1786 U16 charid = 0; /* sanity init */
1787 U32 accept_state = 0; /* sanity init */
1788 U8 *scan = (U8*)NULL; /* sanity init */
1789
1790 STRLEN foldlen = 0; /* required init */
07be1b83 1791 U32 wordlen = 0; /* required init */
a3621e74
YO
1792 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1793
3dab1dad 1794 if ( OP(noper) != NOTHING ) {
786e8c11 1795 for ( ; uc < e ; uc += len ) {
a3621e74 1796
786e8c11 1797 TRIE_READ_CHAR;
a3621e74 1798
786e8c11
YO
1799 if ( uvc < 256 ) {
1800 charid = trie->charmap[ uvc ];
1801 } else {
55eed653 1802 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1803 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1804 }
786e8c11
YO
1805 if ( charid ) {
1806 charid--;
1807 if ( !trie->trans[ state + charid ].next ) {
1808 trie->trans[ state + charid ].next = next_alloc;
1809 trie->trans[ state ].check++;
2e64971a
DM
1810 prev_states[TRIE_NODENUM(next_alloc)]
1811 = TRIE_NODENUM(state);
786e8c11
YO
1812 next_alloc += trie->uniquecharcount;
1813 }
1814 state = trie->trans[ state + charid ].next;
1815 } else {
1816 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1817 }
1818 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1819 }
a3621e74 1820 }
3dab1dad
YO
1821 accept_state = TRIE_NODENUM( state );
1822 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1823
1824 } /* end second pass */
1825
3dab1dad 1826 /* and now dump it out before we compress it */
2b8b4781
NC
1827 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1828 revcharmap,
1829 next_alloc, depth+1));
a3621e74 1830
a3621e74
YO
1831 {
1832 /*
1833 * Inplace compress the table.*
1834
1835 For sparse data sets the table constructed by the trie algorithm will
1836 be mostly 0/FAIL transitions or to put it another way mostly empty.
1837 (Note that leaf nodes will not contain any transitions.)
1838
1839 This algorithm compresses the tables by eliminating most such
1840 transitions, at the cost of a modest bit of extra work during lookup:
1841
1842 - Each states[] entry contains a .base field which indicates the
1843 index in the state[] array wheres its transition data is stored.
1844
3b753521 1845 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1846
1847 - If .base is nonzero then charid is added to it to find an entry in
1848 the trans array.
1849
1850 -If trans[states[state].base+charid].check!=state then the
1851 transition is taken to be a 0/Fail transition. Thus if there are fail
1852 transitions at the front of the node then the .base offset will point
1853 somewhere inside the previous nodes data (or maybe even into a node
1854 even earlier), but the .check field determines if the transition is
1855 valid.
1856
786e8c11 1857 XXX - wrong maybe?
a3621e74 1858 The following process inplace converts the table to the compressed
3b753521 1859 table: We first do not compress the root node 1,and mark all its
a3621e74 1860 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1861 allows us to do a DFA construction from the compressed table later,
1862 and ensures that any .base pointers we calculate later are greater
1863 than 0.
a3621e74
YO
1864
1865 - We set 'pos' to indicate the first entry of the second node.
1866
1867 - We then iterate over the columns of the node, finding the first and
1868 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1869 and set the .check pointers accordingly, and advance pos
1870 appropriately and repreat for the next node. Note that when we copy
1871 the next pointers we have to convert them from the original
1872 NODEIDX form to NODENUM form as the former is not valid post
1873 compression.
1874
1875 - If a node has no transitions used we mark its base as 0 and do not
1876 advance the pos pointer.
1877
1878 - If a node only has one transition we use a second pointer into the
1879 structure to fill in allocated fail transitions from other states.
1880 This pointer is independent of the main pointer and scans forward
1881 looking for null transitions that are allocated to a state. When it
1882 finds one it writes the single transition into the "hole". If the
786e8c11 1883 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1884
1885 - Once compressed we can Renew/realloc the structures to release the
1886 excess space.
1887
1888 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1889 specifically Fig 3.47 and the associated pseudocode.
1890
1891 demq
1892 */
a3b680e6 1893 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1894 U32 state, charid;
a3621e74 1895 U32 pos = 0, zp=0;
1e2e3d02 1896 trie->statecount = laststate;
a3621e74
YO
1897
1898 for ( state = 1 ; state < laststate ; state++ ) {
1899 U8 flag = 0;
a28509cc
AL
1900 const U32 stateidx = TRIE_NODEIDX( state );
1901 const U32 o_used = trie->trans[ stateidx ].check;
1902 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1903 trie->trans[ stateidx ].check = 0;
1904
1905 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1906 if ( flag || trie->trans[ stateidx + charid ].next ) {
1907 if ( trie->trans[ stateidx + charid ].next ) {
1908 if (o_used == 1) {
1909 for ( ; zp < pos ; zp++ ) {
1910 if ( ! trie->trans[ zp ].next ) {
1911 break;
1912 }
1913 }
1914 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1915 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ zp ].check = state;
1917 if ( ++zp > pos ) pos = zp;
1918 break;
1919 }
1920 used--;
1921 }
1922 if ( !flag ) {
1923 flag = 1;
1924 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1925 }
1926 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1927 trie->trans[ pos ].check = state;
1928 pos++;
1929 }
1930 }
1931 }
cc601c31 1932 trie->lasttrans = pos + 1;
c944940b
JH
1933 trie->states = (reg_trie_state *)
1934 PerlMemShared_realloc( trie->states, laststate
1935 * sizeof(reg_trie_state) );
a3621e74 1936 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1937 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1938 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1939 (int)depth * 2 + 2,"",
1940 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1941 (IV)next_alloc,
1942 (IV)pos,
a3621e74
YO
1943 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1944 );
1945
1946 } /* end table compress */
1947 }
1e2e3d02
YO
1948 DEBUG_TRIE_COMPILE_MORE_r(
1949 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1950 (int)depth * 2 + 2, "",
1951 (UV)trie->statecount,
1952 (UV)trie->lasttrans)
1953 );
cc601c31 1954 /* resize the trans array to remove unused space */
c944940b
JH
1955 trie->trans = (reg_trie_trans *)
1956 PerlMemShared_realloc( trie->trans, trie->lasttrans
1957 * sizeof(reg_trie_trans) );
a3621e74 1958
3b753521 1959 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1960 U8 nodetype =(U8)(flags & 0xFF);
1961 char *str=NULL;
786e8c11 1962
07be1b83 1963#ifdef DEBUGGING
e62cc96a 1964 regnode *optimize = NULL;
7122b237
YO
1965#ifdef RE_TRACK_PATTERN_OFFSETS
1966
b57a0404
JH
1967 U32 mjd_offset = 0;
1968 U32 mjd_nodelen = 0;
7122b237
YO
1969#endif /* RE_TRACK_PATTERN_OFFSETS */
1970#endif /* DEBUGGING */
a3621e74 1971 /*
3dab1dad
YO
1972 This means we convert either the first branch or the first Exact,
1973 depending on whether the thing following (in 'last') is a branch
1974 or not and whther first is the startbranch (ie is it a sub part of
1975 the alternation or is it the whole thing.)
3b753521 1976 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1977 the whole branch sequence, including the first.
a3621e74 1978 */
3dab1dad 1979 /* Find the node we are going to overwrite */
7f69552c 1980 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1981 /* branch sub-chain */
3dab1dad 1982 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1983#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1984 DEBUG_r({
1985 mjd_offset= Node_Offset((convert));
1986 mjd_nodelen= Node_Length((convert));
1987 });
7122b237 1988#endif
7f69552c 1989 /* whole branch chain */
7122b237
YO
1990 }
1991#ifdef RE_TRACK_PATTERN_OFFSETS
1992 else {
7f69552c
YO
1993 DEBUG_r({
1994 const regnode *nop = NEXTOPER( convert );
1995 mjd_offset= Node_Offset((nop));
1996 mjd_nodelen= Node_Length((nop));
1997 });
07be1b83
YO
1998 }
1999 DEBUG_OPTIMISE_r(
2000 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2001 (int)depth * 2 + 2, "",
786e8c11 2002 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2003 );
7122b237 2004#endif
3dab1dad
YO
2005 /* But first we check to see if there is a common prefix we can
2006 split out as an EXACT and put in front of the TRIE node. */
2007 trie->startstate= 1;
55eed653 2008 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2009 U32 state;
1e2e3d02 2010 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2011 U32 ofs = 0;
8e11feef
RGS
2012 I32 idx = -1;
2013 U32 count = 0;
2014 const U32 base = trie->states[ state ].trans.base;
a3621e74 2015
3dab1dad 2016 if ( trie->states[state].wordnum )
8e11feef 2017 count = 1;
a3621e74 2018
8e11feef 2019 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2020 if ( ( base + ofs >= trie->uniquecharcount ) &&
2021 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2022 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2023 {
3dab1dad 2024 if ( ++count > 1 ) {
2b8b4781 2025 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2026 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2027 if ( state == 1 ) break;
3dab1dad
YO
2028 if ( count == 2 ) {
2029 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2030 DEBUG_OPTIMISE_r(
8e11feef
RGS
2031 PerlIO_printf(Perl_debug_log,
2032 "%*sNew Start State=%"UVuf" Class: [",
2033 (int)depth * 2 + 2, "",
786e8c11 2034 (UV)state));
be8e71aa 2035 if (idx >= 0) {
2b8b4781 2036 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2037 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2038
3dab1dad 2039 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2040 if ( folder )
2041 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2042 DEBUG_OPTIMISE_r(
f1f66076 2043 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2044 );
8e11feef
RGS
2045 }
2046 }
2047 TRIE_BITMAP_SET(trie,*ch);
2048 if ( folder )
2049 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2051 }
2052 idx = ofs;
2053 }
3dab1dad
YO
2054 }
2055 if ( count == 1 ) {
2b8b4781 2056 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2057 STRLEN len;
2058 char *ch = SvPV( *tmp, len );
de734bd5
A
2059 DEBUG_OPTIMISE_r({
2060 SV *sv=sv_newmortal();
8e11feef
RGS
2061 PerlIO_printf( Perl_debug_log,
2062 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2063 (int)depth * 2 + 2, "",
de734bd5
A
2064 (UV)state, (UV)idx,
2065 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2066 PL_colors[0], PL_colors[1],
2067 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2068 PERL_PV_ESCAPE_FIRSTCHAR
2069 )
2070 );
2071 });
3dab1dad
YO
2072 if ( state==1 ) {
2073 OP( convert ) = nodetype;
2074 str=STRING(convert);
2075 STR_LEN(convert)=0;
2076 }
c490c714
YO
2077 STR_LEN(convert) += len;
2078 while (len--)
de734bd5 2079 *str++ = *ch++;
8e11feef 2080 } else {
f9049ba1 2081#ifdef DEBUGGING
8e11feef
RGS
2082 if (state>1)
2083 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2084#endif
8e11feef
RGS
2085 break;
2086 }
2087 }
2e64971a 2088 trie->prefixlen = (state-1);
3dab1dad 2089 if (str) {
8e11feef 2090 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2091 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2092 trie->startstate = state;
07be1b83
YO
2093 trie->minlen -= (state - 1);
2094 trie->maxlen -= (state - 1);
33809eae
JH
2095#ifdef DEBUGGING
2096 /* At least the UNICOS C compiler choked on this
2097 * being argument to DEBUG_r(), so let's just have
2098 * it right here. */
2099 if (
2100#ifdef PERL_EXT_RE_BUILD
2101 1
2102#else
2103 DEBUG_r_TEST
2104#endif
2105 ) {
2106 regnode *fix = convert;
2107 U32 word = trie->wordcount;
2108 mjd_nodelen++;
2109 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2110 while( ++fix < n ) {
2111 Set_Node_Offset_Length(fix, 0, 0);
2112 }
2113 while (word--) {
2114 SV ** const tmp = av_fetch( trie_words, word, 0 );
2115 if (tmp) {
2116 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2117 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2118 else
2119 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2120 }
2121 }
2122 }
2123#endif
8e11feef
RGS
2124 if (trie->maxlen) {
2125 convert = n;
2126 } else {
3dab1dad 2127 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2128 DEBUG_r(optimize= n);
3dab1dad
YO
2129 }
2130 }
2131 }
a5ca303d
YO
2132 if (!jumper)
2133 jumper = last;
3dab1dad 2134 if ( trie->maxlen ) {
8e11feef
RGS
2135 NEXT_OFF( convert ) = (U16)(tail - convert);
2136 ARG_SET( convert, data_slot );
786e8c11
YO
2137 /* Store the offset to the first unabsorbed branch in
2138 jump[0], which is otherwise unused by the jump logic.
2139 We use this when dumping a trie and during optimisation. */
2140 if (trie->jump)
7f69552c 2141 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2142
6c48061a
YO
2143 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2144 * and there is a bitmap
2145 * and the first "jump target" node we found leaves enough room
2146 * then convert the TRIE node into a TRIEC node, with the bitmap
2147 * embedded inline in the opcode - this is hypothetically faster.
2148 */
2149 if ( !trie->states[trie->startstate].wordnum
2150 && trie->bitmap
2151 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2152 {
2153 OP( convert ) = TRIEC;
2154 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2155 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2156 trie->bitmap= NULL;
2157 } else
2158 OP( convert ) = TRIE;
a3621e74 2159
3dab1dad
YO
2160 /* store the type in the flags */
2161 convert->flags = nodetype;
a5ca303d
YO
2162 DEBUG_r({
2163 optimize = convert
2164 + NODE_STEP_REGNODE
2165 + regarglen[ OP( convert ) ];
2166 });
2167 /* XXX We really should free up the resource in trie now,
2168 as we won't use them - (which resources?) dmq */
3dab1dad 2169 }
a3621e74 2170 /* needed for dumping*/
e62cc96a 2171 DEBUG_r(if (optimize) {
07be1b83 2172 regnode *opt = convert;
bcdf7404 2173
e62cc96a 2174 while ( ++opt < optimize) {
07be1b83
YO
2175 Set_Node_Offset_Length(opt,0,0);
2176 }
786e8c11
YO
2177 /*
2178 Try to clean up some of the debris left after the
2179 optimisation.
a3621e74 2180 */
786e8c11 2181 while( optimize < jumper ) {
07be1b83 2182 mjd_nodelen += Node_Length((optimize));
a3621e74 2183 OP( optimize ) = OPTIMIZED;
07be1b83 2184 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2185 optimize++;
2186 }
07be1b83 2187 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2188 });
2189 } /* end node insert */
2e64971a
DM
2190
2191 /* Finish populating the prev field of the wordinfo array. Walk back
2192 * from each accept state until we find another accept state, and if
2193 * so, point the first word's .prev field at the second word. If the
2194 * second already has a .prev field set, stop now. This will be the
2195 * case either if we've already processed that word's accept state,
3b753521
FN
2196 * or that state had multiple words, and the overspill words were
2197 * already linked up earlier.
2e64971a
DM
2198 */
2199 {
2200 U16 word;
2201 U32 state;
2202 U16 prev;
2203
2204 for (word=1; word <= trie->wordcount; word++) {
2205 prev = 0;
2206 if (trie->wordinfo[word].prev)
2207 continue;
2208 state = trie->wordinfo[word].accept;
2209 while (state) {
2210 state = prev_states[state];
2211 if (!state)
2212 break;
2213 prev = trie->states[state].wordnum;
2214 if (prev)
2215 break;
2216 }
2217 trie->wordinfo[word].prev = prev;
2218 }
2219 Safefree(prev_states);
2220 }
2221
2222
2223 /* and now dump out the compressed format */
2224 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2225
55eed653 2226 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2227#ifdef DEBUGGING
2228 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2229 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2230#else
2231 SvREFCNT_dec(revcharmap);
07be1b83 2232#endif
786e8c11
YO
2233 return trie->jump
2234 ? MADE_JUMP_TRIE
2235 : trie->startstate>1
2236 ? MADE_EXACT_TRIE
2237 : MADE_TRIE;
2238}
2239
2240STATIC void
2241S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2242{
3b753521 2243/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2244
2245 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2246 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2247 ISBN 0-201-10088-6
2248
2249 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2250 suffix of the current state's 'word' that is also a proper prefix of another word in our
2251 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2252 the DFA not to have to restart after its tried and failed a word at a given point, it
2253 simply continues as though it had been matching the other word in the first place.
2254 Consider
2255 'abcdgu'=~/abcdefg|cdgu/
2256 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2257 fail, which would bring us to the state representing 'd' in the second word where we would
2258 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2259 */
2260 /* add a fail transition */
3251b653
NC
2261 const U32 trie_offset = ARG(source);
2262 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2263 U32 *q;
2264 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2265 const U32 numstates = trie->statecount;
786e8c11
YO
2266 const U32 ubound = trie->lasttrans + ucharcount;
2267 U32 q_read = 0;
2268 U32 q_write = 0;
2269 U32 charid;
2270 U32 base = trie->states[ 1 ].trans.base;
2271 U32 *fail;
2272 reg_ac_data *aho;
2273 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2274 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2275
2276 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2277#ifndef DEBUGGING
2278 PERL_UNUSED_ARG(depth);
2279#endif
2280
2281
2282 ARG_SET( stclass, data_slot );
c944940b 2283 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2284 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2285 aho->trie=trie_offset;
446bd890
NC
2286 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2287 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2288 Newxz( q, numstates, U32);
c944940b 2289 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2290 aho->refcount = 1;
2291 fail = aho->fail;
2292 /* initialize fail[0..1] to be 1 so that we always have
2293 a valid final fail state */
2294 fail[ 0 ] = fail[ 1 ] = 1;
2295
2296 for ( charid = 0; charid < ucharcount ; charid++ ) {
2297 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2298 if ( newstate ) {
2299 q[ q_write ] = newstate;
2300 /* set to point at the root */
2301 fail[ q[ q_write++ ] ]=1;
2302 }
2303 }
2304 while ( q_read < q_write) {
2305 const U32 cur = q[ q_read++ % numstates ];
2306 base = trie->states[ cur ].trans.base;
2307
2308 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2309 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2310 if (ch_state) {
2311 U32 fail_state = cur;
2312 U32 fail_base;
2313 do {
2314 fail_state = fail[ fail_state ];
2315 fail_base = aho->states[ fail_state ].trans.base;
2316 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2317
2318 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2319 fail[ ch_state ] = fail_state;
2320 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2321 {
2322 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2323 }
2324 q[ q_write++ % numstates] = ch_state;
2325 }
2326 }
2327 }
2328 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2329 when we fail in state 1, this allows us to use the
2330 charclass scan to find a valid start char. This is based on the principle
2331 that theres a good chance the string being searched contains lots of stuff
2332 that cant be a start char.
2333 */
2334 fail[ 0 ] = fail[ 1 ] = 0;
2335 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2336 PerlIO_printf(Perl_debug_log,
2337 "%*sStclass Failtable (%"UVuf" states): 0",
2338 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2339 );
786e8c11
YO
2340 for( q_read=1; q_read<numstates; q_read++ ) {
2341 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2342 }
2343 PerlIO_printf(Perl_debug_log, "\n");
2344 });
2345 Safefree(q);
2346 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2347}
2348
786e8c11 2349
a3621e74 2350/*
5d1c421c
JH
2351 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2352 * These need to be revisited when a newer toolchain becomes available.
2353 */
2354#if defined(__sparc64__) && defined(__GNUC__)
2355# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2356# undef SPARC64_GCC_WORKAROUND
2357# define SPARC64_GCC_WORKAROUND 1
2358# endif
2359#endif
2360
07be1b83 2361#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2362 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2363 SV * const mysv=sv_newmortal(); \
2364 regnode *Next = regnext(scan); \
2365 regprop(RExC_rx, mysv, scan); \
7f69552c 2366 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2367 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2368 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2369 }});
07be1b83 2370
1de06328
YO
2371
2372
2373
2374
07be1b83
YO
2375#define JOIN_EXACT(scan,min,flags) \
2376 if (PL_regkind[OP(scan)] == EXACT) \
2377 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2378
be8e71aa 2379STATIC U32
07be1b83
YO
2380S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2381 /* Merge several consecutive EXACTish nodes into one. */
2382 regnode *n = regnext(scan);
2383 U32 stringok = 1;
2384 regnode *next = scan + NODE_SZ_STR(scan);
2385 U32 merged = 0;
2386 U32 stopnow = 0;
2387#ifdef DEBUGGING
2388 regnode *stop = scan;
72f13be8 2389 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2390#else
d47053eb
RGS
2391 PERL_UNUSED_ARG(depth);
2392#endif
7918f24d
NC
2393
2394 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2395#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2396 PERL_UNUSED_ARG(flags);
2397 PERL_UNUSED_ARG(val);
07be1b83 2398#endif
07be1b83
YO
2399 DEBUG_PEEP("join",scan,depth);
2400
2401 /* Skip NOTHING, merge EXACT*. */
2402 while (n &&
2403 ( PL_regkind[OP(n)] == NOTHING ||
2404 (stringok && (OP(n) == OP(scan))))
2405 && NEXT_OFF(n)
2406 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2407
2408 if (OP(n) == TAIL || n > next)
2409 stringok = 0;
2410 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2411 DEBUG_PEEP("skip:",n,depth);
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 next = n + NODE_STEP_REGNODE;
2414#ifdef DEBUGGING
2415 if (stringok)
2416 stop = n;
2417#endif
2418 n = regnext(n);
2419 }
2420 else if (stringok) {
786e8c11 2421 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2422 regnode * const nnext = regnext(n);
2423
2424 DEBUG_PEEP("merg",n,depth);
2425
2426 merged++;
2427 if (oldl + STR_LEN(n) > U8_MAX)
2428 break;
2429 NEXT_OFF(scan) += NEXT_OFF(n);
2430 STR_LEN(scan) += STR_LEN(n);
2431 next = n + NODE_SZ_STR(n);
2432 /* Now we can overwrite *n : */
2433 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2434#ifdef DEBUGGING
2435 stop = next - 1;
2436#endif
2437 n = nnext;
2438 if (stopnow) break;
2439 }
2440
d47053eb
RGS
2441#ifdef EXPERIMENTAL_INPLACESCAN
2442 if (flags && !NEXT_OFF(n)) {
2443 DEBUG_PEEP("atch", val, depth);
2444 if (reg_off_by_arg[OP(n)]) {
2445 ARG_SET(n, val - n);
2446 }
2447 else {
2448 NEXT_OFF(n) = val - n;
2449 }
2450 stopnow = 1;
2451 }
07be1b83
YO
2452#endif
2453 }
2454
2455 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2456 /*
2457 Two problematic code points in Unicode casefolding of EXACT nodes:
2458
2459 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2460 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2461
2462 which casefold to
2463
2464 Unicode UTF-8
2465
2466 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2467 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2468
2469 This means that in case-insensitive matching (or "loose matching",
2470 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2471 length of the above casefolded versions) can match a target string
2472 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2473 This would rather mess up the minimum length computation.
2474
2475 What we'll do is to look for the tail four bytes, and then peek
2476 at the preceding two bytes to see whether we need to decrease
2477 the minimum length by four (six minus two).
2478
2479 Thanks to the design of UTF-8, there cannot be false matches:
2480 A sequence of valid UTF-8 bytes cannot be a subsequence of
2481 another valid sequence of UTF-8 bytes.
2482
2483 */
2484 char * const s0 = STRING(scan), *s, *t;
2485 char * const s1 = s0 + STR_LEN(scan) - 1;
2486 char * const s2 = s1 - 4;
e294cc5d
JH
2487#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2488 const char t0[] = "\xaf\x49\xaf\x42";
2489#else
07be1b83 2490 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2491#endif
07be1b83
YO
2492 const char * const t1 = t0 + 3;
2493
2494 for (s = s0 + 2;
2495 s < s2 && (t = ninstr(s, s1, t0, t1));
2496 s = t + 4) {
e294cc5d
JH
2497#ifdef EBCDIC
2498 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2499 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2500#else
07be1b83
YO
2501 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2502 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2503#endif
07be1b83
YO
2504 *min -= 4;
2505 }
2506 }
2507
2508#ifdef DEBUGGING
2509 /* Allow dumping */
2510 n = scan + NODE_SZ_STR(scan);
2511 while (n <= stop) {
2512 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2513 OP(n) = OPTIMIZED;
2514 NEXT_OFF(n) = 0;
2515 }
2516 n++;
2517 }
2518#endif
2519 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2520 return stopnow;
2521}
2522
653099ff
GS
2523/* REx optimizer. Converts nodes into quickier variants "in place".
2524 Finds fixed substrings. */
2525
a0288114 2526/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2527 to the position after last scanned or to NULL. */
2528
40d049e4
YO
2529#define INIT_AND_WITHP \
2530 assert(!and_withp); \
2531 Newx(and_withp,1,struct regnode_charclass_class); \
2532 SAVEFREEPV(and_withp)
07be1b83 2533
b515a41d
YO
2534/* this is a chain of data about sub patterns we are processing that
2535 need to be handled seperately/specially in study_chunk. Its so
2536 we can simulate recursion without losing state. */
2537struct scan_frame;
2538typedef struct scan_frame {
2539 regnode *last; /* last node to process in this frame */
2540 regnode *next; /* next node to process when last is reached */
2541 struct scan_frame *prev; /*previous frame*/
2542 I32 stop; /* what stopparen do we use */
2543} scan_frame;
2544
304ee84b
YO
2545
2546#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2547
e1d1eefb
YO
2548#define CASE_SYNST_FNC(nAmE) \
2549case nAmE: \
2550 if (flags & SCF_DO_STCLASS_AND) { \
2551 for (value = 0; value < 256; value++) \
2552 if (!is_ ## nAmE ## _cp(value)) \
2553 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2554 } \
2555 else { \
2556 for (value = 0; value < 256; value++) \
2557 if (is_ ## nAmE ## _cp(value)) \
2558 ANYOF_BITMAP_SET(data->start_class, value); \
2559 } \
2560 break; \
2561case N ## nAmE: \
2562 if (flags & SCF_DO_STCLASS_AND) { \
2563 for (value = 0; value < 256; value++) \
2564 if (is_ ## nAmE ## _cp(value)) \
2565 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2566 } \
2567 else { \
2568 for (value = 0; value < 256; value++) \
2569 if (!is_ ## nAmE ## _cp(value)) \
2570 ANYOF_BITMAP_SET(data->start_class, value); \
2571 } \
2572 break
2573
2574
2575
76e3520e 2576STATIC I32
40d049e4 2577S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2578 I32 *minlenp, I32 *deltap,
40d049e4
YO
2579 regnode *last,
2580 scan_data_t *data,
2581 I32 stopparen,
2582 U8* recursed,
2583 struct regnode_charclass_class *and_withp,
2584 U32 flags, U32 depth)
c277df42
IZ
2585 /* scanp: Start here (read-write). */
2586 /* deltap: Write maxlen-minlen here. */
2587 /* last: Stop before this one. */
40d049e4
YO
2588 /* data: string data about the pattern */
2589 /* stopparen: treat close N as END */
2590 /* recursed: which subroutines have we recursed into */
2591 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2592{
97aff369 2593 dVAR;
c277df42
IZ
2594 I32 min = 0, pars = 0, code;
2595 regnode *scan = *scanp, *next;
2596 I32 delta = 0;
2597 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2598 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2599 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2600 scan_data_t data_fake;
a3621e74 2601 SV *re_trie_maxbuff = NULL;
786e8c11 2602 regnode *first_non_open = scan;
e2e6a0f1 2603 I32 stopmin = I32_MAX;
8aa23a47 2604 scan_frame *frame = NULL;
a3621e74 2605 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2606
7918f24d
NC
2607 PERL_ARGS_ASSERT_STUDY_CHUNK;
2608
13a24bad 2609#ifdef DEBUGGING
40d049e4 2610 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2611#endif
40d049e4 2612
786e8c11 2613 if ( depth == 0 ) {
40d049e4 2614 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2615 first_non_open=regnext(first_non_open);
2616 }
2617
b81d288d 2618
8aa23a47
YO
2619 fake_study_recurse:
2620 while ( scan && OP(scan) != END && scan < last ){
2621 /* Peephole optimizer: */
304ee84b 2622 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2623 DEBUG_PEEP("Peep",scan,depth);
2624 JOIN_EXACT(scan,&min,0);
2625
2626 /* Follow the next-chain of the current node and optimize
2627 away all the NOTHINGs from it. */
2628 if (OP(scan) != CURLYX) {
2629 const int max = (reg_off_by_arg[OP(scan)]
2630 ? I32_MAX
2631 /* I32 may be smaller than U16 on CRAYs! */
2632 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2633 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2634 int noff;
2635 regnode *n = scan;
2636
2637 /* Skip NOTHING and LONGJMP. */
2638 while ((n = regnext(n))
2639 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2640 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2641 && off + noff < max)
2642 off += noff;
2643 if (reg_off_by_arg[OP(scan)])
2644 ARG(scan) = off;
2645 else
2646 NEXT_OFF(scan) = off;
2647 }
a3621e74 2648
c277df42 2649
8aa23a47
YO
2650
2651 /* The principal pseudo-switch. Cannot be a switch, since we
2652 look into several different things. */
2653 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2654 || OP(scan) == IFTHEN) {
2655 next = regnext(scan);
2656 code = OP(scan);
2657 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2658
2659 if (OP(next) == code || code == IFTHEN) {
2660 /* NOTE - There is similar code to this block below for handling
2661 TRIE nodes on a re-study. If you change stuff here check there
2662 too. */
2663 I32 max1 = 0, min1 = I32_MAX, num = 0;
2664 struct regnode_charclass_class accum;
2665 regnode * const startbranch=scan;
2666
2667 if (flags & SCF_DO_SUBSTR)
304ee84b 2668 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2669 if (flags & SCF_DO_STCLASS)
2670 cl_init_zero(pRExC_state, &accum);
2671
2672 while (OP(scan) == code) {
2673 I32 deltanext, minnext, f = 0, fake;
2674 struct regnode_charclass_class this_class;
2675
2676 num++;
2677 data_fake.flags = 0;
2678 if (data) {
2679 data_fake.whilem_c = data->whilem_c;
2680 data_fake.last_closep = data->last_closep;
2681 }
2682 else
2683 data_fake.last_closep = &fake;
58e23c8d
YO
2684
2685 data_fake.pos_delta = delta;
8aa23a47
YO
2686 next = regnext(scan);
2687 scan = NEXTOPER(scan);
2688 if (code != BRANCH)
c277df42 2689 scan = NEXTOPER(scan);
8aa23a47
YO
2690 if (flags & SCF_DO_STCLASS) {
2691 cl_init(pRExC_state, &this_class);
2692 data_fake.start_class = &this_class;
2693 f = SCF_DO_STCLASS_AND;
58e23c8d 2694 }
8aa23a47
YO
2695 if (flags & SCF_WHILEM_VISITED_POS)
2696 f |= SCF_WHILEM_VISITED_POS;
2697
2698 /* we suppose the run is continuous, last=next...*/
2699 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2700 next, &data_fake,
2701 stopparen, recursed, NULL, f,depth+1);
2702 if (min1 > minnext)
2703 min1 = minnext;
2704 if (max1 < minnext + deltanext)
2705 max1 = minnext + deltanext;
2706 if (deltanext == I32_MAX)
2707 is_inf = is_inf_internal = 1;
2708 scan = next;
2709 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2710 pars++;
2711 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2712 if ( stopmin > minnext)
2713 stopmin = min + min1;
2714 flags &= ~SCF_DO_SUBSTR;
2715 if (data)
2716 data->flags |= SCF_SEEN_ACCEPT;
2717 }
2718 if (data) {
2719 if (data_fake.flags & SF_HAS_EVAL)
2720 data->flags |= SF_HAS_EVAL;
2721 data->whilem_c = data_fake.whilem_c;
3dab1dad 2722 }
8aa23a47
YO
2723 if (flags & SCF_DO_STCLASS)
2724 cl_or(pRExC_state, &accum, &this_class);
2725 }
2726 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2727 min1 = 0;
2728 if (flags & SCF_DO_SUBSTR) {
2729 data->pos_min += min1;
2730 data->pos_delta += max1 - min1;
2731 if (max1 != min1 || is_inf)
2732 data->longest = &(data->longest_float);
2733 }
2734 min += min1;
2735 delta += max1 - min1;
2736 if (flags & SCF_DO_STCLASS_OR) {
2737 cl_or(pRExC_state, data->start_class, &accum);
2738 if (min1) {
2739 cl_and(data->start_class, and_withp);
2740 flags &= ~SCF_DO_STCLASS;
653099ff 2741 }
8aa23a47
YO
2742 }
2743 else if (flags & SCF_DO_STCLASS_AND) {
2744 if (min1) {
2745 cl_and(data->start_class, &accum);
2746 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2747 }
8aa23a47
YO
2748 else {
2749 /* Switch to OR mode: cache the old value of
2750 * data->start_class */
2751 INIT_AND_WITHP;
2752 StructCopy(data->start_class, and_withp,
2753 struct regnode_charclass_class);
2754 flags &= ~SCF_DO_STCLASS_AND;
2755 StructCopy(&accum, data->start_class,
2756 struct regnode_charclass_class);
2757 flags |= SCF_DO_STCLASS_OR;
2758 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2759 }
8aa23a47 2760 }
a3621e74 2761
8aa23a47
YO
2762 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2763 /* demq.
a3621e74 2764
8aa23a47
YO
2765 Assuming this was/is a branch we are dealing with: 'scan' now
2766 points at the item that follows the branch sequence, whatever
2767 it is. We now start at the beginning of the sequence and look
2768 for subsequences of
a3621e74 2769
8aa23a47
YO
2770 BRANCH->EXACT=>x1
2771 BRANCH->EXACT=>x2
2772 tail
a3621e74 2773
8aa23a47 2774 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2775
8aa23a47
YO
2776 If we can find such a subseqence we need to turn the first
2777 element into a trie and then add the subsequent branch exact
2778 strings to the trie.
a3621e74 2779
8aa23a47 2780 We have two cases
a3621e74 2781
3b753521 2782 1. patterns where the whole set of branches can be converted.
a3621e74 2783
8aa23a47 2784 2. patterns where only a subset can be converted.
a3621e74 2785
8aa23a47
YO
2786 In case 1 we can replace the whole set with a single regop
2787 for the trie. In case 2 we need to keep the start and end
3b753521 2788 branches so
a3621e74 2789
8aa23a47
YO
2790 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2791 becomes BRANCH TRIE; BRANCH X;
786e8c11 2792
8aa23a47
YO
2793 There is an additional case, that being where there is a
2794 common prefix, which gets split out into an EXACT like node
2795 preceding the TRIE node.
a3621e74 2796
8aa23a47
YO
2797 If x(1..n)==tail then we can do a simple trie, if not we make
2798 a "jump" trie, such that when we match the appropriate word
2799 we "jump" to the appopriate tail node. Essentailly we turn
2800 a nested if into a case structure of sorts.
b515a41d 2801
8aa23a47
YO
2802 */
2803
2804 int made=0;
2805 if (!re_trie_maxbuff) {
2806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2807 if (!SvIOK(re_trie_maxbuff))
2808 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2809 }
2810 if ( SvIV(re_trie_maxbuff)>=0 ) {
2811 regnode *cur;
2812 regnode *first = (regnode *)NULL;
2813 regnode *last = (regnode *)NULL;
2814 regnode *tail = scan;
2815 U8 optype = 0;
2816 U32 count=0;
a3621e74
YO
2817
2818#ifdef DEBUGGING
8aa23a47 2819 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2820#endif
8aa23a47
YO
2821 /* var tail is used because there may be a TAIL
2822 regop in the way. Ie, the exacts will point to the
2823 thing following the TAIL, but the last branch will
2824 point at the TAIL. So we advance tail. If we
2825 have nested (?:) we may have to move through several
2826 tails.
2827 */
2828
2829 while ( OP( tail ) == TAIL ) {
2830 /* this is the TAIL generated by (?:) */
2831 tail = regnext( tail );
2832 }
a3621e74 2833
8aa23a47
YO
2834
2835 DEBUG_OPTIMISE_r({
2836 regprop(RExC_rx, mysv, tail );
2837 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2838 (int)depth * 2 + 2, "",
2839 "Looking for TRIE'able sequences. Tail node is: ",
2840 SvPV_nolen_const( mysv )
2841 );
2842 });
2843
2844 /*
2845
2846 step through the branches, cur represents each
2847 branch, noper is the first thing to be matched
2848 as part of that branch and noper_next is the
2849 regnext() of that node. if noper is an EXACT
2850 and noper_next is the same as scan (our current
2851 position in the regex) then the EXACT branch is
2852 a possible optimization target. Once we have
2853 two or more consequetive such branches we can
2854 create a trie of the EXACT's contents and stich
2855 it in place. If the sequence represents all of
2856 the branches we eliminate the whole thing and
2857 replace it with a single TRIE. If it is a
2858 subsequence then we need to stitch it in. This
2859 means the first branch has to remain, and needs
2860 to be repointed at the item on the branch chain
2861 following the last branch optimized. This could
2862 be either a BRANCH, in which case the
2863 subsequence is internal, or it could be the
2864 item following the branch sequence in which
2865 case the subsequence is at the end.
2866
2867 */
2868
2869 /* dont use tail as the end marker for this traverse */
2870 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2871 regnode * const noper = NEXTOPER( cur );
b515a41d 2872#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2873 regnode * const noper_next = regnext( noper );
b515a41d
YO
2874#endif
2875
8aa23a47
YO
2876 DEBUG_OPTIMISE_r({
2877 regprop(RExC_rx, mysv, cur);
2878 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2879 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2880
2881 regprop(RExC_rx, mysv, noper);
2882 PerlIO_printf( Perl_debug_log, " -> %s",
2883 SvPV_nolen_const(mysv));
2884
2885 if ( noper_next ) {
2886 regprop(RExC_rx, mysv, noper_next );
2887 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2888 SvPV_nolen_const(mysv));
2889 }
2890 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2891 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2892 });
2893 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2894 : PL_regkind[ OP( noper ) ] == EXACT )
2895 || OP(noper) == NOTHING )
786e8c11 2896#ifdef NOJUMPTRIE
8aa23a47 2897 && noper_next == tail
786e8c11 2898#endif
8aa23a47
YO
2899 && count < U16_MAX)
2900 {
2901 count++;
2902 if ( !first || optype == NOTHING ) {
2903 if (!first) first = cur;
2904 optype = OP( noper );
2905 } else {
2906 last = cur;
2907 }
2908 } else {
a0a388a1 2909/*
0abd0d78
YO
2910 Currently we do not believe that the trie logic can
2911 handle case insensitive matching properly when the
2912 pattern is not unicode (thus forcing unicode semantics).
2913
2914 If/when this is fixed the following define can be swapped
2915 in below to fully enable trie logic.
2916
a0a388a1 2917#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2918
2919*/
2920#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2921
a0a388a1 2922 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2923 make_trie( pRExC_state,
2924 startbranch, first, cur, tail, count,
2925 optype, depth+1 );
2926 }
2927 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2928#ifdef NOJUMPTRIE
8aa23a47 2929 && noper_next == tail
786e8c11 2930#endif
8aa23a47
YO
2931 ){
2932 count = 1;
2933 first = cur;
2934 optype = OP( noper );
2935 } else {
2936 count = 0;
2937 first = NULL;
2938 optype = 0;
2939 }
2940 last = NULL;
2941 }
2942 }
2943 DEBUG_OPTIMISE_r({
2944 regprop(RExC_rx, mysv, cur);
2945 PerlIO_printf( Perl_debug_log,
2946 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2947 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2948
2949 });
a0a388a1
YO
2950
2951 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2952 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2953#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2954 if ( ((made == MADE_EXACT_TRIE &&
2955 startbranch == first)
2956 || ( first_non_open == first )) &&
2957 depth==0 ) {
2958 flags |= SCF_TRIE_RESTUDY;
2959 if ( startbranch == first
2960 && scan == tail )
2961 {
2962 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2963 }
2964 }
3dab1dad 2965#endif
8aa23a47
YO
2966 }
2967 }
2968
2969 } /* do trie */
2970
653099ff 2971 }
8aa23a47
YO
2972 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2973 scan = NEXTOPER(NEXTOPER(scan));
2974 } else /* single branch is optimized. */
2975 scan = NEXTOPER(scan);
2976 continue;
2977 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2978 scan_frame *newframe = NULL;
2979 I32 paren;
2980 regnode *start;
2981 regnode *end;
2982
2983 if (OP(scan) != SUSPEND) {
2984 /* set the pointer */
2985 if (OP(scan) == GOSUB) {
2986 paren = ARG(scan);
2987 RExC_recurse[ARG2L(scan)] = scan;
2988 start = RExC_open_parens[paren-1];
2989 end = RExC_close_parens[paren-1];
2990 } else {
2991 paren = 0;
f8fc2ecf 2992 start = RExC_rxi->program + 1;
8aa23a47
YO
2993 end = RExC_opend;
2994 }
2995 if (!recursed) {
2996 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2997 SAVEFREEPV(recursed);
2998 }
2999 if (!PAREN_TEST(recursed,paren+1)) {
3000 PAREN_SET(recursed,paren+1);
3001 Newx(newframe,1,scan_frame);
3002 } else {
3003 if (flags & SCF_DO_SUBSTR) {
304ee84b 3004 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3005 data->longest = &(data->longest_float);
3006 }
3007 is_inf = is_inf_internal = 1;
3008 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3009 cl_anything(pRExC_state, data->start_class);
3010 flags &= ~SCF_DO_STCLASS;
3011 }
3012 } else {
3013 Newx(newframe,1,scan_frame);
3014 paren = stopparen;
3015 start = scan+2;
3016 end = regnext(scan);
3017 }
3018 if (newframe) {
3019 assert(start);
3020 assert(end);
3021 SAVEFREEPV(newframe);
3022 newframe->next = regnext(scan);
3023 newframe->last = last;
3024 newframe->stop = stopparen;
3025 newframe->prev = frame;
3026
3027 frame = newframe;
3028 scan = start;
3029 stopparen = paren;
3030 last = end;
3031
3032 continue;
3033 }
3034 }
3035 else if (OP(scan) == EXACT) {
3036 I32 l = STR_LEN(scan);
3037 UV uc;
3038 if (UTF) {
3039 const U8 * const s = (U8*)STRING(scan);
3040 l = utf8_length(s, s + l);
3041 uc = utf8_to_uvchr(s, NULL);
3042 } else {
3043 uc = *((U8*)STRING(scan));
3044 }
3045 min += l;
3046 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3047 /* The code below prefers earlier match for fixed
3048 offset, later match for variable offset. */
3049 if (data->last_end == -1) { /* Update the start info. */
3050 data->last_start_min = data->pos_min;
3051 data->last_start_max = is_inf
3052 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3053 }
8aa23a47
YO
3054 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3055 if (UTF)
3056 SvUTF8_on(data->last_found);
3057 {
3058 SV * const sv = data->last_found;
3059 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3060 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3061 if (mg && mg->mg_len >= 0)
3062 mg->mg_len += utf8_length((U8*)STRING(scan),
3063 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3064 }
8aa23a47
YO
3065 data->last_end = data->pos_min + l;
3066 data->pos_min += l; /* As in the first entry. */
3067 data->flags &= ~SF_BEFORE_EOL;
3068 }
3069 if (flags & SCF_DO_STCLASS_AND) {
3070 /* Check whether it is compatible with what we know already! */
3071 int compat = 1;
3072
3073 if (uc >= 0x100 ||
3074 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3075 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3076 && (!(data->start_class->flags & ANYOF_FOLD)
3077 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3078 )
3079 compat = 0;
3080 ANYOF_CLASS_ZERO(data->start_class);
3081 ANYOF_BITMAP_ZERO(data->start_class);
3082 if (compat)
3083 ANYOF_BITMAP_SET(data->start_class, uc);
3084 data->start_class->flags &= ~ANYOF_EOS;
3085 if (uc < 0x100)
3086 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3087 }
3088 else if (flags & SCF_DO_STCLASS_OR) {
3089 /* false positive possible if the class is case-folded */
3090 if (uc < 0x100)
3091 ANYOF_BITMAP_SET(data->start_class, uc);
3092 else
3093 data->start_class->flags |= ANYOF_UNICODE_ALL;
3094 data->start_class->flags &= ~ANYOF_EOS;
3095 cl_and(data->start_class, and_withp);
3096 }
3097 flags &= ~SCF_DO_STCLASS;
3098 }
3099 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3100 I32 l = STR_LEN(scan);
3101 UV uc = *((U8*)STRING(scan));
3102
3103 /* Search for fixed substrings supports EXACT only. */
3104 if (flags & SCF_DO_SUBSTR) {
3105 assert(data);
304ee84b 3106 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3107 }
3108 if (UTF) {
3109 const U8 * const s = (U8 *)STRING(scan);
3110 l = utf8_length(s, s + l);
3111 uc = utf8_to_uvchr(s, NULL);
3112 }
3113 min += l;
3114 if (flags & SCF_DO_SUBSTR)
3115 data->pos_min += l;
3116 if (flags & SCF_DO_STCLASS_AND) {
3117 /* Check whether it is compatible with what we know already! */
3118 int compat = 1;
3119
3120 if (uc >= 0x100 ||
3121 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3122 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3123 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3124 compat = 0;
3125 ANYOF_CLASS_ZERO(data->start_class);
3126 ANYOF_BITMAP_ZERO(data->start_class);
3127 if (compat) {
3128 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3129 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3130 data->start_class->flags |= ANYOF_FOLD;
3131 if (OP(scan) == EXACTFL)
3132 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3133 }
8aa23a47
YO
3134 }
3135 else if (flags & SCF_DO_STCLASS_OR) {
3136 if (data->start_class->flags & ANYOF_FOLD) {
3137 /* false positive possible if the class is case-folded.
3138 Assume that the locale settings are the same... */
1aa99e6b
IH
3139 if (uc < 0x100)
3140 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3141 data->start_class->flags &= ~ANYOF_EOS;
3142 }
8aa23a47 3143 cl_and(data->start_class, and_withp);
653099ff 3144 }
8aa23a47
YO
3145 flags &= ~SCF_DO_STCLASS;
3146 }
e52fc539 3147 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3148 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3149 I32 f = flags, pos_before = 0;
3150 regnode * const oscan = scan;
3151 struct regnode_charclass_class this_class;
3152 struct regnode_charclass_class *oclass = NULL;
3153 I32 next_is_eval = 0;
3154
3155 switch (PL_regkind[OP(scan)]) {
3156 case WHILEM: /* End of (?:...)* . */
3157 scan = NEXTOPER(scan);
3158 goto finish;
3159 case PLUS:
3160 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3161 next = NEXTOPER(scan);
3162 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3163 mincount = 1;
3164 maxcount = REG_INFTY;
3165 next = regnext(scan);
3166 scan = NEXTOPER(scan);
3167 goto do_curly;
3168 }
3169 }
3170 if (flags & SCF_DO_SUBSTR)
3171 data->pos_min++;
3172 min++;
3173 /* Fall through. */
3174 case STAR:
3175 if (flags & SCF_DO_STCLASS) {
3176 mincount = 0;
3177 maxcount = REG_INFTY;
3178 next = regnext(scan);
3179 scan = NEXTOPER(scan);
3180 goto do_curly;
3181 }
3182 is_inf = is_inf_internal = 1;
3183 scan = regnext(scan);
c277df42 3184 if (flags & SCF_DO_SUBSTR) {
304ee84b 3185 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3186 data->longest = &(data->longest_float);
c277df42 3187 }
8aa23a47
YO
3188 goto optimize_curly_tail;
3189 case CURLY:
3190 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3191 && (scan->flags == stopparen))
3192 {
3193 mincount = 1;
3194 maxcount = 1;
3195 } else {
3196 mincount = ARG1(scan);
3197 maxcount = ARG2(scan);
653099ff 3198 }
8aa23a47
YO
3199 next = regnext(scan);
3200 if (OP(scan) == CURLYX) {
3201 I32 lp = (data ? *(data->last_closep) : 0);
3202 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3203 }
8aa23a47
YO
3204 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3205 next_is_eval = (OP(scan) == EVAL);
3206 do_curly:
3207 if (flags & SCF_DO_SUBSTR) {
304ee84b 3208 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3209 pos_before = data->pos_min;
b45f050a 3210 }
8aa23a47
YO
3211 if (data) {
3212 fl = data->flags;
3213 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3214 if (is_inf)
3215 data->flags |= SF_IS_INF;
3216 }
3217 if (flags & SCF_DO_STCLASS) {
3218 cl_init(pRExC_state, &this_class);
3219 oclass = data->start_class;
3220 data->start_class = &this_class;
3221 f |= SCF_DO_STCLASS_AND;
3222 f &= ~SCF_DO_STCLASS_OR;
3223 }
3224 /* These are the cases when once a subexpression
3225 fails at a particular position, it cannot succeed
3226 even after backtracking at the enclosing scope.
b9a59e08 3227
8aa23a47
YO
3228 XXXX what if minimal match and we are at the
3229 initial run of {n,m}? */
3230 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3231 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3232
8aa23a47
YO
3233 /* This will finish on WHILEM, setting scan, or on NULL: */
3234 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3235 last, data, stopparen, recursed, NULL,
3236 (mincount == 0
3237 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3238
8aa23a47
YO
3239 if (flags & SCF_DO_STCLASS)
3240 data->start_class = oclass;
3241 if (mincount == 0 || minnext == 0) {
3242 if (flags & SCF_DO_STCLASS_OR) {
3243 cl_or(pRExC_state, data->start_class, &this_class);
3244 }
3245 else if (flags & SCF_DO_STCLASS_AND) {
3246 /* Switch to OR mode: cache the old value of
3247 * data->start_class */
3248 INIT_AND_WITHP;
3249 StructCopy(data->start_class, and_withp,
3250 struct regnode_charclass_class);
3251 flags &= ~SCF_DO_STCLASS_AND;
3252 StructCopy(&this_class, data->start_class,
3253 struct regnode_charclass_class);
3254 flags |= SCF_DO_STCLASS_OR;
3255 data->start_class->flags |= ANYOF_EOS;
3256 }
3257 } else { /* Non-zero len */
3258 if (flags & SCF_DO_STCLASS_OR) {
3259 cl_or(pRExC_state, data->start_class, &this_class);
3260 cl_and(data->start_class, and_withp);
3261 }
3262 else if (flags & SCF_DO_STCLASS_AND)
3263 cl_and(data->start_class, &this_class);
3264 flags &= ~SCF_DO_STCLASS;
3265 }
3266 if (!scan) /* It was not CURLYX, but CURLY. */
3267 scan = next;
3268 if ( /* ? quantifier ok, except for (?{ ... }) */
3269 (next_is_eval || !(mincount == 0 && maxcount == 1))
3270 && (minnext == 0) && (deltanext == 0)
3271 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3272 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3273 {
668c081a
NC
3274 ckWARNreg(RExC_parse,
3275 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3276 }
3277
3278 min += minnext * mincount;
3279 is_inf_internal |= ((maxcount == REG_INFTY
3280 && (minnext + deltanext) > 0)
3281 || deltanext == I32_MAX);
3282 is_inf |= is_inf_internal;
3283 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3284
3285 /* Try powerful optimization CURLYX => CURLYN. */
3286 if ( OP(oscan) == CURLYX && data
3287 && data->flags & SF_IN_PAR
3288 && !(data->flags & SF_HAS_EVAL)
3289 && !deltanext && minnext == 1 ) {
3290 /* Try to optimize to CURLYN. */
3291 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3292 regnode * const nxt1 = nxt;
497b47a8 3293#ifdef DEBUGGING
8aa23a47 3294 regnode *nxt2;
497b47a8 3295#endif
c277df42 3296
8aa23a47
YO
3297 /* Skip open. */
3298 nxt = regnext(nxt);
e52fc539 3299 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3300 && !(PL_regkind[OP(nxt)] == EXACT
3301 && STR_LEN(nxt) == 1))
3302 goto nogo;
497b47a8 3303#ifdef DEBUGGING
8aa23a47 3304 nxt2 = nxt;
497b47a8 3305#endif
8aa23a47
YO
3306 nxt = regnext(nxt);
3307 if (OP(nxt) != CLOSE)
3308 goto nogo;
3309 if (RExC_open_parens) {
3310 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3311 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3312 }
3313 /* Now we know that nxt2 is the only contents: */
3314 oscan->flags = (U8)ARG(nxt);
3315 OP(oscan) = CURLYN;
3316 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3317
c277df42 3318#ifdef DEBUGGING
8aa23a47 3319 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3320 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3321 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3322 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3323 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3324 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3325#endif
8aa23a47
YO
3326 }
3327 nogo:
3328
3329 /* Try optimization CURLYX => CURLYM. */
3330 if ( OP(oscan) == CURLYX && data
3331 && !(data->flags & SF_HAS_PAR)
3332 && !(data->flags & SF_HAS_EVAL)
3333 && !deltanext /* atom is fixed width */
3334 && minnext != 0 /* CURLYM can't handle zero width */
3335 ) {
3336 /* XXXX How to optimize if data == 0? */
3337 /* Optimize to a simpler form. */
3338 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3339 regnode *nxt2;
3340
3341 OP(oscan) = CURLYM;
3342 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3343 && (OP(nxt2) != WHILEM))
3344 nxt = nxt2;
3345 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3346 /* Need to optimize away parenths. */
b3c0965f 3347 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3348 /* Set the parenth number. */
3349 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3350
8aa23a47
YO
3351 oscan->flags = (U8)ARG(nxt);
3352 if (RExC_open_parens) {
3353 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3354 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3355 }
8aa23a47
YO
3356 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3357 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3358
c277df42 3359#ifdef DEBUGGING
8aa23a47
YO
3360 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3361 OP(nxt + 1) = OPTIMIZED; /* was count. */
3362 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3363 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3364#endif
c277df42 3365#if 0
8aa23a47
YO
3366 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3367 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3368 if (nnxt == nxt) {
3369 if (reg_off_by_arg[OP(nxt1)])
3370 ARG_SET(nxt1, nxt2 - nxt1);
3371 else if (nxt2 - nxt1 < U16_MAX)
3372 NEXT_OFF(nxt1) = nxt2 - nxt1;
3373 else
3374 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3375 }
8aa23a47 3376 nxt1 = nnxt;
c277df42 3377 }
5d1c421c 3378#endif
8aa23a47
YO
3379 /* Optimize again: */
3380 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3381 NULL, stopparen, recursed, NULL, 0,depth+1);
3382 }
3383 else
3384 oscan->flags = 0;
3385 }
3386 else if ((OP(oscan) == CURLYX)
3387 && (flags & SCF_WHILEM_VISITED_POS)
3388 /* See the comment on a similar expression above.
3b753521 3389 However, this time it's not a subexpression
8aa23a47
YO
3390 we care about, but the expression itself. */
3391 && (maxcount == REG_INFTY)
3392 && data && ++data->whilem_c < 16) {
3393 /* This stays as CURLYX, we can put the count/of pair. */
3394 /* Find WHILEM (as in regexec.c) */
3395 regnode *nxt = oscan + NEXT_OFF(oscan);
3396
3397 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3398 nxt += ARG(nxt);
3399 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3400 | (RExC_whilem_seen << 4)); /* On WHILEM */
3401 }
3402 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3403 pars++;
3404 if (flags & SCF_DO_SUBSTR) {
3405 SV *last_str = NULL;
3406 int counted = mincount != 0;
a0ed51b3 3407
8aa23a47
YO
3408 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3409#if defined(SPARC64_GCC_WORKAROUND)
3410 I32 b = 0;
3411 STRLEN l = 0;
3412 const char *s = NULL;
3413 I32 old = 0;
b515a41d 3414
8aa23a47
YO
3415 if (pos_before >= data->last_start_min)
3416 b = pos_before;
3417 else
3418 b = data->last_start_min;
b515a41d 3419
8aa23a47
YO
3420 l = 0;
3421 s = SvPV_const(data->last_found, l);
3422 old = b - data->last_start_min;
3423
3424#else
3425 I32 b = pos_before >= data->last_start_min
3426 ? pos_before : data->last_start_min;
3427 STRLEN l;
3428 const char * const s = SvPV_const(data->last_found, l);
3429 I32 old = b - data->last_start_min;
3430#endif
3431
3432 if (UTF)
3433 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3434 l -= old;
3435 /* Get the added string: */
740cce10 3436 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3437 if (deltanext == 0 && pos_before == b) {
3438 /* What was added is a constant string */
3439 if (mincount > 1) {
3440 SvGROW(last_str, (mincount * l) + 1);
3441 repeatcpy(SvPVX(last_str) + l,
3442 SvPVX_const(last_str), l, mincount - 1);
3443 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3444 /* Add additional parts. */
3445 SvCUR_set(data->last_found,
3446 SvCUR(data->last_found) - l);
3447 sv_catsv(data->last_found, last_str);
3448 {
3449 SV * sv = data->last_found;
3450 MAGIC *mg =
3451 SvUTF8(sv) && SvMAGICAL(sv) ?
3452 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3453 if (mg && mg->mg_len >= 0)
bd94e887 3454 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3455 }
8aa23a47 3456 data->last_end += l * (mincount - 1);
b515a41d 3457 }
8aa23a47
YO
3458 } else {
3459 /* start offset must point into the last copy */
3460 data->last_start_min += minnext * (mincount - 1);
3461 data->last_start_max += is_inf ? I32_MAX
3462 : (maxcount - 1) * (minnext + data->pos_delta);
3463 }
c277df42 3464 }
8aa23a47
YO
3465 /* It is counted once already... */
3466 data->pos_min += minnext * (mincount - counted);
3467 data->pos_delta += - counted * deltanext +
3468 (minnext + deltanext) * maxcount - minnext * mincount;
3469 if (mincount != maxcount) {
3470 /* Cannot extend fixed substrings found inside
3471 the group. */
304ee84b 3472 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3473 if (mincount && last_str) {
3474 SV * const sv = data->last_found;
3475 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3476 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3477
3478 if (mg)
3479 mg->mg_len = -1;
3480 sv_setsv(sv, last_str);
3481 data->last_end = data->pos_min;
3482 data->last_start_min =
3483 data->pos_min - CHR_SVLEN(last_str);
3484 data->last_start_max = is_inf
3485 ? I32_MAX
3486 : data->pos_min + data->pos_delta
3487 - CHR_SVLEN(last_str);
3488 }
3489 data->longest = &(data->longest_float);
3490 }
3491 SvREFCNT_dec(last_str);
c277df42 3492 }
8aa23a47
YO
3493 if (data && (fl & SF_HAS_EVAL))
3494 data->flags |= SF_HAS_EVAL;
3495 optimize_curly_tail:
3496 if (OP(oscan) != CURLYX) {
3497 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3498 && NEXT_OFF(next))
3499 NEXT_OFF(oscan) += NEXT_OFF(next);
3500 }
3501 continue;
3502 default: /* REF and CLUMP only? */
3503 if (flags & SCF_DO_SUBSTR) {
304ee84b 3504 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3505 data->longest = &(data->longest_float);
3506 }
3507 is_inf = is_inf_internal = 1;
3508 if (flags & SCF_DO_STCLASS_OR)
3509 cl_anything(pRExC_state, data->start_class);
3510 flags &= ~SCF_DO_STCLASS;
3511 break;
c277df42 3512 }
8aa23a47 3513 }
e1d1eefb
YO
3514 else if (OP(scan) == LNBREAK) {
3515 if (flags & SCF_DO_STCLASS) {
3516 int value = 0;
3517 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3518 if (flags & SCF_DO_STCLASS_AND) {
3519 for (value = 0; value < 256; value++)
e64b1bd1 3520 if (!is_VERTWS_cp(value))
b9a59e08
KW
3521 ANYOF_BITMAP_CLEAR(data->start_class, value);
3522 }
3523 else {
e1d1eefb 3524 for (value = 0; value < 256; value++)
e64b1bd1 3525 if (is_VERTWS_cp(value))
b9a59e08
KW
3526 ANYOF_BITMAP_SET(data->start_class, value);
3527 }
e1d1eefb
YO
3528 if (flags & SCF_DO_STCLASS_OR)
3529 cl_and(data->start_class, and_withp);
3530 flags &= ~SCF_DO_STCLASS;
3531 }
3532 min += 1;
f9a79580 3533 delta += 1;
e1d1eefb
YO
3534 if (flags & SCF_DO_SUBSTR) {
3535 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3536 data->pos_min += 1;
f9a79580 3537 data->pos_delta += 1;
e1d1eefb
YO
3538 data->longest = &(data->longest_float);
3539 }
e1d1eefb 3540 }
f9a79580
RGS
3541 else if (OP(scan) == FOLDCHAR) {
3542 int d = ARG(scan)==0xDF ? 1 : 2;
3543 flags &= ~SCF_DO_STCLASS;
3544 min += 1;
3545 delta += d;
3546 if (flags & SCF_DO_SUBSTR) {
3547 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3548 data->pos_min += 1;
3549 data->pos_delta += d;
3550 data->longest = &(data->longest_float);
3551 }
3552 }
e52fc539 3553 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3554 int value = 0;
653099ff 3555
8aa23a47 3556 if (flags & SCF_DO_SUBSTR) {
304ee84b 3557 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3558 data->pos_min++;
3559 }
3560 min++;
3561 if (flags & SCF_DO_STCLASS) {
3562 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3563
8aa23a47
YO
3564 /* Some of the logic below assumes that switching
3565 locale on will only add false positives. */
3566 switch (PL_regkind[OP(scan)]) {
3567 case SANY:
3568 default:
3569 do_default:
3570 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3571 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3572 cl_anything(pRExC_state, data->start_class);
3573 break;
3574 case REG_ANY:
3575 if (OP(scan) == SANY)
3576 goto do_default;
3577 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3578 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
cefafd73
KW
3579 || ((data->start_class->flags & ANYOF_CLASS)
3580 && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
8aa23a47 3581 cl_anything(pRExC_state, data->start_class);
653099ff 3582 }
8aa23a47
YO
3583 if (flags & SCF_DO_STCLASS_AND || !value)
3584 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3585 break;
3586 case ANYOF:
3587 if (flags & SCF_DO_STCLASS_AND)
3588 cl_and(data->start_class,
3589 (struct regnode_charclass_class*)scan);
653099ff 3590 else
8aa23a47
YO
3591 cl_or(pRExC_state, data->start_class,
3592 (struct regnode_charclass_class*)scan);
3593 break;
3594 case ALNUM:
3595 if (flags & SCF_DO_STCLASS_AND) {
3596 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3597 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
a12cf05f
KW
3598 if (FLAGS(scan) & USE_UNI) {
3599 for (value = 0; value < 256; value++) {
3600 if (!isWORDCHAR_L1(value)) {
3601 ANYOF_BITMAP_CLEAR(data->start_class, value);
3602 }
3603 }
3604 } else {
3605 for (value = 0; value < 256; value++) {
3606 if (!isALNUM(value)) {
3607 ANYOF_BITMAP_CLEAR(data->start_class, value);
3608 }
3609 }
3610 }
8aa23a47 3611 }
653099ff 3612 }
8aa23a47
YO
3613 else {
3614 if (data->start_class->flags & ANYOF_LOCALE)
3615 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3616 else if (FLAGS(scan) & USE_UNI) {
3617 for (value = 0; value < 256; value++) {
3618 if (isWORDCHAR_L1(value)) {
3619 ANYOF_BITMAP_SET(data->start_class, value);
3620 }
3621 }
3622 } else {
3623 for (value = 0; value < 256; value++) {
3624 if (isALNUM(value)) {
3625 ANYOF_BITMAP_SET(data->start_class, value);
3626 }
3627 }
3628 }
8aa23a47
YO
3629 }
3630 break;
3631 case ALNUML:
3632 if (flags & SCF_DO_STCLASS_AND) {
3633 if (data->start_class->flags & ANYOF_LOCALE)
3634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3635 }
3636 else {
3637 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3638 data->start_class->flags |= ANYOF_LOCALE;
3639 }
3640 break;
3641 case NALNUM:
3642 if (flags & SCF_DO_STCLASS_AND) {
3643 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
a12cf05f
KW
3645 if (FLAGS(scan) & USE_UNI) {
3646 for (value = 0; value < 256; value++) {
3647 if (isWORDCHAR_L1(value)) {
3648 ANYOF_BITMAP_CLEAR(data->start_class, value);
3649 }
3650 }
3651 } else {
3652 for (value = 0; value < 256; value++) {
3653 if (isALNUM(value)) {
3654 ANYOF_BITMAP_CLEAR(data->start_class, value);
3655 }
3656 }
3657 }
653099ff
GS
3658 }
3659 }
8aa23a47
YO
3660 else {
3661 if (data->start_class->flags & ANYOF_LOCALE)
3662 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3663 else {
3664 for (value = 0; value < 256; value++)
3665 if (!isALNUM(value))
b9a59e08 3666 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47 3667 }
653099ff 3668 }
8aa23a47
YO
3669 break;
3670 case NALNUML:
3671 if (flags & SCF_DO_STCLASS_AND) {
3672 if (data->start_class->flags & ANYOF_LOCALE)
3673 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3674 }
8aa23a47
YO
3675 else {
3676 data->start_class->flags |= ANYOF_LOCALE;
3677 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3678 }
3679 break;
3680 case SPACE:
3681 if (flags & SCF_DO_STCLASS_AND) {
3682 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3683 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3684 if (FLAGS(scan) & USE_UNI) {
3685 for (value = 0; value < 256; value++) {
3686 if (!isSPACE_L1(value)) {
3687 ANYOF_BITMAP_CLEAR(data->start_class, value);
3688 }
3689 }
3690 } else {
3691 for (value = 0; value < 256; value++) {
3692 if (!isSPACE(value)) {
3693 ANYOF_BITMAP_CLEAR(data->start_class, value);
3694 }
3695 }
3696 }
653099ff
GS
3697 }
3698 }
8aa23a47 3699 else {
a12cf05f 3700 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 3701 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3702 }
3703 else if (FLAGS(scan) & USE_UNI) {
3704 for (value = 0; value < 256; value++) {
3705 if (isSPACE_L1(value)) {
3706 ANYOF_BITMAP_SET(data->start_class, value);
3707 }
3708 }
3709 } else {
3710 for (value = 0; value < 256; value++) {
3711 if (isSPACE(value)) {
3712 ANYOF_BITMAP_SET(data->start_class, value);
3713 }
3714 }
8aa23a47 3715 }
653099ff 3716 }
8aa23a47
YO
3717 break;
3718 case SPACEL:
3719 if (flags & SCF_DO_STCLASS_AND) {
3720 if (data->start_class->flags & ANYOF_LOCALE)
3721 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3722 }
3723 else {
3724 data->start_class->flags |= ANYOF_LOCALE;
3725 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3726 }
3727 break;
3728 case NSPACE:
3729 if (flags & SCF_DO_STCLASS_AND) {
3730 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3731 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
a12cf05f
KW
3732 if (FLAGS(scan) & USE_UNI) {
3733 for (value = 0; value < 256; value++) {
3734 if (isSPACE_L1(value)) {
3735 ANYOF_BITMAP_CLEAR(data->start_class, value);
3736 }
3737 }
3738 } else {
3739 for (value = 0; value < 256; value++) {
3740 if (isSPACE(value)) {
3741 ANYOF_BITMAP_CLEAR(data->start_class, value);
3742 }
3743 }
3744 }
653099ff 3745 }
8aa23a47
YO
3746 }
3747 else {
3748 if (data->start_class->flags & ANYOF_LOCALE)
3749 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
a12cf05f
KW
3750 else if (FLAGS(scan) & USE_UNI) {
3751 for (value = 0; value < 256; value++) {
3752 if (!isSPACE_L1(value)) {
3753 ANYOF_BITMAP_SET(data->start_class, value);
3754 }
3755 }
3756 }
3757 else {
3758 for (value = 0; value < 256; value++) {
3759 if (!isSPACE(value)) {
3760 ANYOF_BITMAP_SET(data->start_class, value);
3761 }
3762 }
3763 }
653099ff 3764 }
8aa23a47
YO
3765 break;
3766 case NSPACEL:
3767 if (flags & SCF_DO_STCLASS_AND) {
3768 if (data->start_class->flags & ANYOF_LOCALE) {
3769 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3770 for (value = 0; value < 256; value++)
3771 if (!isSPACE(value))
3772 ANYOF_BITMAP_CLEAR(data->start_class, value);
3773 }
653099ff 3774 }
8aa23a47
YO
3775 else {
3776 data->start_class->flags |= ANYOF_LOCALE;
3777 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3778 }
3779 break;
3780 case DIGIT:
3781 if (flags & SCF_DO_STCLASS_AND) {
3782 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3783 for (value = 0; value < 256; value++)
3784 if (!isDIGIT(value))
3785 ANYOF_BITMAP_CLEAR(data->start_class, value);
3786 }
3787 else {
3788 if (data->start_class->flags & ANYOF_LOCALE)
3789 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3790 else {
3791 for (value = 0; value < 256; value++)
3792 if (isDIGIT(value))
b9a59e08 3793 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
3794 }
3795 }
3796 break;
3797 case NDIGIT:
3798 if (flags & SCF_DO_STCLASS_AND) {
3799 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3800 for (value = 0; value < 256; value++)
3801 if (isDIGIT(value))
3802 ANYOF_BITMAP_CLEAR(data->start_class, value);
3803 }
3804 else {
3805 if (data->start_class->flags & ANYOF_LOCALE)
3806 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3807 else {
3808 for (value = 0; value < 256; value++)
3809 if (!isDIGIT(value))
b9a59e08 3810 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3811 }
3812 }
8aa23a47 3813 break;
e1d1eefb
YO
3814 CASE_SYNST_FNC(VERTWS);
3815 CASE_SYNST_FNC(HORIZWS);
3816
8aa23a47
YO
3817 }
3818 if (flags & SCF_DO_STCLASS_OR)
3819 cl_and(data->start_class, and_withp);
3820 flags &= ~SCF_DO_STCLASS;
3821 }
3822 }
3823 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3824 data->flags |= (OP(scan) == MEOL
3825 ? SF_BEFORE_MEOL
3826 : SF_BEFORE_SEOL);
3827 }
3828 else if ( PL_regkind[OP(scan)] == BRANCHJ
3829 /* Lookbehind, or need to calculate parens/evals/stclass: */
3830 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3831 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3832 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3833 || OP(scan) == UNLESSM )
3834 {
3835 /* Negative Lookahead/lookbehind
3836 In this case we can't do fixed string optimisation.
3837 */
1de06328 3838
8aa23a47
YO
3839 I32 deltanext, minnext, fake = 0;
3840 regnode *nscan;
3841 struct regnode_charclass_class intrnl;
3842 int f = 0;
1de06328 3843
8aa23a47
YO
3844 data_fake.flags = 0;
3845 if (data) {
3846 data_fake.whilem_c = data->whilem_c;
3847 data_fake.last_closep = data->last_closep;
c277df42 3848 }
8aa23a47
YO
3849 else
3850 data_fake.last_closep = &fake;
58e23c8d 3851 data_fake.pos_delta = delta;
8aa23a47
YO
3852 if ( flags & SCF_DO_STCLASS && !scan->flags
3853 && OP(scan) == IFMATCH ) { /* Lookahead */
3854 cl_init(pRExC_state, &intrnl);
3855 data_fake.start_class = &intrnl;
3856 f |= SCF_DO_STCLASS_AND;
3857 }
3858 if (flags & SCF_WHILEM_VISITED_POS)
3859 f |= SCF_WHILEM_VISITED_POS;
3860 next = regnext(scan);
3861 nscan = NEXTOPER(NEXTOPER(scan));
3862 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3863 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3864 if (scan->flags) {
3865 if (deltanext) {
58e23c8d 3866 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3867 }
3868 else if (minnext > (I32)U8_MAX) {
58e23c8d 3869 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3870 }
3871 scan->flags = (U8)minnext;
3872 }
3873 if (data) {
3874 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3875 pars++;
3876 if (data_fake.flags & SF_HAS_EVAL)
3877 data->flags |= SF_HAS_EVAL;
3878 data->whilem_c = data_fake.whilem_c;
3879 }
3880 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3881 if (flags & SCF_DO_STCLASS_OR) {
3882 /* OR before, AND after: ideally we would recurse with
3883 * data_fake to get the AND applied by study of the
3884 * remainder of the pattern, and then derecurse;
3885 * *** HACK *** for now just treat as "no information".
3886 * See [perl #56690].
3887 */
3888 cl_init(pRExC_state, data->start_class);
3889 } else {
3890 /* AND before and after: combine and continue */
3891 const int was = (data->start_class->flags & ANYOF_EOS);
3892
3893 cl_and(data->start_class, &intrnl);
3894 if (was)
3895 data->start_class->flags |= ANYOF_EOS;
3896 }
8aa23a47 3897 }
cb434fcc 3898 }
8aa23a47
YO
3899#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3900 else {
3901 /* Positive Lookahead/lookbehind
3902 In this case we can do fixed string optimisation,
3903 but we must be careful about it. Note in the case of
3904 lookbehind the positions will be offset by the minimum
3905 length of the pattern, something we won't know about
3906 until after the recurse.
3907 */
3908 I32 deltanext, fake = 0;
3909 regnode *nscan;
3910 struct regnode_charclass_class intrnl;
3911 int f = 0;
3912 /* We use SAVEFREEPV so that when the full compile
3913 is finished perl will clean up the allocated
3b753521 3914 minlens when it's all done. This way we don't
8aa23a47
YO
3915 have to worry about freeing them when we know
3916 they wont be used, which would be a pain.
3917 */
3918 I32 *minnextp;
3919 Newx( minnextp, 1, I32 );
3920 SAVEFREEPV(minnextp);
3921
3922 if (data) {
3923 StructCopy(data, &data_fake, scan_data_t);
3924 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3925 f |= SCF_DO_SUBSTR;
3926 if (scan->flags)
304ee84b 3927 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3928 data_fake.last_found=newSVsv(data->last_found);
3929 }
3930 }
3931 else
3932 data_fake.last_closep = &fake;
3933 data_fake.flags = 0;
58e23c8d 3934 data_fake.pos_delta = delta;
8aa23a47
YO
3935 if (is_inf)
3936 data_fake.flags |= SF_IS_INF;
3937 if ( flags & SCF_DO_STCLASS && !scan->flags
3938 && OP(scan) == IFMATCH ) { /* Lookahead */
3939 cl_init(pRExC_state, &intrnl);
3940 data_fake.start_class = &intrnl;
3941 f |= SCF_DO_STCLASS_AND;
3942 }
3943 if (flags & SCF_WHILEM_VISITED_POS)
3944 f |= SCF_WHILEM_VISITED_POS;
3945 next = regnext(scan);
3946 nscan = NEXTOPER(NEXTOPER(scan));
3947
3948 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3949 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3950 if (scan->flags) {
3951 if (deltanext) {
58e23c8d 3952 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3953 }
3954 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3955 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3956 }
3957 scan->flags = (U8)*minnextp;
3958 }
3959
3960 *minnextp += min;
3961
3962 if (f & SCF_DO_STCLASS_AND) {
3963 const int was = (data->start_class->flags & ANYOF_EOS);
3964
3965 cl_and(data->start_class, &intrnl);
3966 if (was)
3967 data->start_class->flags |= ANYOF_EOS;
3968 }
3969 if (data) {
3970 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3971 pars++;
3972 if (data_fake.flags & SF_HAS_EVAL)
3973 data->flags |= SF_HAS_EVAL;
3974 data->whilem_c = data_fake.whilem_c;
3975 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3976 if (RExC_rx->minlen<*minnextp)
3977 RExC_rx->minlen=*minnextp;
304ee84b 3978 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3979 SvREFCNT_dec(data_fake.last_found);
3980
3981 if ( data_fake.minlen_fixed != minlenp )
3982 {
3983 data->offset_fixed= data_fake.offset_fixed;
3984 data->minlen_fixed= data_fake.minlen_fixed;
3985 data->lookbehind_fixed+= scan->flags;
3986 }
3987 if ( data_fake.minlen_float != minlenp )
3988 {
3989 data->minlen_float= data_fake.minlen_float;
3990 data->offset_float_min=data_fake.offset_float_min;
3991 data->offset_float_max=data_fake.offset_float_max;
3992 data->lookbehind_float+= scan->flags;
3993 }
3994 }
3995 }
3996
3997
40d049e4 3998 }
8aa23a47
YO
3999#endif
4000 }
4001 else if (OP(scan) == OPEN) {
4002 if (stopparen != (I32)ARG(scan))
4003 pars++;
4004 }
4005 else if (OP(scan) == CLOSE) {
4006 if (stopparen == (I32)ARG(scan)) {
4007 break;
4008 }
4009 if ((I32)ARG(scan) == is_par) {
4010 next = regnext(scan);
b515a41d 4011
8aa23a47
YO
4012 if ( next && (OP(next) != WHILEM) && next < last)
4013 is_par = 0; /* Disable optimization */
40d049e4 4014 }
8aa23a47
YO
4015 if (data)
4016 *(data->last_closep) = ARG(scan);
4017 }
4018 else if (OP(scan) == EVAL) {
c277df42
IZ
4019 if (data)
4020 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4021 }
4022 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4023 if (flags & SCF_DO_SUBSTR) {
304ee84b 4024 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4025 flags &= ~SCF_DO_SUBSTR;
40d049e4 4026 }
8aa23a47
YO
4027 if (data && OP(scan)==ACCEPT) {
4028 data->flags |= SCF_SEEN_ACCEPT;
4029 if (stopmin > min)
4030 stopmin = min;
e2e6a0f1 4031 }
8aa23a47
YO
4032 }
4033 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4034 {
0f5d15d6 4035 if (flags & SCF_DO_SUBSTR) {
304ee84b 4036 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4037 data->longest = &(data->longest_float);
4038 }
4039 is_inf = is_inf_internal = 1;
653099ff 4040 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 4041 cl_anything(pRExC_state, data->start_class);
96776eda 4042 flags &= ~SCF_DO_STCLASS;
8aa23a47 4043 }
58e23c8d 4044 else if (OP(scan) == GPOS) {
bbe252da 4045 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4046 !(delta || is_inf || (data && data->pos_delta)))
4047 {
bbe252da
YO
4048 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4049 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4050 if (RExC_rx->gofs < (U32)min)
4051 RExC_rx->gofs = min;
4052 } else {
bbe252da 4053 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4054 RExC_rx->gofs = 0;
4055 }
4056 }
786e8c11 4057#ifdef TRIE_STUDY_OPT
40d049e4 4058#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4059 else if (PL_regkind[OP(scan)] == TRIE) {
4060 /* NOTE - There is similar code to this block above for handling
4061 BRANCH nodes on the initial study. If you change stuff here
4062 check there too. */
4063 regnode *trie_node= scan;
4064 regnode *tail= regnext(scan);
f8fc2ecf 4065 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4066 I32 max1 = 0, min1 = I32_MAX;
4067 struct regnode_charclass_class accum;
4068
4069 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4070 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
4071 if (flags & SCF_DO_STCLASS)
4072 cl_init_zero(pRExC_state, &accum);
4073
4074 if (!trie->jump) {
4075 min1= trie->minlen;
4076 max1= trie->maxlen;
4077 } else {
4078 const regnode *nextbranch= NULL;
4079 U32 word;
4080
4081 for ( word=1 ; word <= trie->wordcount ; word++)
4082 {
4083 I32 deltanext=0, minnext=0, f = 0, fake;
4084 struct regnode_charclass_class this_class;
4085
4086 data_fake.flags = 0;
4087 if (data) {
4088 data_fake.whilem_c = data->whilem_c;
4089 data_fake.last_closep = data->last_closep;
4090 }
4091 else
4092 data_fake.last_closep = &fake;
58e23c8d 4093 data_fake.pos_delta = delta;
8aa23a47
YO
4094 if (flags & SCF_DO_STCLASS) {
4095 cl_init(pRExC_state, &this_class);
4096 data_fake.start_class = &this_class;
4097 f = SCF_DO_STCLASS_AND;
4098 }
4099 if (flags & SCF_WHILEM_VISITED_POS)
4100 f |= SCF_WHILEM_VISITED_POS;
4101
4102 if (trie->jump[word]) {
4103 if (!nextbranch)
4104 nextbranch = trie_node + trie->jump[0];
4105 scan= trie_node + trie->jump[word];
4106 /* We go from the jump point to the branch that follows
4107 it. Note this means we need the vestigal unused branches
4108 even though they arent otherwise used.
4109 */
4110 minnext = study_chunk(pRExC_state, &scan, minlenp,
4111 &deltanext, (regnode *)nextbranch, &data_fake,
4112 stopparen, recursed, NULL, f,depth+1);
4113 }
4114 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4115 nextbranch= regnext((regnode*)nextbranch);
4116
4117 if (min1 > (I32)(minnext + trie->minlen))
4118 min1 = minnext + trie->minlen;
4119 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4120 max1 = minnext + deltanext + trie->maxlen;
4121 if (deltanext == I32_MAX)
4122 is_inf = is_inf_internal = 1;
4123
4124 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4125 pars++;
4126 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4127 if ( stopmin > min + min1)
4128 stopmin = min + min1;
4129 flags &= ~SCF_DO_SUBSTR;
4130 if (data)
4131 data->flags |= SCF_SEEN_ACCEPT;
4132 }
4133 if (data) {
4134 if (data_fake.flags & SF_HAS_EVAL)
4135 data->flags |= SF_HAS_EVAL;
4136 data->whilem_c = data_fake.whilem_c;
4137 }
4138 if (flags & SCF_DO_STCLASS)
4139 cl_or(pRExC_state, &accum, &this_class);
4140 }
4141 }
4142 if (flags & SCF_DO_SUBSTR) {
4143 data->pos_min += min1;
4144 data->pos_delta += max1 - min1;
4145 if (max1 != min1 || is_inf)
4146 data->longest = &(data->longest_float);
4147 }
4148 min += min1;
4149 delta += max1 - min1;
4150 if (flags & SCF_DO_STCLASS_OR) {
4151 cl_or(pRExC_state, data->start_class, &accum);
4152 if (min1) {
4153 cl_and(data->start_class, and_withp);
4154 flags &= ~SCF_DO_STCLASS;
4155 }
4156 }
4157 else if (flags & SCF_DO_STCLASS_AND) {
4158 if (min1) {
4159 cl_and(data->start_class, &accum);
4160 flags &= ~SCF_DO_STCLASS;
4161 }
4162 else {
4163 /* Switch to OR mode: cache the old value of
4164 * data->start_class */
4165 INIT_AND_WITHP;
4166 StructCopy(data->start_class, and_withp,
4167 struct regnode_charclass_class);
4168 flags &= ~SCF_DO_STCLASS_AND;
4169 StructCopy(&accum, data->start_class,
4170 struct regnode_charclass_class);
4171 flags |= SCF_DO_STCLASS_OR;
4172 data->start_class->flags |= ANYOF_EOS;
4173 }
4174 }
4175 scan= tail;
4176 continue;
4177 }
786e8c11 4178#else
8aa23a47 4179 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4180 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4181 U8*bang=NULL;
4182
4183 min += trie->minlen;
4184 delta += (trie->maxlen - trie->minlen);
4185 flags &= ~SCF_DO_STCLASS; /* xxx */
4186 if (flags & SCF_DO_SUBSTR) {
304ee84b 4187 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4188 data->pos_min += trie->minlen;
4189 data->pos_delta += (trie->maxlen - trie->minlen);
4190 if (trie->maxlen != trie->minlen)
4191 data->longest = &(data->longest_float);
4192 }
4193 if (trie->jump) /* no more substrings -- for now /grr*/
4194 flags &= ~SCF_DO_SUBSTR;
b515a41d 4195 }
8aa23a47
YO
4196#endif /* old or new */
4197#endif /* TRIE_STUDY_OPT */
e1d1eefb 4198
8aa23a47
YO
4199 /* Else: zero-length, ignore. */
4200 scan = regnext(scan);
4201 }
4202 if (frame) {
4203 last = frame->last;
4204 scan = frame->next;
4205 stopparen = frame->stop;
4206 frame = frame->prev;
4207 goto fake_study_recurse;
c277df42
IZ
4208 }
4209
4210 finish:
8aa23a47 4211 assert(!frame);
304ee84b 4212 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4213
c277df42 4214 *scanp = scan;
aca2d497 4215 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4216 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4217 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4218 if (is_par > (I32)U8_MAX)
c277df42
IZ
4219 is_par = 0;
4220 if (is_par && pars==1 && data) {
4221 data->flags |= SF_IN_PAR;
4222 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4223 }
4224 else if (pars && data) {
c277df42
IZ
4225 data->flags |= SF_HAS_PAR;
4226 data->flags &= ~SF_IN_PAR;
4227 }
653099ff 4228 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4229 cl_and(data->start_class, and_withp);
786e8c11
YO
4230 if (flags & SCF_TRIE_RESTUDY)
4231 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4232
304ee84b 4233 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4234
e2e6a0f1 4235 return min < stopmin ? min : stopmin;
c277df42
IZ
4236}
4237
2eccd3b2
NC
4238STATIC U32
4239S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4240{
4a4e7719
NC
4241 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4242
7918f24d
NC
4243 PERL_ARGS_ASSERT_ADD_DATA;
4244
4a4e7719
NC
4245 Renewc(RExC_rxi->data,
4246 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4247 char, struct reg_data);
4248 if(count)
f8fc2ecf 4249 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4250 else
f8fc2ecf 4251 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4252 RExC_rxi->data->count = count + n;
4253 Copy(s, RExC_rxi->data->what + count, n, U8);
4254 return count;
c277df42
IZ
4255}
4256
f8149455 4257/*XXX: todo make this not included in a non debugging perl */
76234dfb 4258#ifndef PERL_IN_XSUB_RE
d88dccdf 4259void
864dbfa3 4260Perl_reginitcolors(pTHX)
d88dccdf 4261{
97aff369 4262 dVAR;
1df70142 4263 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4264 if (s) {
1df70142
AL
4265 char *t = savepv(s);
4266 int i = 0;
4267 PL_colors[0] = t;
d88dccdf 4268 while (++i < 6) {
1df70142
AL
4269 t = strchr(t, '\t');
4270 if (t) {
4271 *t = '\0';
4272 PL_colors[i] = ++t;
d88dccdf
IZ
4273 }
4274 else
1df70142 4275 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4276 }
4277 } else {
1df70142 4278 int i = 0;
b81d288d 4279 while (i < 6)
06b5626a 4280 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4281 }
4282 PL_colorset = 1;
4283}
76234dfb 4284#endif
8615cb43 4285
07be1b83 4286
786e8c11
YO
4287#ifdef TRIE_STUDY_OPT
4288#define CHECK_RESTUDY_GOTO \
4289 if ( \
4290 (data.flags & SCF_TRIE_RESTUDY) \
4291 && ! restudied++ \
4292 ) goto reStudy
4293#else
4294#define CHECK_RESTUDY_GOTO
4295#endif
f9f4320a 4296
a687059c 4297/*
e50aee73 4298 - pregcomp - compile a regular expression into internal code
a687059c
LW
4299 *
4300 * We can't allocate space until we know how big the compiled form will be,
4301 * but we can't compile it (and thus know how big it is) until we've got a
4302 * place to put the code. So we cheat: we compile it twice, once with code
4303 * generation turned off and size counting turned on, and once "for real".
4304 * This also means that we don't allocate space until we are sure that the
4305 * thing really will compile successfully, and we never have to move the
4306 * code and thus invalidate pointers into it. (Note that it has to be in
4307 * one piece because free() must be able to free it all.) [NB: not true in perl]
4308 *
4309 * Beware that the optimization-preparation code in here knows about some
4310 * of the structure of the compiled regexp. [I'll say.]
4311 */
b9b4dddf
YO
4312
4313
4314
f9f4320a 4315#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4316#define RE_ENGINE_PTR &PL_core_reg_engine
4317#else
f9f4320a
YO
4318extern const struct regexp_engine my_reg_engine;
4319#define RE_ENGINE_PTR &my_reg_engine
4320#endif
6d5c990f
RGS
4321
4322#ifndef PERL_IN_XSUB_RE
3ab4a224 4323REGEXP *
1593ad57 4324Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4325{
97aff369 4326 dVAR;
6d5c990f 4327 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4328
4329 PERL_ARGS_ASSERT_PREGCOMP;
4330
f9f4320a
YO
4331 /* Dispatch a request to compile a regexp to correct
4332 regexp engine. */
f9f4320a
YO
4333 if (table) {
4334 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4335 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4336 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4337 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4338 DEBUG_COMPILE_r({
8d8756e7 4339 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4340 SvIV(*ptr));
4341 });
3ab4a224 4342 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4343 }
b9b4dddf 4344 }
3ab4a224 4345 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4346}
6d5c990f 4347#endif
2a5d9b1d 4348
3ab4a224 4349REGEXP *
1593ad57 4350Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4351{
4352 dVAR;
288b8c02
NC
4353 REGEXP *rx;
4354 struct regexp *r;
f8fc2ecf 4355 register regexp_internal *ri;
3ab4a224 4356 STRLEN plen;
5d51ce98
KW
4357 char *exp;
4358 char* xend;
c277df42 4359 regnode *scan;
a0d0e21e 4360 I32 flags;
a0d0e21e 4361 I32 minlen = 0;
e7f38d0f
YO
4362
4363 /* these are all flags - maybe they should be turned
4364 * into a single int with different bit masks */
4365 I32 sawlookahead = 0;
a0d0e21e
LW
4366 I32 sawplus = 0;
4367 I32 sawopen = 0;
e7f38d0f 4368
bbd61b5f
KW
4369 U8 jump_ret = 0;
4370 dJMPENV;
2c2d71f5 4371 scan_data_t data;
830247a4 4372 RExC_state_t RExC_state;
be8e71aa 4373 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4374#ifdef TRIE_STUDY_OPT
5d51ce98 4375 int restudied;
07be1b83
YO
4376 RExC_state_t copyRExC_state;
4377#endif
2a5d9b1d 4378 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4379
4380 PERL_ARGS_ASSERT_RE_COMPILE;
4381
6d5c990f 4382 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4383
b9ad30b4 4384 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4385
02daf0ab 4386
bbd61b5f
KW
4387 /* Longjmp back to here if have to switch in midstream to utf8 */
4388 if (! RExC_orig_utf8) {
4389 JMPENV_PUSH(jump_ret);
4390 }
4391
5d51ce98
KW
4392 if (jump_ret == 0) { /* First time through */
4393 exp = SvPV(pattern, plen);
4394 xend = exp + plen;
4395
4396 DEBUG_COMPILE_r({
4397 SV *dsv= sv_newmortal();
4398 RE_PV_QUOTED_DECL(s, RExC_utf8,
4399 dsv, exp, plen, 60);
4400 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4401 PL_colors[4],PL_colors[5],s);
4402 });
4403 }
4404 else { /* longjumped back */
bbd61b5f
KW
4405 STRLEN len = plen;
4406
5d51ce98
KW
4407 /* If the cause for the longjmp was other than changing to utf8, pop
4408 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4409 if (jump_ret != UTF8_LONGJMP) {
4410 JMPENV_POP;
4411 JMPENV_JUMP(jump_ret);
4412 }
4413
595598ee
KW
4414 GET_RE_DEBUG_FLAGS;
4415
bbd61b5f
KW
4416 /* It's possible to write a regexp in ascii that represents Unicode
4417 codepoints outside of the byte range, such as via \x{100}. If we
4418 detect such a sequence we have to convert the entire pattern to utf8
4419 and then recompile, as our sizing calculation will have been based
4420 on 1 byte == 1 character, but we will need to use utf8 to encode
4421 at least some part of the pattern, and therefore must convert the whole
4422 thing.
4423 -- dmq */
4424 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4425 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
595598ee 4426 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
bbd61b5f
KW
4427 xend = exp + len;
4428 RExC_orig_utf8 = RExC_utf8 = 1;
4429 SAVEFREEPV(exp);
4430 }
4431
5d51ce98
KW
4432#ifdef TRIE_STUDY_OPT
4433 restudied = 0;
4434#endif
4435
02daf0ab 4436 RExC_precomp = exp;
c737faaf 4437 RExC_flags = pm_flags;
830247a4 4438 RExC_sawback = 0;
bbce6d69 4439
830247a4
IZ
4440 RExC_seen = 0;
4441 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4442 RExC_seen_evals = 0;
4443 RExC_extralen = 0;
c277df42 4444
bbce6d69 4445 /* First pass: determine size, legality. */
830247a4 4446 RExC_parse = exp;
fac92740 4447 RExC_start = exp;
830247a4
IZ
4448 RExC_end = xend;
4449 RExC_naughty = 0;
4450 RExC_npar = 1;
e2e6a0f1 4451 RExC_nestroot = 0;
830247a4
IZ
4452 RExC_size = 0L;
4453 RExC_emit = &PL_regdummy;
4454 RExC_whilem_seen = 0;
40d049e4
YO
4455 RExC_open_parens = NULL;
4456 RExC_close_parens = NULL;
4457 RExC_opend = NULL;
81714fb9 4458 RExC_paren_names = NULL;
1f1031fe
YO
4459#ifdef DEBUGGING
4460 RExC_paren_name_list = NULL;
4461#endif
40d049e4
YO
4462 RExC_recurse = NULL;
4463 RExC_recurse_count = 0;
81714fb9 4464
85ddcde9
JH
4465#if 0 /* REGC() is (currently) a NOP at the first pass.
4466 * Clever compilers notice this and complain. --jhi */
830247a4 4467 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4468#endif
3dab1dad
YO
4469 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4470 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4471 RExC_precomp = NULL;
a0d0e21e
LW
4472 return(NULL);
4473 }
bbd61b5f
KW
4474
4475 /* Here, finished first pass. Get rid of our setjmp, which we added for
4476 * efficiency only if the passed-in string wasn't in utf8, as shown by
4477 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4478 * 1 here even though the original string wasn't utf8, but in this case
4479 * there will have been a long jump */
4480 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4481 JMPENV_POP;
02daf0ab 4482 }
07be1b83 4483 DEBUG_PARSE_r({
81714fb9
YO
4484 PerlIO_printf(Perl_debug_log,
4485 "Required size %"IVdf" nodes\n"
4486 "Starting second pass (creation)\n",
4487 (IV)RExC_size);
07be1b83
YO
4488 RExC_lastnum=0;
4489 RExC_lastparse=NULL;
4490 });
c277df42
IZ
4491 /* Small enough for pointer-storage convention?
4492 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4493 if (RExC_size >= 0x10000L && RExC_extralen)
4494 RExC_size += RExC_extralen;
c277df42 4495 else
830247a4
IZ
4496 RExC_extralen = 0;
4497 if (RExC_whilem_seen > 15)
4498 RExC_whilem_seen = 15;
a0d0e21e 4499
f9f4320a
YO
4500 /* Allocate space and zero-initialize. Note, the two step process
4501 of zeroing when in debug mode, thus anything assigned has to
4502 happen after that */
d2f13c59 4503 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4504 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4505 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4506 char, regexp_internal);
4507 if ( r == NULL || ri == NULL )
b45f050a 4508 FAIL("Regexp out of space");
0f79a09d
GS
4509#ifdef DEBUGGING
4510 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4511 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4512#else
f8fc2ecf
YO
4513 /* bulk initialize base fields with 0. */
4514 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4515#endif
58e23c8d
YO
4516
4517 /* non-zero initialization begins here */
f8fc2ecf 4518 RXi_SET( r, ri );
f9f4320a 4519 r->engine= RE_ENGINE_PTR;
c737faaf 4520 r->extflags = pm_flags;
bcdf7404 4521 {
f7819f85 4522 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
77ba44fb 4523 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
c5ea2ffa
KW
4524
4525 /* The caret is output if there are any defaults: if not all the STD
4526 * flags are set, or if no character set specifier is needed */
4527 bool has_default =
4528 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4529 || ! has_charset);
bcdf7404 4530 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4531 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4532 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4533 const char *fptr = STD_PAT_MODS; /*"msix"*/
4534 char *p;
fb85c044 4535 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
4536 * on. If more precision is desired, we could do a population count of
4537 * the flags set. This could be done with a small lookup table, or by
4538 * shifting, masking and adding, or even, when available, assembly
4539 * language for a machine-language population count.
4540 * We never output a minus, as all those are defaults, so are
4541 * covered by the caret */
fb85c044 4542 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa
KW
4543 + has_default /* If needs a caret */
4544 + has_charset /* If needs a character set specifier */
bcdf7404
YO
4545 + (sizeof(STD_PAT_MODS) - 1)
4546 + (sizeof("(?:)") - 1);
4547
c5ea2ffa 4548 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 4549 SvPOK_on(rx);
8f6ae13c 4550 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4551 *p++='('; *p++='?';
9de15fec
KW
4552
4553 /* If a default, cover it using the caret */
c5ea2ffa 4554 if (has_default) {
85508812 4555 *p++= DEFAULT_PAT_MOD;
fb85c044 4556 }
c5ea2ffa
KW
4557 if (has_charset) {
4558 if (r->extflags & RXf_PMf_LOCALE) {
4559 *p++ = LOCALE_PAT_MOD;
4560 } else {
4561 *p++ = UNICODE_PAT_MOD;
4562 }
9de15fec 4563 }
f7819f85
A
4564 if (has_p)
4565 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 4566 {
bcdf7404 4567 char ch;
bcdf7404
YO
4568 while((ch = *fptr++)) {
4569 if(reganch & 1)
4570 *p++ = ch;
bcdf7404
YO
4571 reganch >>= 1;
4572 }
bcdf7404
YO
4573 }
4574
28d8d7f4 4575 *p++ = ':';
bb661a58 4576 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4577 assert ((RX_WRAPPED(rx) - p) < 16);
4578 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4579 p += plen;
bcdf7404 4580 if (has_runon)
28d8d7f4
YO
4581 *p++ = '\n';
4582 *p++ = ')';
4583 *p = 0;
fb85c044 4584 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
4585 }
4586
bbe252da 4587 r->intflags = 0;
830247a4 4588 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4589
6bda09f9 4590 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4591 Newxz(RExC_open_parens, RExC_npar,regnode *);
4592 SAVEFREEPV(RExC_open_parens);
4593 Newxz(RExC_close_parens,RExC_npar,regnode *);
4594 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4595 }
4596
4597 /* Useful during FAIL. */
7122b237
YO
4598#ifdef RE_TRACK_PATTERN_OFFSETS
4599 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4600 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4601 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4602 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4603 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4604#endif
4605 SetProgLen(ri,RExC_size);
288b8c02 4606 RExC_rx_sv = rx;
830247a4 4607 RExC_rx = r;
f8fc2ecf 4608 RExC_rxi = ri;
bbce6d69 4609
4610 /* Second pass: emit code. */
c737faaf 4611 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4612 RExC_parse = exp;
4613 RExC_end = xend;
4614 RExC_naughty = 0;
4615 RExC_npar = 1;
f8fc2ecf
YO
4616 RExC_emit_start = ri->program;
4617 RExC_emit = ri->program;
3b57cd43
YO
4618 RExC_emit_bound = ri->program + RExC_size + 1;
4619
2cd61cdb 4620 /* Store the count of eval-groups for security checks: */
f8149455 4621 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4622 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4623 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4624 ReREFCNT_dec(rx);
a0d0e21e 4625 return(NULL);
80757612 4626 }
07be1b83
YO
4627 /* XXXX To minimize changes to RE engine we always allocate
4628 3-units-long substrs field. */
4629 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4630 if (RExC_recurse_count) {
4631 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4632 SAVEFREEPV(RExC_recurse);
4633 }
a0d0e21e 4634
07be1b83 4635reStudy:
e7f38d0f 4636 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 4637 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4638
07be1b83 4639#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4640 if (!restudied) {
4641 StructCopy(&zero_scan_data, &data, scan_data_t);
4642 copyRExC_state = RExC_state;
4643 } else {
5d458dd8 4644 U32 seen=RExC_seen;
07be1b83 4645 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4646
4647 RExC_state = copyRExC_state;
4648 if (seen & REG_TOP_LEVEL_BRANCHES)
4649 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4650 else
4651 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4652 if (data.last_found) {
07be1b83 4653 SvREFCNT_dec(data.longest_fixed);
07be1b83 4654 SvREFCNT_dec(data.longest_float);
07be1b83 4655 SvREFCNT_dec(data.last_found);
1de06328 4656 }
40d049e4 4657 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4658 }
40d049e4
YO
4659#else
4660 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4661#endif
fc8cd66c 4662
a0d0e21e 4663 /* Dig out information for optimizations. */
f7819f85 4664 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4665 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4666
a0ed51b3 4667 if (UTF)
8f6ae13c 4668 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4669 ri->regstclass = NULL;
830247a4 4670 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4671 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4672 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4673
1de06328
YO
4674 /* testing for BRANCH here tells us whether there is "must appear"
4675 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4676 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4677 I32 fake;
c5254dd6 4678 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4679 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4680 int stclass_flag;
07be1b83 4681 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4682 regnode *first= scan;
4683 regnode *first_next= regnext(first);
639081d6
YO
4684 /*
4685 * Skip introductions and multiplicators >= 1
4686 * so that we can extract the 'meat' of the pattern that must
4687 * match in the large if() sequence following.
4688 * NOTE that EXACT is NOT covered here, as it is normally
4689 * picked up by the optimiser separately.
4690 *
4691 * This is unfortunate as the optimiser isnt handling lookahead
4692 * properly currently.
4693 *
4694 */
a0d0e21e 4695 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4696 /* An OR of *one* alternative - should not happen now. */
5339e136 4697 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 4698 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 4699 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
4700 (OP(first) == PLUS) ||
4701 (OP(first) == MINMOD) ||
653099ff 4702 /* An {n,m} with n>0 */
5339e136
YO
4703 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4704 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4705 {
639081d6
YO
4706 /*
4707 * the only op that could be a regnode is PLUS, all the rest
4708 * will be regnode_1 or regnode_2.
4709 *
4710 */
a0d0e21e
LW
4711 if (OP(first) == PLUS)
4712 sawplus = 1;
4713 else
3dab1dad 4714 first += regarglen[OP(first)];
639081d6
YO
4715
4716 first = NEXTOPER(first);
5339e136 4717 first_next= regnext(first);
a687059c
LW
4718 }
4719
a0d0e21e
LW
4720 /* Starting-point info. */
4721 again:
786e8c11 4722 DEBUG_PEEP("first:",first,0);
07be1b83 4723 /* Ignore EXACT as we deal with it later. */
3dab1dad 4724 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4725 if (OP(first) == EXACT)
6f207bd3 4726 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4727 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4728 ri->regstclass = first;
b3c9acc1 4729 }
07be1b83 4730#ifdef TRIE_STCLASS
786e8c11 4731 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4732 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4733 {
786e8c11 4734 regnode *trie_op;
07be1b83 4735 /* this can happen only on restudy */
786e8c11 4736 if ( OP(first) == TRIE ) {
c944940b 4737 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4738 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4739 StructCopy(first,trieop,struct regnode_1);
4740 trie_op=(regnode *)trieop;
4741 } else {
c944940b 4742 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4743 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4744 StructCopy(first,trieop,struct regnode_charclass);
4745 trie_op=(regnode *)trieop;
4746 }
1de06328 4747 OP(trie_op)+=2;
786e8c11 4748 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4749 ri->regstclass = trie_op;
07be1b83
YO
4750 }
4751#endif
e52fc539 4752 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 4753 ri->regstclass = first;
3dab1dad
YO
4754 else if (PL_regkind[OP(first)] == BOUND ||
4755 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4756 ri->regstclass = first;
3dab1dad 4757 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4758 r->extflags |= (OP(first) == MBOL
4759 ? RXf_ANCH_MBOL
cad2e5aa 4760 : (OP(first) == SBOL
bbe252da
YO
4761 ? RXf_ANCH_SBOL
4762 : RXf_ANCH_BOL));
a0d0e21e 4763 first = NEXTOPER(first);
774d564b 4764 goto again;
4765 }
4766 else if (OP(first) == GPOS) {
bbe252da 4767 r->extflags |= RXf_ANCH_GPOS;
774d564b 4768 first = NEXTOPER(first);
4769 goto again;
a0d0e21e 4770 }
cf2a2b69
YO
4771 else if ((!sawopen || !RExC_sawback) &&
4772 (OP(first) == STAR &&
3dab1dad 4773 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4774 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4775 {
4776 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4777 const int type =
4778 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4779 ? RXf_ANCH_MBOL
4780 : RXf_ANCH_SBOL;
4781 r->extflags |= type;
4782 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4783 first = NEXTOPER(first);
774d564b 4784 goto again;
a0d0e21e 4785 }
e7f38d0f 4786 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 4787 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4788 /* x+ must match at the 1st pos of run of x's */
bbe252da 4789 r->intflags |= PREGf_SKIP;
a0d0e21e 4790
c277df42 4791 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4792#ifdef TRIE_STUDY_OPT
81714fb9 4793 DEBUG_PARSE_r(
be8e71aa
YO
4794 if (!restudied)
4795 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4796 (IV)(first - scan + 1))
4797 );
4798#else
81714fb9 4799 DEBUG_PARSE_r(
be8e71aa
YO
4800 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4801 (IV)(first - scan + 1))
4802 );
4803#endif
4804
4805
a0d0e21e
LW
4806 /*
4807 * If there's something expensive in the r.e., find the
4808 * longest literal string that must appear and make it the
4809 * regmust. Resolve ties in favor of later strings, since
4810 * the regstart check works with the beginning of the r.e.
4811 * and avoiding duplication strengthens checking. Not a
4812 * strong reason, but sufficient in the absence of others.
4813 * [Now we resolve ties in favor of the earlier string if
c277df42 4814 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4815 * earlier string may buy us something the later one won't.]
4816 */
de8c5301 4817
396482e1
GA
4818 data.longest_fixed = newSVpvs("");
4819 data.longest_float = newSVpvs("");
4820 data.last_found = newSVpvs("");
c277df42
IZ
4821 data.longest = &(data.longest_fixed);
4822 first = scan;
f8fc2ecf 4823 if (!ri->regstclass) {
830247a4 4824 cl_init(pRExC_state, &ch_class);
653099ff
GS
4825 data.start_class = &ch_class;
4826 stclass_flag = SCF_DO_STCLASS_AND;
4827 } else /* XXXX Check for BOUND? */
4828 stclass_flag = 0;
cb434fcc 4829 data.last_closep = &last_close;
de8c5301 4830
1de06328 4831 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4832 &data, -1, NULL, NULL,
4833 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4834
07be1b83 4835
786e8c11
YO
4836 CHECK_RESTUDY_GOTO;
4837
4838
830247a4 4839 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4840 && data.last_start_min == 0 && data.last_end > 0
830247a4 4841 && !RExC_seen_zerolen
2bf803e2 4842 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4843 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4844 r->extflags |= RXf_CHECK_ALL;
304ee84b 4845 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4846 SvREFCNT_dec(data.last_found);
4847
1de06328
YO
4848 /* Note that code very similar to this but for anchored string
4849 follows immediately below, changes may need to be made to both.
4850 Be careful.
4851 */
a0ed51b3 4852 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4853 if (longest_float_length
c277df42
IZ
4854 || (data.flags & SF_FL_BEFORE_EOL
4855 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4856 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4857 {
1182767e 4858 I32 t,ml;
cf93c79d 4859
1de06328 4860 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4861 && data.offset_fixed == data.offset_float_min
4862 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4863 goto remove_float; /* As in (a)+. */
4864
1de06328
YO
4865 /* copy the information about the longest float from the reg_scan_data
4866 over to the program. */
33b8afdf
JH
4867 if (SvUTF8(data.longest_float)) {
4868 r->float_utf8 = data.longest_float;
c445ea15 4869 r->float_substr = NULL;
33b8afdf
JH
4870 } else {
4871 r->float_substr = data.longest_float;
c445ea15 4872 r->float_utf8 = NULL;
33b8afdf 4873 }
1de06328
YO
4874 /* float_end_shift is how many chars that must be matched that
4875 follow this item. We calculate it ahead of time as once the
4876 lookbehind offset is added in we lose the ability to correctly
4877 calculate it.*/
4878 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4879 : (I32)longest_float_length;
1de06328
YO
4880 r->float_end_shift = ml - data.offset_float_min
4881 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4882 + data.lookbehind_float;
4883 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4884 r->float_max_offset = data.offset_float_max;
1182767e 4885 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4886 r->float_max_offset -= data.lookbehind_float;
4887
cf93c79d
IZ
4888 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4889 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4890 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4891 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4892 }
4893 else {
aca2d497 4894 remove_float:
c445ea15 4895 r->float_substr = r->float_utf8 = NULL;
c277df42 4896 SvREFCNT_dec(data.longest_float);
c5254dd6 4897 longest_float_length = 0;
a0d0e21e 4898 }
c277df42 4899
1de06328
YO
4900 /* Note that code very similar to this but for floating string
4901 is immediately above, changes may need to be made to both.
4902 Be careful.
4903 */
a0ed51b3 4904 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4905 if (longest_fixed_length
c277df42
IZ
4906 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4907 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4908 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4909 {
1182767e 4910 I32 t,ml;
cf93c79d 4911
1de06328
YO
4912 /* copy the information about the longest fixed
4913 from the reg_scan_data over to the program. */
33b8afdf
JH
4914 if (SvUTF8(data.longest_fixed)) {
4915 r->anchored_utf8 = data.longest_fixed;
c445ea15 4916 r->anchored_substr = NULL;
33b8afdf
JH
4917 } else {
4918 r->anchored_substr = data.longest_fixed;
c445ea15 4919 r->anchored_utf8 = NULL;
33b8afdf 4920 }
1de06328
YO
4921 /* fixed_end_shift is how many chars that must be matched that
4922 follow this item. We calculate it ahead of time as once the
4923 lookbehind offset is added in we lose the ability to correctly
4924 calculate it.*/
4925 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4926 : (I32)longest_fixed_length;
1de06328
YO
4927 r->anchored_end_shift = ml - data.offset_fixed
4928 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4929 + data.lookbehind_fixed;
4930 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4931
cf93c79d
IZ
4932 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4933 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4934 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4935 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4936 }
4937 else {
c445ea15 4938 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4939 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4940 longest_fixed_length = 0;
a0d0e21e 4941 }
f8fc2ecf
YO
4942 if (ri->regstclass
4943 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4944 ri->regstclass = NULL;
33b8afdf
JH
4945 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4946 && stclass_flag
653099ff 4947 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4948 && !cl_is_anything(data.start_class))
4949 {
2eccd3b2 4950 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4951
f8fc2ecf 4952 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4953 struct regnode_charclass_class);
4954 StructCopy(data.start_class,
f8fc2ecf 4955 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4956 struct regnode_charclass_class);
f8fc2ecf 4957 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4958 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4959 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4960 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4961 PerlIO_printf(Perl_debug_log,
a0288114 4962 "synthetic stclass \"%s\".\n",
3f7c398e 4963 SvPVX_const(sv));});
653099ff 4964 }
c277df42
IZ
4965
4966 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4967 if (longest_fixed_length > longest_float_length) {
1de06328 4968 r->check_end_shift = r->anchored_end_shift;
c277df42 4969 r->check_substr = r->anchored_substr;
33b8afdf 4970 r->check_utf8 = r->anchored_utf8;
c277df42 4971 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4972 if (r->extflags & RXf_ANCH_SINGLE)
4973 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4974 }
4975 else {
1de06328 4976 r->check_end_shift = r->float_end_shift;
c277df42 4977 r->check_substr = r->float_substr;
33b8afdf 4978 r->check_utf8 = r->float_utf8;
1de06328
YO
4979 r->check_offset_min = r->float_min_offset;
4980 r->check_offset_max = r->float_max_offset;
a0d0e21e 4981 }
30382c73
IZ
4982 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4983 This should be changed ASAP! */
bbe252da
YO
4984 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4985 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4986 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4987 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4988 }
1de06328
YO
4989 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4990 if ( (STRLEN)minlen < longest_float_length )
4991 minlen= longest_float_length;
4992 if ( (STRLEN)minlen < longest_fixed_length )
4993 minlen= longest_fixed_length;
4994 */
a0ed51b3
LW
4995 }
4996 else {
c277df42
IZ
4997 /* Several toplevels. Best we can is to set minlen. */
4998 I32 fake;
653099ff 4999 struct regnode_charclass_class ch_class;
cb434fcc 5000 I32 last_close = 0;
c277df42 5001
5d458dd8 5002 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5003
f8fc2ecf 5004 scan = ri->program + 1;
830247a4 5005 cl_init(pRExC_state, &ch_class);
653099ff 5006 data.start_class = &ch_class;
cb434fcc 5007 data.last_closep = &last_close;
07be1b83 5008
de8c5301 5009
1de06328 5010 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5011 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5012
786e8c11 5013 CHECK_RESTUDY_GOTO;
07be1b83 5014
33b8afdf 5015 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5016 = r->float_substr = r->float_utf8 = NULL;
653099ff 5017 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5018 && !cl_is_anything(data.start_class))
5019 {
2eccd3b2 5020 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 5021
f8fc2ecf 5022 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5023 struct regnode_charclass_class);
5024 StructCopy(data.start_class,
f8fc2ecf 5025 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5026 struct regnode_charclass_class);
f8fc2ecf 5027 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5028 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5029 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5030 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5031 PerlIO_printf(Perl_debug_log,
a0288114 5032 "synthetic stclass \"%s\".\n",
3f7c398e 5033 SvPVX_const(sv));});
653099ff 5034 }
a0d0e21e
LW
5035 }
5036
1de06328
YO
5037 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5038 the "real" pattern. */
cf9788e3
RGS
5039 DEBUG_OPTIMISE_r({
5040 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5041 (IV)minlen, (IV)r->minlen);
cf9788e3 5042 });
de8c5301 5043 r->minlenret = minlen;
1de06328
YO
5044 if (r->minlen < minlen)
5045 r->minlen = minlen;
5046
b81d288d 5047 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5048 r->extflags |= RXf_GPOS_SEEN;
830247a4 5049 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5050 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5051 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5052 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5053 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5054 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5055 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5056 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5057 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5058 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5059 if (RExC_paren_names)
85fbaab2 5060 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5061 else
5daac39c 5062 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5063
7bd1e614 5064#ifdef STUPID_PATTERN_CHECKS
5509d87a 5065 if (RX_PRELEN(rx) == 0)
640f820d 5066 r->extflags |= RXf_NULL;
5509d87a 5067 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5068 /* XXX: this should happen BEFORE we compile */
5069 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5070 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5071 r->extflags |= RXf_WHITE;
5509d87a 5072 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5073 r->extflags |= RXf_START_ONLY;
f1b875a0 5074#else
5509d87a 5075 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5076 /* XXX: this should happen BEFORE we compile */
5077 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5078 else {
5079 regnode *first = ri->program + 1;
39aa8307
JH
5080 U8 fop = OP(first);
5081 U8 nop = OP(NEXTOPER(first));
7bd1e614 5082
640f820d
AB
5083 if (PL_regkind[fop] == NOTHING && nop == END)
5084 r->extflags |= RXf_NULL;
5085 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
5086 r->extflags |= RXf_START_ONLY;
5087 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5088 r->extflags |= RXf_WHITE;
5089 }
f1b875a0 5090#endif
1f1031fe
YO
5091#ifdef DEBUGGING
5092 if (RExC_paren_names) {
af534a04 5093 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5094 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5095 } else
1f1031fe 5096#endif
cde0cee5 5097 ri->name_list_idx = 0;
1f1031fe 5098
40d049e4
YO
5099 if (RExC_recurse_count) {
5100 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5101 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5102 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5103 }
5104 }
f0ab9afb 5105 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5106 /* assume we don't need to swap parens around before we match */
5107
be8e71aa
YO
5108 DEBUG_DUMP_r({
5109 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5110 regdump(r);
5111 });
7122b237
YO
5112#ifdef RE_TRACK_PATTERN_OFFSETS
5113 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5114 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5115 U32 i;
5116 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5117 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5118 for (i = 1; i <= len; i++) {
7122b237 5119 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5120 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5121 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5122 }
5123 PerlIO_printf(Perl_debug_log, "\n");
5124 });
7122b237 5125#endif
288b8c02 5126 return rx;
a687059c
LW
5127}
5128
f9f4320a 5129#undef RE_ENGINE_PTR
3dab1dad 5130
93b32b6d 5131
81714fb9 5132SV*
192b9cd1
AB
5133Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5134 const U32 flags)
5135{
7918f24d
NC
5136 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5137
192b9cd1
AB
5138 PERL_UNUSED_ARG(value);
5139
f1b875a0 5140 if (flags & RXapif_FETCH) {
192b9cd1 5141 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5142 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5143 Perl_croak_no_modify(aTHX);
192b9cd1 5144 return NULL;
f1b875a0 5145 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5146 return reg_named_buff_exists(rx, key, flags)
5147 ? &PL_sv_yes
5148 : &PL_sv_no;
f1b875a0 5149 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5150 return reg_named_buff_all(rx, flags);
f1b875a0 5151 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5152 return reg_named_buff_scalar(rx, flags);
5153 } else {
5154 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5155 return NULL;
5156 }
5157}
5158
5159SV*
5160Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5161 const U32 flags)
5162{
7918f24d 5163 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5164 PERL_UNUSED_ARG(lastkey);
5165
f1b875a0 5166 if (flags & RXapif_FIRSTKEY)
192b9cd1 5167 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5168 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5169 return reg_named_buff_nextkey(rx, flags);
5170 else {
5171 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5172 return NULL;
5173 }
5174}
5175
5176SV*
288b8c02
NC
5177Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5178 const U32 flags)
81714fb9 5179{
44a2ac75
YO
5180 AV *retarray = NULL;
5181 SV *ret;
288b8c02 5182 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5183
5184 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5185
f1b875a0 5186 if (flags & RXapif_ALL)
44a2ac75 5187 retarray=newAV();
93b32b6d 5188
5daac39c
NC
5189 if (rx && RXp_PAREN_NAMES(rx)) {
5190 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5191 if (he_str) {
5192 IV i;
5193 SV* sv_dat=HeVAL(he_str);
5194 I32 *nums=(I32*)SvPVX(sv_dat);
5195 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5196 if ((I32)(rx->nparens) >= nums[i]
5197 && rx->offs[nums[i]].start != -1
5198 && rx->offs[nums[i]].end != -1)
93b32b6d 5199 {
49d7dfbc 5200 ret = newSVpvs("");
288b8c02 5201 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5202 if (!retarray)
5203 return ret;
5204 } else {
5205 ret = newSVsv(&PL_sv_undef);
5206 }
ec83ea38 5207 if (retarray)
93b32b6d 5208 av_push(retarray, ret);
81714fb9 5209 }
93b32b6d 5210 if (retarray)
ad64d0ec 5211 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5212 }
5213 }
5214 return NULL;
5215}
5216
5217bool
288b8c02 5218Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5219 const U32 flags)
5220{
288b8c02 5221 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5222
5223 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5224
5daac39c 5225 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5226 if (flags & RXapif_ALL) {
5daac39c 5227 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5228 } else {
288b8c02 5229 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5230 if (sv) {
5231 SvREFCNT_dec(sv);
192b9cd1
AB
5232 return TRUE;
5233 } else {
5234 return FALSE;
5235 }
5236 }
5237 } else {
5238 return FALSE;
5239 }
5240}
5241
5242SV*
288b8c02 5243Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5244{
288b8c02 5245 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5246
5247 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5248
5daac39c
NC
5249 if ( rx && RXp_PAREN_NAMES(rx) ) {
5250 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5251
288b8c02 5252 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5253 } else {
5254 return FALSE;
5255 }
192b9cd1
AB
5256}
5257
5258SV*
288b8c02 5259Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5260{
288b8c02 5261 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5262 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5263
5264 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5265
5daac39c
NC
5266 if (rx && RXp_PAREN_NAMES(rx)) {
5267 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5268 HE *temphe;
5269 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5270 IV i;
5271 IV parno = 0;
5272 SV* sv_dat = HeVAL(temphe);
5273 I32 *nums = (I32*)SvPVX(sv_dat);
5274 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5275 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5276 rx->offs[nums[i]].start != -1 &&
5277 rx->offs[nums[i]].end != -1)
5278 {
5279 parno = nums[i];
5280 break;
5281 }
5282 }
f1b875a0 5283 if (parno || flags & RXapif_ALL) {
a663657d 5284 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5285 }
81714fb9
YO
5286 }
5287 }
44a2ac75
YO
5288 return NULL;
5289}
5290
192b9cd1 5291SV*
288b8c02 5292Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5293{
5294 SV *ret;
5295 AV *av;
5296 I32 length;
288b8c02 5297 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5298
7918f24d
NC
5299 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5300
5daac39c 5301 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5302 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5303 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5304 } else if (flags & RXapif_ONE) {
288b8c02 5305 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5306 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5307 length = av_len(av);
ec83ea38 5308 SvREFCNT_dec(ret);
192b9cd1
AB
5309 return newSViv(length + 1);
5310 } else {
5311 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5312 return NULL;
5313 }
5314 }
5315 return &PL_sv_undef;
5316}
5317
5318SV*
288b8c02 5319Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5320{
288b8c02 5321 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5322 AV *av = newAV();
5323
7918f24d
NC
5324 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5325
5daac39c
NC
5326 if (rx && RXp_PAREN_NAMES(rx)) {
5327 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5328 HE *temphe;
5329 (void)hv_iterinit(hv);
5330 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5331 IV i;
5332 IV parno = 0;
5333 SV* sv_dat = HeVAL(temphe);
5334 I32 *nums = (I32*)SvPVX(sv_dat);
5335 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5336 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5337 rx->offs[nums[i]].start != -1 &&
5338 rx->offs[nums[i]].end != -1)
5339 {
5340 parno = nums[i];
5341 break;
5342 }
5343 }
f1b875a0 5344 if (parno || flags & RXapif_ALL) {
a663657d 5345 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5346 }
5347 }
5348 }
5349
ad64d0ec 5350 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5351}
5352
49d7dfbc 5353void
288b8c02
NC
5354Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5355 SV * const sv)
44a2ac75 5356{
288b8c02 5357 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5358 char *s = NULL;
a9d504c3 5359 I32 i = 0;
44a2ac75 5360 I32 s1, t1;
7918f24d
NC
5361
5362 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5363
cde0cee5
YO
5364 if (!rx->subbeg) {
5365 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5366 return;
cde0cee5
YO
5367 }
5368 else
f1b875a0 5369 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5370 /* $` */
f0ab9afb 5371 i = rx->offs[0].start;
cde0cee5 5372 s = rx->subbeg;
44a2ac75
YO
5373 }
5374 else
f1b875a0 5375 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5376 /* $' */
f0ab9afb
NC
5377 s = rx->subbeg + rx->offs[0].end;
5378 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5379 }
5380 else
5381 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5382 (s1 = rx->offs[paren].start) != -1 &&
5383 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5384 {
5385 /* $& $1 ... */
5386 i = t1 - s1;
5387 s = rx->subbeg + s1;
cde0cee5
YO
5388 } else {
5389 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5390 return;
cde0cee5
YO
5391 }
5392 assert(rx->sublen >= (s - rx->subbeg) + i );
5393 if (i >= 0) {
5394 const int oldtainted = PL_tainted;
5395 TAINT_NOT;
5396 sv_setpvn(sv, s, i);
5397 PL_tainted = oldtainted;
5398 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5399 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5400 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5401 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5402 {
5403 SvUTF8_on(sv);
5404 }
5405 else
5406 SvUTF8_off(sv);
5407 if (PL_tainting) {
07bc277f 5408 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5409 if (SvTYPE(sv) >= SVt_PVMG) {
5410 MAGIC* const mg = SvMAGIC(sv);
5411 MAGIC* mgt;
5412 PL_tainted = 1;
5413 SvMAGIC_set(sv, mg->mg_moremagic);
5414 SvTAINT(sv);
5415 if ((mgt = SvMAGIC(sv))) {
5416 mg->mg_moremagic = mgt;
5417 SvMAGIC_set(sv, mg);
44a2ac75 5418 }
cde0cee5
YO
5419 } else {
5420 PL_tainted = 1;
5421 SvTAINT(sv);
5422 }
5423 } else
5424 SvTAINTED_off(sv);
44a2ac75 5425 }
81714fb9 5426 } else {
44a2ac75 5427 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5428 return;
81714fb9
YO
5429 }
5430}
93b32b6d 5431
2fdbfb4d
AB
5432void
5433Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5434 SV const * const value)
5435{
7918f24d
NC
5436 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5437
2fdbfb4d
AB
5438 PERL_UNUSED_ARG(rx);
5439 PERL_UNUSED_ARG(paren);
5440 PERL_UNUSED_ARG(value);
5441
5442 if (!PL_localizing)
6ad8f254 5443 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5444}
5445
5446I32
288b8c02 5447Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5448 const I32 paren)
5449{
288b8c02 5450 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5451 I32 i;
5452 I32 s1, t1;
5453
7918f24d
NC
5454 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5455
2fdbfb4d
AB
5456 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5457 switch (paren) {
192b9cd1 5458 /* $` / ${^PREMATCH} */
f1b875a0 5459 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5460 if (rx->offs[0].start != -1) {
5461 i = rx->offs[0].start;
5462 if (i > 0) {
5463 s1 = 0;
5464 t1 = i;
5465 goto getlen;
5466 }
5467 }
5468 return 0;
192b9cd1 5469 /* $' / ${^POSTMATCH} */
f1b875a0 5470 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5471 if (rx->offs[0].end != -1) {
5472 i = rx->sublen - rx->offs[0].end;
5473 if (i > 0) {
5474 s1 = rx->offs[0].end;
5475 t1 = rx->sublen;
5476 goto getlen;
5477 }
5478 }
5479 return 0;
192b9cd1
AB
5480 /* $& / ${^MATCH}, $1, $2, ... */
5481 default:
2fdbfb4d
AB
5482 if (paren <= (I32)rx->nparens &&
5483 (s1 = rx->offs[paren].start) != -1 &&
5484 (t1 = rx->offs[paren].end) != -1)
5485 {
5486 i = t1 - s1;
5487 goto getlen;
5488 } else {
5489 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5490 report_uninit((const SV *)sv);
2fdbfb4d
AB
5491 return 0;
5492 }
5493 }
5494 getlen:
07bc277f 5495 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5496 const char * const s = rx->subbeg + s1;
5497 const U8 *ep;
5498 STRLEN el;
5499
5500 i = t1 - s1;
5501 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5502 i = el;
5503 }
5504 return i;
5505}
5506
fe578d7f 5507SV*
49d7dfbc 5508Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5509{
7918f24d 5510 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5511 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5512 if (0)
5513 return NULL;
5514 else
5515 return newSVpvs("Regexp");
fe578d7f 5516}
0a4db386 5517
894be9b7 5518/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5519 * If flags is REG_RSN_RETURN_NULL returns null.
5520 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5521 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5522 * to the parsed name as looked up in the RExC_paren_names hash.
5523 * If there is an error throws a vFAIL().. type exception.
894be9b7 5524 */
0a4db386
YO
5525
5526#define REG_RSN_RETURN_NULL 0
5527#define REG_RSN_RETURN_NAME 1
5528#define REG_RSN_RETURN_DATA 2
5529
894be9b7 5530STATIC SV*
7918f24d
NC
5531S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5532{
894be9b7 5533 char *name_start = RExC_parse;
1f1031fe 5534
7918f24d
NC
5535 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5536
1f1031fe
YO
5537 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5538 /* skip IDFIRST by using do...while */
5539 if (UTF)
5540 do {
5541 RExC_parse += UTF8SKIP(RExC_parse);
5542 } while (isALNUM_utf8((U8*)RExC_parse));
5543 else
5544 do {
5545 RExC_parse++;
5546 } while (isALNUM(*RExC_parse));
894be9b7 5547 }
1f1031fe 5548
0a4db386 5549 if ( flags ) {
59cd0e26
NC
5550 SV* sv_name
5551 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5552 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5553 if ( flags == REG_RSN_RETURN_NAME)
5554 return sv_name;
5555 else if (flags==REG_RSN_RETURN_DATA) {
5556 HE *he_str = NULL;
5557 SV *sv_dat = NULL;
5558 if ( ! sv_name ) /* should not happen*/
5559 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5560 if (RExC_paren_names)
5561 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5562 if ( he_str )
5563 sv_dat = HeVAL(he_str);
5564 if ( ! sv_dat )
5565 vFAIL("Reference to nonexistent named group");
5566 return sv_dat;
5567 }
5568 else {
5569 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5570 }
5571 /* NOT REACHED */
894be9b7 5572 }
0a4db386 5573 return NULL;
894be9b7
YO
5574}
5575
3dab1dad
YO
5576#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5577 int rem=(int)(RExC_end - RExC_parse); \
5578 int cut; \
5579 int num; \
5580 int iscut=0; \
5581 if (rem>10) { \
5582 rem=10; \
5583 iscut=1; \
5584 } \
5585 cut=10-rem; \
5586 if (RExC_lastparse!=RExC_parse) \
5587 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5588 rem, RExC_parse, \
5589 cut + 4, \
5590 iscut ? "..." : "<" \
5591 ); \
5592 else \
5593 PerlIO_printf(Perl_debug_log,"%16s",""); \
5594 \
5595 if (SIZE_ONLY) \
3b57cd43 5596 num = RExC_size + 1; \
3dab1dad
YO
5597 else \
5598 num=REG_NODE_NUM(RExC_emit); \
5599 if (RExC_lastnum!=num) \
0a4db386 5600 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5601 else \
0a4db386 5602 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5603 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5604 (int)((depth*2)), "", \
3dab1dad
YO
5605 (funcname) \
5606 ); \
5607 RExC_lastnum=num; \
5608 RExC_lastparse=RExC_parse; \
5609})
5610
07be1b83
YO
5611
5612
3dab1dad
YO
5613#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5614 DEBUG_PARSE_MSG((funcname)); \
5615 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5616})
6bda09f9
YO
5617#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5618 DEBUG_PARSE_MSG((funcname)); \
5619 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5620})
a687059c
LW
5621/*
5622 - reg - regular expression, i.e. main body or parenthesized thing
5623 *
5624 * Caller must absorb opening parenthesis.
5625 *
5626 * Combining parenthesis handling with the base level of regular expression
5627 * is a trifle forced, but the need to tie the tails of the branches to what
5628 * follows makes it hard to avoid.
5629 */
07be1b83
YO
5630#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5631#ifdef DEBUGGING
5632#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5633#else
5634#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5635#endif
3dab1dad 5636
76e3520e 5637STATIC regnode *
3dab1dad 5638S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5639 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5640{
27da23d5 5641 dVAR;
c277df42
IZ
5642 register regnode *ret; /* Will be the head of the group. */
5643 register regnode *br;
5644 register regnode *lastbr;
cbbf8932 5645 register regnode *ender = NULL;
a0d0e21e 5646 register I32 parno = 0;
cbbf8932 5647 I32 flags;
f7819f85 5648 U32 oregflags = RExC_flags;
6136c704
AL
5649 bool have_branch = 0;
5650 bool is_open = 0;
594d7033
YO
5651 I32 freeze_paren = 0;
5652 I32 after_freeze = 0;
9d1d55b5
JP
5653
5654 /* for (?g), (?gc), and (?o) warnings; warning
5655 about (?c) will warn about (?g) -- japhy */
5656
6136c704
AL
5657#define WASTED_O 0x01
5658#define WASTED_G 0x02
5659#define WASTED_C 0x04
5660#define WASTED_GC (0x02|0x04)
cbbf8932 5661 I32 wastedflags = 0x00;
9d1d55b5 5662
fac92740 5663 char * parse_start = RExC_parse; /* MJD */
a28509cc 5664 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5665
3dab1dad 5666 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5667
5668 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5669 DEBUG_PARSE("reg ");
5670
821b33a5 5671 *flagp = 0; /* Tentatively. */
a0d0e21e 5672
9d1d55b5 5673
a0d0e21e
LW
5674 /* Make an OPEN node, if parenthesized. */
5675 if (paren) {
e2e6a0f1
YO
5676 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5677 char *start_verb = RExC_parse;
5678 STRLEN verb_len = 0;
5679 char *start_arg = NULL;
5680 unsigned char op = 0;
5681 int argok = 1;
5682 int internal_argval = 0; /* internal_argval is only useful if !argok */
5683 while ( *RExC_parse && *RExC_parse != ')' ) {
5684 if ( *RExC_parse == ':' ) {
5685 start_arg = RExC_parse + 1;
5686 break;
5687 }
5688 RExC_parse++;
5689 }
5690 ++start_verb;
5691 verb_len = RExC_parse - start_verb;
5692 if ( start_arg ) {
5693 RExC_parse++;
5694 while ( *RExC_parse && *RExC_parse != ')' )
5695 RExC_parse++;
5696 if ( *RExC_parse != ')' )
5697 vFAIL("Unterminated verb pattern argument");
5698 if ( RExC_parse == start_arg )
5699 start_arg = NULL;
5700 } else {
5701 if ( *RExC_parse != ')' )
5702 vFAIL("Unterminated verb pattern");
5703 }
5d458dd8 5704
e2e6a0f1
YO
5705 switch ( *start_verb ) {
5706 case 'A': /* (*ACCEPT) */
568a785a 5707 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5708 op = ACCEPT;
5709 internal_argval = RExC_nestroot;
5710 }
5711 break;
5712 case 'C': /* (*COMMIT) */
568a785a 5713 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5714 op = COMMIT;
e2e6a0f1
YO
5715 break;
5716 case 'F': /* (*FAIL) */
568a785a 5717 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5718 op = OPFAIL;
5719 argok = 0;
5720 }
5721 break;
5d458dd8
YO
5722 case ':': /* (*:NAME) */
5723 case 'M': /* (*MARK:NAME) */
568a785a 5724 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5725 op = MARKPOINT;
5d458dd8
YO
5726 argok = -1;
5727 }
5728 break;
5729 case 'P': /* (*PRUNE) */
568a785a 5730 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5731 op = PRUNE;
e2e6a0f1 5732 break;
5d458dd8 5733 case 'S': /* (*SKIP) */
568a785a 5734 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5735 op = SKIP;
5736 break;
5737 case 'T': /* (*THEN) */
5738 /* [19:06] <TimToady> :: is then */
568a785a 5739 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5740 op = CUTGROUP;
5741 RExC_seen |= REG_SEEN_CUTGROUP;
5742 }
e2e6a0f1
YO
5743 break;
5744 }
5745 if ( ! op ) {
5746 RExC_parse++;
5747 vFAIL3("Unknown verb pattern '%.*s'",
5748 verb_len, start_verb);
5749 }
5750 if ( argok ) {
5751 if ( start_arg && internal_argval ) {
5752 vFAIL3("Verb pattern '%.*s' may not have an argument",
5753 verb_len, start_verb);
5754 } else if ( argok < 0 && !start_arg ) {
5755 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5756 verb_len, start_verb);
5757 } else {
5758 ret = reganode(pRExC_state, op, internal_argval);
5759 if ( ! internal_argval && ! SIZE_ONLY ) {
5760 if (start_arg) {
5761 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5762 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5763 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5764 ret->flags = 0;
5765 } else {
5766 ret->flags = 1;
5767 }
5768 }
5769 }
5770 if (!internal_argval)
5771 RExC_seen |= REG_SEEN_VERBARG;
5772 } else if ( start_arg ) {
5773 vFAIL3("Verb pattern '%.*s' may not have an argument",
5774 verb_len, start_verb);
5775 } else {
5776 ret = reg_node(pRExC_state, op);
5777 }
5778 nextchar(pRExC_state);
5779 return ret;
5780 } else
fac92740 5781 if (*RExC_parse == '?') { /* (?...) */
6136c704 5782 bool is_logical = 0;
a28509cc 5783 const char * const seqstart = RExC_parse;
fb85c044 5784 bool has_use_defaults = FALSE;
ca9dfc88 5785
830247a4
IZ
5786 RExC_parse++;
5787 paren = *RExC_parse++;
c277df42 5788 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5789 switch (paren) {
894be9b7 5790
1f1031fe
YO
5791 case 'P': /* (?P...) variants for those used to PCRE/Python */
5792 paren = *RExC_parse++;
5793 if ( paren == '<') /* (?P<...>) named capture */
5794 goto named_capture;
5795 else if (paren == '>') { /* (?P>name) named recursion */
5796 goto named_recursion;
5797 }
5798 else if (paren == '=') { /* (?P=...) named backref */
5799 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5800 you change this make sure you change that */
5801 char* name_start = RExC_parse;
5802 U32 num = 0;
5803 SV *sv_dat = reg_scan_name(pRExC_state,
5804 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5805 if (RExC_parse == name_start || *RExC_parse != ')')
5806 vFAIL2("Sequence %.3s... not terminated",parse_start);
5807
5808 if (!SIZE_ONLY) {
5809 num = add_data( pRExC_state, 1, "S" );
5810 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5811 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5812 }
5813 RExC_sawback = 1;
5814 ret = reganode(pRExC_state,
5815 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5816 num);
5817 *flagp |= HASWIDTH;
5818
5819 Set_Node_Offset(ret, parse_start+1);
5820 Set_Node_Cur_Length(ret); /* MJD */
5821
5822 nextchar(pRExC_state);
5823 return ret;
5824 }
57b84237
YO
5825 RExC_parse++;
5826 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5827 /*NOTREACHED*/
5828 case '<': /* (?<...) */
b81d288d 5829 if (*RExC_parse == '!')
c277df42 5830 paren = ',';
0a4db386 5831 else if (*RExC_parse != '=')
1f1031fe 5832 named_capture:
0a4db386 5833 { /* (?<...>) */
81714fb9 5834 char *name_start;
894be9b7 5835 SV *svname;
81714fb9
YO
5836 paren= '>';
5837 case '\'': /* (?'...') */
5838 name_start= RExC_parse;
0a4db386
YO
5839 svname = reg_scan_name(pRExC_state,
5840 SIZE_ONLY ? /* reverse test from the others */
5841 REG_RSN_RETURN_NAME :
5842 REG_RSN_RETURN_NULL);
57b84237
YO
5843 if (RExC_parse == name_start) {
5844 RExC_parse++;
5845 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5846 /*NOTREACHED*/
5847 }
81714fb9
YO
5848 if (*RExC_parse != paren)
5849 vFAIL2("Sequence (?%c... not terminated",
5850 paren=='>' ? '<' : paren);
5851 if (SIZE_ONLY) {
e62cc96a
YO
5852 HE *he_str;
5853 SV *sv_dat = NULL;
894be9b7
YO
5854 if (!svname) /* shouldnt happen */
5855 Perl_croak(aTHX_
5856 "panic: reg_scan_name returned NULL");
81714fb9
YO
5857 if (!RExC_paren_names) {
5858 RExC_paren_names= newHV();
ad64d0ec 5859 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5860#ifdef DEBUGGING
5861 RExC_paren_name_list= newAV();
ad64d0ec 5862 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5863#endif
81714fb9
YO
5864 }
5865 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5866 if ( he_str )
81714fb9 5867 sv_dat = HeVAL(he_str);
e62cc96a 5868 if ( ! sv_dat ) {
81714fb9 5869 /* croak baby croak */
e62cc96a
YO
5870 Perl_croak(aTHX_
5871 "panic: paren_name hash element allocation failed");
5872 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5873 /* (?|...) can mean we have dupes so scan to check
5874 its already been stored. Maybe a flag indicating
5875 we are inside such a construct would be useful,
5876 but the arrays are likely to be quite small, so
5877 for now we punt -- dmq */
5878 IV count = SvIV(sv_dat);
5879 I32 *pv = (I32*)SvPVX(sv_dat);
5880 IV i;
5881 for ( i = 0 ; i < count ; i++ ) {
5882 if ( pv[i] == RExC_npar ) {
5883 count = 0;
5884 break;
5885 }
5886 }
5887 if ( count ) {
5888 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5889 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5890 pv[count] = RExC_npar;
3a92e6ae 5891 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5892 }
81714fb9
YO
5893 } else {
5894 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5895 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5896 SvIOK_on(sv_dat);
3ec35e0f 5897 SvIV_set(sv_dat, 1);
e62cc96a 5898 }
1f1031fe
YO
5899#ifdef DEBUGGING
5900 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5901 SvREFCNT_dec(svname);
5902#endif
e62cc96a 5903
81714fb9
YO
5904 /*sv_dump(sv_dat);*/
5905 }
5906 nextchar(pRExC_state);
5907 paren = 1;
5908 goto capturing_parens;
5909 }
5910 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5911 RExC_parse++;
fac92740 5912 case '=': /* (?=...) */
89c6a13e 5913 RExC_seen_zerolen++;
5c3fa2e7 5914 break;
fac92740 5915 case '!': /* (?!...) */
830247a4 5916 RExC_seen_zerolen++;
e2e6a0f1
YO
5917 if (*RExC_parse == ')') {
5918 ret=reg_node(pRExC_state, OPFAIL);
5919 nextchar(pRExC_state);
5920 return ret;
5921 }
594d7033
YO
5922 break;
5923 case '|': /* (?|...) */
5924 /* branch reset, behave like a (?:...) except that
5925 buffers in alternations share the same numbers */
5926 paren = ':';
5927 after_freeze = freeze_paren = RExC_npar;
5928 break;
fac92740
MJD
5929 case ':': /* (?:...) */
5930 case '>': /* (?>...) */
a0d0e21e 5931 break;
fac92740
MJD
5932 case '$': /* (?$...) */
5933 case '@': /* (?@...) */
8615cb43 5934 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5935 break;
fac92740 5936 case '#': /* (?#...) */
830247a4
IZ
5937 while (*RExC_parse && *RExC_parse != ')')
5938 RExC_parse++;
5939 if (*RExC_parse != ')')
c277df42 5940 FAIL("Sequence (?#... not terminated");
830247a4 5941 nextchar(pRExC_state);
a0d0e21e
LW
5942 *flagp = TRYAGAIN;
5943 return NULL;
894be9b7
YO
5944 case '0' : /* (?0) */
5945 case 'R' : /* (?R) */
5946 if (*RExC_parse != ')')
6bda09f9 5947 FAIL("Sequence (?R) not terminated");
1a147d38 5948 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5949 *flagp |= POSTPONED;
7f69552c
YO
5950 nextchar(pRExC_state);
5951 return ret;
5952 /*notreached*/
894be9b7
YO
5953 { /* named and numeric backreferences */
5954 I32 num;
894be9b7
YO
5955 case '&': /* (?&NAME) */
5956 parse_start = RExC_parse - 1;
1f1031fe 5957 named_recursion:
894be9b7 5958 {
0a4db386
YO
5959 SV *sv_dat = reg_scan_name(pRExC_state,
5960 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5961 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5962 }
5963 goto gen_recurse_regop;
5964 /* NOT REACHED */
542fa716
YO
5965 case '+':
5966 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5967 RExC_parse++;
5968 vFAIL("Illegal pattern");
5969 }
5970 goto parse_recursion;
5971 /* NOT REACHED*/
5972 case '-': /* (?-1) */
5973 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5974 RExC_parse--; /* rewind to let it be handled later */
5975 goto parse_flags;
5976 }
5977 /*FALLTHROUGH */
6bda09f9
YO
5978 case '1': case '2': case '3': case '4': /* (?1) */
5979 case '5': case '6': case '7': case '8': case '9':
5980 RExC_parse--;
542fa716 5981 parse_recursion:
894be9b7
YO
5982 num = atoi(RExC_parse);
5983 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5984 if (*RExC_parse == '-')
5985 RExC_parse++;
6bda09f9
YO
5986 while (isDIGIT(*RExC_parse))
5987 RExC_parse++;
5988 if (*RExC_parse!=')')
5989 vFAIL("Expecting close bracket");
894be9b7
YO
5990
5991 gen_recurse_regop:
542fa716
YO
5992 if ( paren == '-' ) {
5993 /*
5994 Diagram of capture buffer numbering.
5995 Top line is the normal capture buffer numbers
3b753521 5996 Bottom line is the negative indexing as from
542fa716
YO
5997 the X (the (?-2))
5998
5999 + 1 2 3 4 5 X 6 7
6000 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6001 - 5 4 3 2 1 X x x
6002
6003 */
6004 num = RExC_npar + num;
6005 if (num < 1) {
6006 RExC_parse++;
6007 vFAIL("Reference to nonexistent group");
6008 }
6009 } else if ( paren == '+' ) {
6010 num = RExC_npar + num - 1;
6011 }
6012
1a147d38 6013 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
6014 if (!SIZE_ONLY) {
6015 if (num > (I32)RExC_rx->nparens) {
6016 RExC_parse++;
6017 vFAIL("Reference to nonexistent group");
6018 }
40d049e4 6019 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 6020 RExC_emit++;
226de585 6021 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 6022 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 6023 } else {
6bda09f9 6024 RExC_size++;
6bda09f9 6025 }
0a4db386 6026 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 6027 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
6028 Set_Node_Offset(ret, parse_start); /* MJD */
6029
a3b492c3 6030 *flagp |= POSTPONED;
6bda09f9
YO
6031 nextchar(pRExC_state);
6032 return ret;
894be9b7
YO
6033 } /* named and numeric backreferences */
6034 /* NOT REACHED */
6035
fac92740 6036 case '?': /* (??...) */
6136c704 6037 is_logical = 1;
57b84237
YO
6038 if (*RExC_parse != '{') {
6039 RExC_parse++;
6040 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6041 /*NOTREACHED*/
6042 }
a3b492c3 6043 *flagp |= POSTPONED;
830247a4 6044 paren = *RExC_parse++;
0f5d15d6 6045 /* FALL THROUGH */
fac92740 6046 case '{': /* (?{...}) */
c277df42 6047 {
2eccd3b2
NC
6048 I32 count = 1;
6049 U32 n = 0;
c277df42 6050 char c;
830247a4 6051 char *s = RExC_parse;
c277df42 6052
830247a4
IZ
6053 RExC_seen_zerolen++;
6054 RExC_seen |= REG_SEEN_EVAL;
6055 while (count && (c = *RExC_parse)) {
6136c704
AL
6056 if (c == '\\') {
6057 if (RExC_parse[1])
6058 RExC_parse++;
6059 }
b81d288d 6060 else if (c == '{')
c277df42 6061 count++;
b81d288d 6062 else if (c == '}')
c277df42 6063 count--;
830247a4 6064 RExC_parse++;
c277df42 6065 }
6136c704 6066 if (*RExC_parse != ')') {
b81d288d 6067 RExC_parse = s;
b45f050a
JF
6068 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6069 }
c277df42 6070 if (!SIZE_ONLY) {
f3548bdc 6071 PAD *pad;
6136c704
AL
6072 OP_4tree *sop, *rop;
6073 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 6074
569233ed
SB
6075 ENTER;
6076 Perl_save_re_context(aTHX);
f3548bdc 6077 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
6078 sop->op_private |= OPpREFCOUNTED;
6079 /* re_dup will OpREFCNT_inc */
6080 OpREFCNT_set(sop, 1);
569233ed 6081 LEAVE;
c277df42 6082
830247a4 6083 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
6084 RExC_rxi->data->data[n] = (void*)rop;
6085 RExC_rxi->data->data[n+1] = (void*)sop;
6086 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 6087 SvREFCNT_dec(sv);
a0ed51b3 6088 }
e24b16f9 6089 else { /* First pass */
830247a4 6090 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 6091 && IN_PERL_RUNTIME)
2cd61cdb
IZ
6092 /* No compiled RE interpolated, has runtime
6093 components ===> unsafe. */
6094 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 6095 if (PL_tainting && PL_tainted)
cc6b7395 6096 FAIL("Eval-group in insecure regular expression");
54df2634 6097#if PERL_VERSION > 8
923e4eb5 6098 if (IN_PERL_COMPILETIME)
b5c19bd7 6099 PL_cv_has_eval = 1;
54df2634 6100#endif
c277df42 6101 }
b5c19bd7 6102
830247a4 6103 nextchar(pRExC_state);
6136c704 6104 if (is_logical) {
830247a4 6105 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6106 if (!SIZE_ONLY)
6107 ret->flags = 2;
3dab1dad 6108 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 6109 /* deal with the length of this later - MJD */
0f5d15d6
IZ
6110 return ret;
6111 }
ccb2c380
MP
6112 ret = reganode(pRExC_state, EVAL, n);
6113 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6114 Set_Node_Offset(ret, parse_start);
6115 return ret;
c277df42 6116 }
fac92740 6117 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 6118 {
0a4db386 6119 int is_define= 0;
fac92740 6120 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
6121 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6122 || RExC_parse[1] == '<'
830247a4 6123 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
6124 I32 flag;
6125
830247a4 6126 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
6127 if (!SIZE_ONLY)
6128 ret->flags = 1;
3dab1dad 6129 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 6130 goto insert_if;
b81d288d 6131 }
a0ed51b3 6132 }
0a4db386
YO
6133 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6134 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6135 {
6136 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6137 char *name_start= RExC_parse++;
2eccd3b2 6138 U32 num = 0;
0a4db386
YO
6139 SV *sv_dat=reg_scan_name(pRExC_state,
6140 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6141 if (RExC_parse == name_start || *RExC_parse != ch)
6142 vFAIL2("Sequence (?(%c... not terminated",
6143 (ch == '>' ? '<' : ch));
6144 RExC_parse++;
6145 if (!SIZE_ONLY) {
6146 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 6147 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 6148 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
6149 }
6150 ret = reganode(pRExC_state,NGROUPP,num);
6151 goto insert_if_check_paren;
6152 }
6153 else if (RExC_parse[0] == 'D' &&
6154 RExC_parse[1] == 'E' &&
6155 RExC_parse[2] == 'F' &&
6156 RExC_parse[3] == 'I' &&
6157 RExC_parse[4] == 'N' &&
6158 RExC_parse[5] == 'E')
6159 {
6160 ret = reganode(pRExC_state,DEFINEP,0);
6161 RExC_parse +=6 ;
6162 is_define = 1;
6163 goto insert_if_check_paren;
6164 }
6165 else if (RExC_parse[0] == 'R') {
6166 RExC_parse++;
6167 parno = 0;
6168 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6169 parno = atoi(RExC_parse++);
6170 while (isDIGIT(*RExC_parse))
6171 RExC_parse++;
6172 } else if (RExC_parse[0] == '&') {
6173 SV *sv_dat;
6174 RExC_parse++;
6175 sv_dat = reg_scan_name(pRExC_state,
6176 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6177 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6178 }
1a147d38 6179 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
6180 goto insert_if_check_paren;
6181 }
830247a4 6182 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 6183 /* (?(1)...) */
6136c704 6184 char c;
830247a4 6185 parno = atoi(RExC_parse++);
c277df42 6186
830247a4
IZ
6187 while (isDIGIT(*RExC_parse))
6188 RExC_parse++;
fac92740 6189 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 6190
0a4db386 6191 insert_if_check_paren:
830247a4 6192 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 6193 vFAIL("Switch condition not recognized");
c277df42 6194 insert_if:
3dab1dad
YO
6195 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6196 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6197 if (br == NULL)
830247a4 6198 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6199 else
3dab1dad 6200 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6201 c = *nextchar(pRExC_state);
d1b80229
IZ
6202 if (flags&HASWIDTH)
6203 *flagp |= HASWIDTH;
c277df42 6204 if (c == '|') {
0a4db386
YO
6205 if (is_define)
6206 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6207 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6208 regbranch(pRExC_state, &flags, 1,depth+1);
6209 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6210 if (flags&HASWIDTH)
6211 *flagp |= HASWIDTH;
830247a4 6212 c = *nextchar(pRExC_state);
a0ed51b3
LW
6213 }
6214 else
c277df42
IZ
6215 lastbr = NULL;
6216 if (c != ')')
8615cb43 6217 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6218 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6219 REGTAIL(pRExC_state, br, ender);
c277df42 6220 if (lastbr) {
3dab1dad
YO
6221 REGTAIL(pRExC_state, lastbr, ender);
6222 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6223 }
6224 else
3dab1dad 6225 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6226 RExC_size++; /* XXX WHY do we need this?!!
6227 For large programs it seems to be required
6228 but I can't figure out why. -- dmq*/
c277df42 6229 return ret;
a0ed51b3
LW
6230 }
6231 else {
830247a4 6232 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6233 }
6234 }
1b1626e4 6235 case 0:
830247a4 6236 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6237 vFAIL("Sequence (? incomplete");
1b1626e4 6238 break;
85508812
KW
6239 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6240 that follow */
fb85c044
KW
6241 has_use_defaults = TRUE;
6242 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9de15fec 6243 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
fb85c044 6244 goto parse_flags;
a0d0e21e 6245 default:
cde0cee5
YO
6246 --RExC_parse;
6247 parse_flags: /* (?i) */
6248 {
6249 U32 posflags = 0, negflags = 0;
6250 U32 *flagsp = &posflags;
9de15fec 6251 bool has_charset_modifier = 0;
cde0cee5
YO
6252
6253 while (*RExC_parse) {
6254 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6255 /* (?g), (?gc) and (?o) are useless here
6256 and must be globally applied -- japhy */
cde0cee5
YO
6257 switch (*RExC_parse) {
6258 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec
KW
6259 case LOCALE_PAT_MOD:
6260 if (has_charset_modifier || flagsp == &negflags) {
6261 goto fail_modifiers;
6262 }
8cc86590
KW
6263 posflags |= RXf_PMf_LOCALE;
6264 negflags |= RXf_PMf_UNICODE;
9de15fec
KW
6265 has_charset_modifier = 1;
6266 break;
6267 case UNICODE_PAT_MOD:
6268 if (has_charset_modifier || flagsp == &negflags) {
6269 goto fail_modifiers;
6270 }
8cc86590
KW
6271 posflags |= RXf_PMf_UNICODE;
6272 negflags |= RXf_PMf_LOCALE;
9de15fec
KW
6273 has_charset_modifier = 1;
6274 break;
6275 case DUAL_PAT_MOD:
6276 if (has_use_defaults
6277 || has_charset_modifier
6278 || flagsp == &negflags)
6279 {
6280 goto fail_modifiers;
6281 }
8cc86590 6282 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
9de15fec
KW
6283 has_charset_modifier = 1;
6284 break;
f7819f85
A
6285 case ONCE_PAT_MOD: /* 'o' */
6286 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6287 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6288 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6289 if (! (wastedflags & wflagbit) ) {
6290 wastedflags |= wflagbit;
6291 vWARN5(
6292 RExC_parse + 1,
6293 "Useless (%s%c) - %suse /%c modifier",
6294 flagsp == &negflags ? "?-" : "?",
6295 *RExC_parse,
6296 flagsp == &negflags ? "don't " : "",
6297 *RExC_parse
6298 );
6299 }
6300 }
cde0cee5
YO
6301 break;
6302
f7819f85 6303 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6304 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6305 if (! (wastedflags & WASTED_C) ) {
6306 wastedflags |= WASTED_GC;
9d1d55b5
JP
6307 vWARN3(
6308 RExC_parse + 1,
6309 "Useless (%sc) - %suse /gc modifier",
6310 flagsp == &negflags ? "?-" : "?",
6311 flagsp == &negflags ? "don't " : ""
6312 );
6313 }
6314 }
cde0cee5 6315 break;
f7819f85 6316 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6317 if (flagsp == &negflags) {
668c081a
NC
6318 if (SIZE_ONLY)
6319 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6320 } else {
6321 *flagsp |= RXf_PMf_KEEPCOPY;
6322 }
6323 break;
6324 case '-':
3b753521 6325 /* A flag is a default iff it is following a minus, so
fb85c044
KW
6326 * if there is a minus, it means will be trying to
6327 * re-specify a default which is an error */
6328 if (has_use_defaults || flagsp == &negflags) {
9de15fec 6329 fail_modifiers:
57b84237
YO
6330 RExC_parse++;
6331 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6332 /*NOTREACHED*/
6333 }
cde0cee5
YO
6334 flagsp = &negflags;
6335 wastedflags = 0; /* reset so (?g-c) warns twice */
6336 break;
6337 case ':':
6338 paren = ':';
6339 /*FALLTHROUGH*/
6340 case ')':
6341 RExC_flags |= posflags;
6342 RExC_flags &= ~negflags;
f7819f85
A
6343 if (paren != ':') {
6344 oregflags |= posflags;
6345 oregflags &= ~negflags;
6346 }
cde0cee5
YO
6347 nextchar(pRExC_state);
6348 if (paren != ':') {
6349 *flagp = TRYAGAIN;
6350 return NULL;
6351 } else {
6352 ret = NULL;
6353 goto parse_rest;
6354 }
6355 /*NOTREACHED*/
6356 default:
cde0cee5
YO
6357 RExC_parse++;
6358 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6359 /*NOTREACHED*/
6360 }
830247a4 6361 ++RExC_parse;
48c036b1 6362 }
cde0cee5 6363 }} /* one for the default block, one for the switch */
a0d0e21e 6364 }
fac92740 6365 else { /* (...) */
81714fb9 6366 capturing_parens:
830247a4
IZ
6367 parno = RExC_npar;
6368 RExC_npar++;
e2e6a0f1 6369
830247a4 6370 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6371 if (!SIZE_ONLY ){
6372 if (!RExC_nestroot)
6373 RExC_nestroot = parno;
c009da3d
YO
6374 if (RExC_seen & REG_SEEN_RECURSE
6375 && !RExC_open_parens[parno-1])
6376 {
e2e6a0f1 6377 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6378 "Setting open paren #%"IVdf" to %d\n",
6379 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6380 RExC_open_parens[parno-1]= ret;
6381 }
6bda09f9 6382 }
fac92740
MJD
6383 Set_Node_Length(ret, 1); /* MJD */
6384 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6385 is_open = 1;
a0d0e21e 6386 }
a0ed51b3 6387 }
fac92740 6388 else /* ! paren */
a0d0e21e 6389 ret = NULL;
cde0cee5
YO
6390
6391 parse_rest:
a0d0e21e 6392 /* Pick up the branches, linking them together. */
fac92740 6393 parse_start = RExC_parse; /* MJD */
3dab1dad 6394 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6395
6396 if (freeze_paren) {
6397 if (RExC_npar > after_freeze)
6398 after_freeze = RExC_npar;
6399 RExC_npar = freeze_paren;
6400 }
6401
fac92740 6402 /* branch_len = (paren != 0); */
2af232bd 6403
a0d0e21e
LW
6404 if (br == NULL)
6405 return(NULL);
830247a4
IZ
6406 if (*RExC_parse == '|') {
6407 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6408 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6409 }
fac92740 6410 else { /* MJD */
6bda09f9 6411 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6412 Set_Node_Length(br, paren != 0);
6413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6414 }
c277df42
IZ
6415 have_branch = 1;
6416 if (SIZE_ONLY)
830247a4 6417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6418 }
6419 else if (paren == ':') {
c277df42
IZ
6420 *flagp |= flags&SIMPLE;
6421 }
6136c704 6422 if (is_open) { /* Starts with OPEN. */
3dab1dad 6423 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6424 }
6425 else if (paren != '?') /* Not Conditional */
a0d0e21e 6426 ret = br;
8ae10a67 6427 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6428 lastbr = br;
830247a4
IZ
6429 while (*RExC_parse == '|') {
6430 if (!SIZE_ONLY && RExC_extralen) {
6431 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6432 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6433 }
6434 if (SIZE_ONLY)
830247a4
IZ
6435 RExC_extralen += 2; /* Account for LONGJMP. */
6436 nextchar(pRExC_state);
594d7033
YO
6437 if (freeze_paren) {
6438 if (RExC_npar > after_freeze)
6439 after_freeze = RExC_npar;
6440 RExC_npar = freeze_paren;
6441 }
3dab1dad 6442 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6443
a687059c 6444 if (br == NULL)
a0d0e21e 6445 return(NULL);
3dab1dad 6446 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6447 lastbr = br;
8ae10a67 6448 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6449 }
6450
c277df42
IZ
6451 if (have_branch || paren != ':') {
6452 /* Make a closing node, and hook it on the end. */
6453 switch (paren) {
6454 case ':':
830247a4 6455 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6456 break;
6457 case 1:
830247a4 6458 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6459 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6460 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6461 "Setting close paren #%"IVdf" to %d\n",
6462 (IV)parno, REG_NODE_NUM(ender)));
6463 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6464 if (RExC_nestroot == parno)
6465 RExC_nestroot = 0;
40d049e4 6466 }
fac92740
MJD
6467 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6468 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6469 break;
6470 case '<':
c277df42
IZ
6471 case ',':
6472 case '=':
6473 case '!':
c277df42 6474 *flagp &= ~HASWIDTH;
821b33a5
IZ
6475 /* FALL THROUGH */
6476 case '>':
830247a4 6477 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6478 break;
6479 case 0:
830247a4 6480 ender = reg_node(pRExC_state, END);
40d049e4
YO
6481 if (!SIZE_ONLY) {
6482 assert(!RExC_opend); /* there can only be one! */
6483 RExC_opend = ender;
6484 }
c277df42
IZ
6485 break;
6486 }
eaf3ca90 6487 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6488
9674d46a 6489 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6490 if (depth==1)
6491 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6492
c277df42 6493 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6494 for (br = ret; br; br = regnext(br)) {
6495 const U8 op = PL_regkind[OP(br)];
6496 if (op == BRANCH) {
07be1b83 6497 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6498 }
6499 else if (op == BRANCHJ) {
07be1b83 6500 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6501 }
c277df42
IZ
6502 }
6503 }
a0d0e21e 6504 }
c277df42
IZ
6505
6506 {
e1ec3a88
AL
6507 const char *p;
6508 static const char parens[] = "=!<,>";
c277df42
IZ
6509
6510 if (paren && (p = strchr(parens, paren))) {
eb160463 6511 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6512 int flag = (p - parens) > 1;
6513
6514 if (paren == '>')
6515 node = SUSPEND, flag = 0;
6bda09f9 6516 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6517 Set_Node_Cur_Length(ret);
6518 Set_Node_Offset(ret, parse_start + 1);
c277df42 6519 ret->flags = flag;
07be1b83 6520 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6521 }
a0d0e21e
LW
6522 }
6523
6524 /* Check for proper termination. */
ce3e6498 6525 if (paren) {
e2509266 6526 RExC_flags = oregflags;
830247a4
IZ
6527 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6528 RExC_parse = oregcomp_parse;
380a0633 6529 vFAIL("Unmatched (");
ce3e6498 6530 }
a0ed51b3 6531 }
830247a4
IZ
6532 else if (!paren && RExC_parse < RExC_end) {
6533 if (*RExC_parse == ')') {
6534 RExC_parse++;
380a0633 6535 vFAIL("Unmatched )");
a0ed51b3
LW
6536 }
6537 else
b45f050a 6538 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6539 /* NOTREACHED */
6540 }
594d7033
YO
6541 if (after_freeze)
6542 RExC_npar = after_freeze;
a0d0e21e 6543 return(ret);
a687059c
LW
6544}
6545
6546/*
6547 - regbranch - one alternative of an | operator
6548 *
6549 * Implements the concatenation operator.
6550 */
76e3520e 6551STATIC regnode *
3dab1dad 6552S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6553{
97aff369 6554 dVAR;
c277df42
IZ
6555 register regnode *ret;
6556 register regnode *chain = NULL;
6557 register regnode *latest;
6558 I32 flags = 0, c = 0;
3dab1dad 6559 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6560
6561 PERL_ARGS_ASSERT_REGBRANCH;
6562
3dab1dad 6563 DEBUG_PARSE("brnc");
02daf0ab 6564
b81d288d 6565 if (first)
c277df42
IZ
6566 ret = NULL;
6567 else {
b81d288d 6568 if (!SIZE_ONLY && RExC_extralen)
830247a4 6569 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6570 else {
830247a4 6571 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6572 Set_Node_Length(ret, 1);
6573 }
c277df42
IZ
6574 }
6575
b81d288d 6576 if (!first && SIZE_ONLY)
830247a4 6577 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6578
c277df42 6579 *flagp = WORST; /* Tentatively. */
a0d0e21e 6580
830247a4
IZ
6581 RExC_parse--;
6582 nextchar(pRExC_state);
6583 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6584 flags &= ~TRYAGAIN;
3dab1dad 6585 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6586 if (latest == NULL) {
6587 if (flags & TRYAGAIN)
6588 continue;
6589 return(NULL);
a0ed51b3
LW
6590 }
6591 else if (ret == NULL)
c277df42 6592 ret = latest;
8ae10a67 6593 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6594 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6595 *flagp |= flags&SPSTART;
6596 else {
830247a4 6597 RExC_naughty++;
3dab1dad 6598 REGTAIL(pRExC_state, chain, latest);
a687059c 6599 }
a0d0e21e 6600 chain = latest;
c277df42
IZ
6601 c++;
6602 }
6603 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6604 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6605 if (ret == NULL)
6606 ret = chain;
6607 }
6608 if (c == 1) {
6609 *flagp |= flags&SIMPLE;
a0d0e21e 6610 }
a687059c 6611
d4c19fe8 6612 return ret;
a687059c
LW
6613}
6614
6615/*
6616 - regpiece - something followed by possible [*+?]
6617 *
6618 * Note that the branching code sequences used for ? and the general cases
6619 * of * and + are somewhat optimized: they use the same NOTHING node as
6620 * both the endmarker for their branch list and the body of the last branch.
6621 * It might seem that this node could be dispensed with entirely, but the
6622 * endmarker role is not redundant.
6623 */
76e3520e 6624STATIC regnode *
3dab1dad 6625S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6626{
97aff369 6627 dVAR;
c277df42 6628 register regnode *ret;
a0d0e21e
LW
6629 register char op;
6630 register char *next;
6631 I32 flags;
1df70142 6632 const char * const origparse = RExC_parse;
a0d0e21e 6633 I32 min;
c277df42 6634 I32 max = REG_INFTY;
fac92740 6635 char *parse_start;
10edeb5d 6636 const char *maxpos = NULL;
3dab1dad 6637 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6638
6639 PERL_ARGS_ASSERT_REGPIECE;
6640
3dab1dad 6641 DEBUG_PARSE("piec");
a0d0e21e 6642
3dab1dad 6643 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6644 if (ret == NULL) {
6645 if (flags & TRYAGAIN)
6646 *flagp |= TRYAGAIN;
6647 return(NULL);
6648 }
6649
830247a4 6650 op = *RExC_parse;
a0d0e21e 6651
830247a4 6652 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6653 maxpos = NULL;
fac92740 6654 parse_start = RExC_parse; /* MJD */
830247a4 6655 next = RExC_parse + 1;
a0d0e21e
LW
6656 while (isDIGIT(*next) || *next == ',') {
6657 if (*next == ',') {
6658 if (maxpos)
6659 break;
6660 else
6661 maxpos = next;
a687059c 6662 }
a0d0e21e
LW
6663 next++;
6664 }
6665 if (*next == '}') { /* got one */
6666 if (!maxpos)
6667 maxpos = next;
830247a4
IZ
6668 RExC_parse++;
6669 min = atoi(RExC_parse);
a0d0e21e
LW
6670 if (*maxpos == ',')
6671 maxpos++;
6672 else
830247a4 6673 maxpos = RExC_parse;
a0d0e21e
LW
6674 max = atoi(maxpos);
6675 if (!max && *maxpos != '0')
c277df42
IZ
6676 max = REG_INFTY; /* meaning "infinity" */
6677 else if (max >= REG_INFTY)
8615cb43 6678 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6679 RExC_parse = next;
6680 nextchar(pRExC_state);
a0d0e21e
LW
6681
6682 do_curly:
6683 if ((flags&SIMPLE)) {
830247a4 6684 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6685 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6686 Set_Node_Offset(ret, parse_start+1); /* MJD */
6687 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6688 }
6689 else {
3dab1dad 6690 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6691
6692 w->flags = 0;
3dab1dad 6693 REGTAIL(pRExC_state, ret, w);
830247a4 6694 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6695 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6696 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6697 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6698 }
6bda09f9 6699 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6700 /* MJD hk */
6701 Set_Node_Offset(ret, parse_start+1);
2af232bd 6702 Set_Node_Length(ret,
fac92740 6703 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6704
830247a4 6705 if (!SIZE_ONLY && RExC_extralen)
c277df42 6706 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6707 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6708 if (SIZE_ONLY)
830247a4
IZ
6709 RExC_whilem_seen++, RExC_extralen += 3;
6710 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6711 }
c277df42 6712 ret->flags = 0;
a0d0e21e
LW
6713
6714 if (min > 0)
821b33a5
IZ
6715 *flagp = WORST;
6716 if (max > 0)
6717 *flagp |= HASWIDTH;
8fa23287 6718 if (max < min)
8615cb43 6719 vFAIL("Can't do {n,m} with n > m");
c277df42 6720 if (!SIZE_ONLY) {
eb160463
GS
6721 ARG1_SET(ret, (U16)min);
6722 ARG2_SET(ret, (U16)max);
a687059c 6723 }
a687059c 6724
a0d0e21e 6725 goto nest_check;
a687059c 6726 }
a0d0e21e 6727 }
a687059c 6728
a0d0e21e
LW
6729 if (!ISMULT1(op)) {
6730 *flagp = flags;
a687059c 6731 return(ret);
a0d0e21e 6732 }
bb20fd44 6733
c277df42 6734#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6735
6736 /* if this is reinstated, don't forget to put this back into perldiag:
6737
6738 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6739
6740 (F) The part of the regexp subject to either the * or + quantifier
6741 could match an empty string. The {#} shows in the regular
6742 expression about where the problem was discovered.
6743
6744 */
6745
bb20fd44 6746 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6747 vFAIL("Regexp *+ operand could be empty");
b81d288d 6748#endif
bb20fd44 6749
fac92740 6750 parse_start = RExC_parse;
830247a4 6751 nextchar(pRExC_state);
a0d0e21e 6752
821b33a5 6753 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6754
6755 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6756 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6757 ret->flags = 0;
830247a4 6758 RExC_naughty += 4;
a0d0e21e
LW
6759 }
6760 else if (op == '*') {
6761 min = 0;
6762 goto do_curly;
a0ed51b3
LW
6763 }
6764 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6765 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6766 ret->flags = 0;
830247a4 6767 RExC_naughty += 3;
a0d0e21e
LW
6768 }
6769 else if (op == '+') {
6770 min = 1;
6771 goto do_curly;
a0ed51b3
LW
6772 }
6773 else if (op == '?') {
a0d0e21e
LW
6774 min = 0; max = 1;
6775 goto do_curly;
6776 }
6777 nest_check:
668c081a
NC
6778 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6779 ckWARN3reg(RExC_parse,
6780 "%.*s matches null string many times",
6781 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6782 origparse);
a0d0e21e
LW
6783 }
6784
b9b4dddf 6785 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6786 nextchar(pRExC_state);
6bda09f9 6787 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6788 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6789 }
b9b4dddf
YO
6790#ifndef REG_ALLOW_MINMOD_SUSPEND
6791 else
6792#endif
6793 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6794 regnode *ender;
6795 nextchar(pRExC_state);
6796 ender = reg_node(pRExC_state, SUCCEED);
6797 REGTAIL(pRExC_state, ret, ender);
6798 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6799 ret->flags = 0;
6800 ender = reg_node(pRExC_state, TAIL);
6801 REGTAIL(pRExC_state, ret, ender);
6802 /*ret= ender;*/
6803 }
6804
6805 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6806 RExC_parse++;
b45f050a
JF
6807 vFAIL("Nested quantifiers");
6808 }
a0d0e21e
LW
6809
6810 return(ret);
a687059c
LW
6811}
6812
fc8cd66c
YO
6813
6814/* reg_namedseq(pRExC_state,UVp)
6815
6816 This is expected to be called by a parser routine that has
afefe6bf 6817 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6818 expected to point at the first char following the N at the time
6819 of the call.
ff3f963a
KW
6820
6821 The \N may be inside (indicated by valuep not being NULL) or outside a
6822 character class.
6823
6824 \N may begin either a named sequence, or if outside a character class, mean
6825 to match a non-newline. For non single-quoted regexes, the tokenizer has
6826 attempted to decide which, and in the case of a named sequence converted it
6827 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6828 where c1... are the characters in the sequence. For single-quoted regexes,
6829 the tokenizer passes the \N sequence through unchanged; this code will not
6830 attempt to determine this nor expand those. The net effect is that if the
6831 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6832 signals that this \N occurrence means to match a non-newline.
6833
6834 Only the \N{U+...} form should occur in a character class, for the same
6835 reason that '.' inside a character class means to just match a period: it
6836 just doesn't make sense.
fc8cd66c
YO
6837
6838 If valuep is non-null then it is assumed that we are parsing inside
6839 of a charclass definition and the first codepoint in the resolved
6840 string is returned via *valuep and the routine will return NULL.
6841 In this mode if a multichar string is returned from the charnames
ff3f963a 6842 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6843 sequence will be examined. If the string returned is zero length
6844 then the value of *valuep is undefined and NON-NULL will
6845 be returned to indicate failure. (This will NOT be a valid pointer
6846 to a regnode.)
6847
ff3f963a
KW
6848 If valuep is null then it is assumed that we are parsing normal text and a
6849 new EXACT node is inserted into the program containing the resolved string,
6850 and a pointer to the new node is returned. But if the string is zero length
6851 a NOTHING node is emitted instead.
afefe6bf 6852
fc8cd66c 6853 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6854 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6855 */
6856STATIC regnode *
afefe6bf 6857S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6858{
c3c41406 6859 char * endbrace; /* '}' following the name */
fc8cd66c 6860 regnode *ret = NULL;
ff3f963a
KW
6861#ifdef DEBUGGING
6862 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6863#endif
c3c41406 6864 char* p;
ff3f963a
KW
6865
6866 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6867
6868 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6869
6870 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6871
6872 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6873 * modifier. The other meaning does not */
6874 p = (RExC_flags & RXf_PMf_EXTENDED)
6875 ? regwhite( pRExC_state, RExC_parse )
6876 : RExC_parse;
7918f24d 6877
ff3f963a 6878 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6879 * [^\n]. The former is assumed when it can't be the latter. */
6880 if (*p != '{' || regcurly(p)) {
6881 RExC_parse = p;
ff3f963a 6882 if (valuep) {
afefe6bf 6883 /* no bare \N in a charclass */
ff3f963a
KW
6884 vFAIL("\\N in a character class must be a named character: \\N{...}");
6885 }
afefe6bf
RGS
6886 nextchar(pRExC_state);
6887 ret = reg_node(pRExC_state, REG_ANY);
6888 *flagp |= HASWIDTH|SIMPLE;
6889 RExC_naughty++;
6890 RExC_parse--;
6891 Set_Node_Length(ret, 1); /* MJD */
6892 return ret;
fc8cd66c 6893 }
a4893424 6894
c3c41406
KW
6895 /* Here, we have decided it should be a named sequence */
6896
6897 /* The test above made sure that the next real character is a '{', but
6898 * under the /x modifier, it could be separated by space (or a comment and
6899 * \n) and this is not allowed (for consistency with \x{...} and the
6900 * tokenizer handling of \N{NAME}). */
6901 if (*RExC_parse != '{') {
6902 vFAIL("Missing braces on \\N{}");
6903 }
6904
ff3f963a 6905 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6906
6907 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6908 || ! (endbrace == RExC_parse /* nothing between the {} */
6909 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6910 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6911 {
6912 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6913 vFAIL("\\N{NAME} must be resolved by the lexer");
6914 }
6915
ff3f963a
KW
6916 if (endbrace == RExC_parse) { /* empty: \N{} */
6917 if (! valuep) {
6918 RExC_parse = endbrace + 1;
6919 return reg_node(pRExC_state,NOTHING);
a4893424 6920 }
fc8cd66c 6921
ff3f963a
KW
6922 if (SIZE_ONLY) {
6923 ckWARNreg(RExC_parse,
6924 "Ignoring zero length \\N{} in character class"
6925 );
6926 RExC_parse = endbrace + 1;
6927 }
6928 *valuep = 0;
6929 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6930 }
ff3f963a 6931
62fed28b 6932 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
6933 RExC_parse += 2; /* Skip past the 'U+' */
6934
6935 if (valuep) { /* In a bracketed char class */
6936 /* We only pay attention to the first char of
6937 multichar strings being returned. I kinda wonder
6938 if this makes sense as it does change the behaviour
6939 from earlier versions, OTOH that behaviour was broken
6940 as well. XXX Solution is to recharacterize as
6941 [rest-of-class]|multi1|multi2... */
6942
6943 STRLEN length_of_hex;
6944 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6945 | PERL_SCAN_DISALLOW_PREFIX
6946 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6947
37820adc
KW
6948 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6949 if (endchar < endbrace) {
ff3f963a
KW
6950 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6951 }
ff3f963a
KW
6952
6953 length_of_hex = (STRLEN)(endchar - RExC_parse);
6954 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6955
6956 /* The tokenizer should have guaranteed validity, but it's possible to
6957 * bypass it by using single quoting, so check */
c3c41406
KW
6958 if (length_of_hex == 0
6959 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6960 {
6961 RExC_parse += length_of_hex; /* Includes all the valid */
6962 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6963 ? UTF8SKIP(RExC_parse)
6964 : 1;
6965 /* Guard against malformed utf8 */
6966 if (RExC_parse >= endchar) RExC_parse = endchar;
6967 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6968 }
6969
6970 RExC_parse = endbrace + 1;
6971 if (endchar == endbrace) return NULL;
6972
6973 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6974 }
ff3f963a
KW
6975 else { /* Not a char class */
6976 char *s; /* String to put in generated EXACT node */
fda99bee 6977 STRLEN len = 0; /* Its current byte length */
ff3f963a
KW
6978 char *endchar; /* Points to '.' or '}' ending cur char in the input
6979 stream */
6980
6981 ret = reg_node(pRExC_state,
6982 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6983 s= STRING(ret);
6984
6985 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6986 * the input which is of the form now 'c1.c2.c3...}' until find the
fda99bee 6987 * ending brace or exceed length 255. The characters that exceed this
ff3f963a
KW
6988 * limit are dropped. The limit could be relaxed should it become
6989 * desirable by reparsing this as (?:\N{NAME}), so could generate
6990 * multiple EXACT nodes, as is done for just regular input. But this
6991 * is primarily a named character, and not intended to be a huge long
6992 * string, so 255 bytes should be good enough */
6993 while (1) {
c3c41406 6994 STRLEN length_of_hex;
ff3f963a
KW
6995 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6996 | PERL_SCAN_DISALLOW_PREFIX
6997 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6998 UV cp; /* Ord of current character */
6999
7000 /* Code points are separated by dots. If none, there is only one
7001 * code point, and is terminated by the brace */
37820adc 7002 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a
KW
7003
7004 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
7005 length_of_hex = (STRLEN)(endchar - RExC_parse);
7006 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7007 if ( length_of_hex == 0
7008 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 7009 {
c3c41406
KW
7010 RExC_parse += length_of_hex; /* Includes all the valid */
7011 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7012 ? UTF8SKIP(RExC_parse)
7013 : 1;
7014 /* Guard against malformed utf8 */
7015 if (RExC_parse >= endchar) RExC_parse = endchar;
7016 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
7017 }
7018
7019 if (! FOLD) { /* Not folding, just append to the string */
7020 STRLEN unilen;
7021
7022 /* Quit before adding this character if would exceed limit */
7023 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 7024
ff3f963a
KW
7025 unilen = reguni(pRExC_state, cp, s);
7026 if (unilen > 0) {
7027 s += unilen;
7028 len += unilen;
7029 }
7030 } else { /* Folding, output the folded equivalent */
7031 STRLEN foldlen,numlen;
7032 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7033 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7034
7035 /* Quit before exceeding size limit */
7036 if (len + foldlen > U8_MAX) break;
7037
7038 for (foldbuf = tmpbuf;
7039 foldlen;
7040 foldlen -= numlen)
7041 {
7042 cp = utf8_to_uvchr(foldbuf, &numlen);
7043 if (numlen > 0) {
7044 const STRLEN unilen = reguni(pRExC_state, cp, s);
7045 s += unilen;
7046 len += unilen;
7047 /* In EBCDIC the numlen and unilen can differ. */
7048 foldbuf += numlen;
7049 if (numlen >= foldlen)
7050 break;
7051 }
7052 else
7053 break; /* "Can't happen." */
7054 }
7055 }
7056
7057 /* Point to the beginning of the next character in the sequence. */
7058 RExC_parse = endchar + 1;
7059
7060 /* Quit if no more characters */
7061 if (RExC_parse >= endbrace) break;
7062 }
7063
7064
7065 if (SIZE_ONLY) {
7066 if (RExC_parse < endbrace) {
7067 ckWARNreg(RExC_parse - 1,
7068 "Using just the first characters returned by \\N{}");
7069 }
7070
7071 RExC_size += STR_SZ(len);
7072 } else {
7073 STR_LEN(ret) = len;
7074 RExC_emit += STR_SZ(len);
7075 }
7076
7077 RExC_parse = endbrace + 1;
7078
7079 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7080 with malformed in t/re/pat_advanced.t */
7081 RExC_parse --;
7082 Set_Node_Cur_Length(ret); /* MJD */
7083 nextchar(pRExC_state);
7084 }
7085
7086 return ret;
fc8cd66c
YO
7087}
7088
7089
9e08bc66
TS
7090/*
7091 * reg_recode
7092 *
7093 * It returns the code point in utf8 for the value in *encp.
7094 * value: a code value in the source encoding
7095 * encp: a pointer to an Encode object
7096 *
7097 * If the result from Encode is not a single character,
7098 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7099 */
7100STATIC UV
7101S_reg_recode(pTHX_ const char value, SV **encp)
7102{
7103 STRLEN numlen = 1;
59cd0e26 7104 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 7105 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
7106 const STRLEN newlen = SvCUR(sv);
7107 UV uv = UNICODE_REPLACEMENT;
7108
7918f24d
NC
7109 PERL_ARGS_ASSERT_REG_RECODE;
7110
9e08bc66
TS
7111 if (newlen)
7112 uv = SvUTF8(sv)
7113 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7114 : *(U8*)s;
7115
7116 if (!newlen || numlen != newlen) {
7117 uv = UNICODE_REPLACEMENT;
c86f7df5 7118 *encp = NULL;
9e08bc66
TS
7119 }
7120 return uv;
7121}
7122
fc8cd66c 7123
a687059c
LW
7124/*
7125 - regatom - the lowest level
ee9b8eae
YO
7126
7127 Try to identify anything special at the start of the pattern. If there
7128 is, then handle it as required. This may involve generating a single regop,
7129 such as for an assertion; or it may involve recursing, such as to
7130 handle a () structure.
7131
7132 If the string doesn't start with something special then we gobble up
7133 as much literal text as we can.
7134
7135 Once we have been able to handle whatever type of thing started the
7136 sequence, we return.
7137
7138 Note: we have to be careful with escapes, as they can be both literal
7139 and special, and in the case of \10 and friends can either, depending
7140 on context. Specifically there are two seperate switches for handling
7141 escape sequences, with the one for handling literal escapes requiring
7142 a dummy entry for all of the special escapes that are actually handled
7143 by the other.
7144*/
7145
76e3520e 7146STATIC regnode *
3dab1dad 7147S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 7148{
97aff369 7149 dVAR;
cbbf8932 7150 register regnode *ret = NULL;
a0d0e21e 7151 I32 flags;
45948336 7152 char *parse_start = RExC_parse;
3dab1dad
YO
7153 GET_RE_DEBUG_FLAGS_DECL;
7154 DEBUG_PARSE("atom");
a0d0e21e
LW
7155 *flagp = WORST; /* Tentatively. */
7156
7918f24d 7157 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 7158
a0d0e21e 7159tryagain:
f9a79580 7160 switch ((U8)*RExC_parse) {
a0d0e21e 7161 case '^':
830247a4
IZ
7162 RExC_seen_zerolen++;
7163 nextchar(pRExC_state);
bbe252da 7164 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7165 ret = reg_node(pRExC_state, MBOL);
bbe252da 7166 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7167 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7168 else
830247a4 7169 ret = reg_node(pRExC_state, BOL);
fac92740 7170 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7171 break;
7172 case '$':
830247a4 7173 nextchar(pRExC_state);
b81d288d 7174 if (*RExC_parse)
830247a4 7175 RExC_seen_zerolen++;
bbe252da 7176 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 7177 ret = reg_node(pRExC_state, MEOL);
bbe252da 7178 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 7179 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7180 else
830247a4 7181 ret = reg_node(pRExC_state, EOL);
fac92740 7182 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7183 break;
7184 case '.':
830247a4 7185 nextchar(pRExC_state);
bbe252da 7186 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
7187 ret = reg_node(pRExC_state, SANY);
7188 else
7189 ret = reg_node(pRExC_state, REG_ANY);
7190 *flagp |= HASWIDTH|SIMPLE;
830247a4 7191 RExC_naughty++;
fac92740 7192 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
7193 break;
7194 case '[':
b45f050a 7195 {
3dab1dad
YO
7196 char * const oregcomp_parse = ++RExC_parse;
7197 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
7198 if (*RExC_parse != ']') {
7199 RExC_parse = oregcomp_parse;
b45f050a
JF
7200 vFAIL("Unmatched [");
7201 }
830247a4 7202 nextchar(pRExC_state);
a0d0e21e 7203 *flagp |= HASWIDTH|SIMPLE;
fac92740 7204 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 7205 break;
b45f050a 7206 }
a0d0e21e 7207 case '(':
830247a4 7208 nextchar(pRExC_state);
3dab1dad 7209 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 7210 if (ret == NULL) {
bf93d4cc 7211 if (flags & TRYAGAIN) {
830247a4 7212 if (RExC_parse == RExC_end) {
bf93d4cc
GS
7213 /* Make parent create an empty node if needed. */
7214 *flagp |= TRYAGAIN;
7215 return(NULL);
7216 }
a0d0e21e 7217 goto tryagain;
bf93d4cc 7218 }
a0d0e21e
LW
7219 return(NULL);
7220 }
a3b492c3 7221 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7222 break;
7223 case '|':
7224 case ')':
7225 if (flags & TRYAGAIN) {
7226 *flagp |= TRYAGAIN;
7227 return NULL;
7228 }
b45f050a 7229 vFAIL("Internal urp");
a0d0e21e
LW
7230 /* Supposed to be caught earlier. */
7231 break;
85afd4ae 7232 case '{':
830247a4
IZ
7233 if (!regcurly(RExC_parse)) {
7234 RExC_parse++;
85afd4ae
CS
7235 goto defchar;
7236 }
7237 /* FALL THROUGH */
a0d0e21e
LW
7238 case '?':
7239 case '+':
7240 case '*':
830247a4 7241 RExC_parse++;
b45f050a 7242 vFAIL("Quantifier follows nothing");
a0d0e21e 7243 break;
f9a79580
RGS
7244 case 0xDF:
7245 case 0xC3:
7246 case 0xCE:
a0a388a1 7247 do_foldchar:
56d400ed 7248 if (!LOC && FOLD) {
e64b1bd1 7249 U32 len,cp;
7cf3a6a3 7250 len=0; /* silence a spurious compiler warning */
56d400ed 7251 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7252 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7253 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7254 ret = reganode(pRExC_state, FOLDCHAR, cp);
7255 Set_Node_Length(ret, 1); /* MJD */
7256 nextchar(pRExC_state); /* kill whitespace under /x */
7257 return ret;
7258 }
7259 }
7260 goto outer_default;
a0d0e21e 7261 case '\\':
ee9b8eae
YO
7262 /* Special Escapes
7263
7264 This switch handles escape sequences that resolve to some kind
7265 of special regop and not to literal text. Escape sequnces that
7266 resolve to literal text are handled below in the switch marked
7267 "Literal Escapes".
7268
7269 Every entry in this switch *must* have a corresponding entry
7270 in the literal escape switch. However, the opposite is not
7271 required, as the default for this switch is to jump to the
7272 literal text handling code.
7273 */
a0a388a1
YO
7274 switch ((U8)*++RExC_parse) {
7275 case 0xDF:
7276 case 0xC3:
7277 case 0xCE:
7278 goto do_foldchar;
ee9b8eae 7279 /* Special Escapes */
a0d0e21e 7280 case 'A':
830247a4
IZ
7281 RExC_seen_zerolen++;
7282 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7283 *flagp |= SIMPLE;
ee9b8eae 7284 goto finish_meta_pat;
a0d0e21e 7285 case 'G':
830247a4
IZ
7286 ret = reg_node(pRExC_state, GPOS);
7287 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7288 *flagp |= SIMPLE;
ee9b8eae
YO
7289 goto finish_meta_pat;
7290 case 'K':
7291 RExC_seen_zerolen++;
7292 ret = reg_node(pRExC_state, KEEPS);
7293 *flagp |= SIMPLE;
37923168
RGS
7294 /* XXX:dmq : disabling in-place substitution seems to
7295 * be necessary here to avoid cases of memory corruption, as
7296 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7297 */
7298 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7299 goto finish_meta_pat;
a0d0e21e 7300 case 'Z':
830247a4 7301 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7302 *flagp |= SIMPLE;
a1917ab9 7303 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7304 goto finish_meta_pat;
b85d18e9 7305 case 'z':
830247a4 7306 ret = reg_node(pRExC_state, EOS);
b85d18e9 7307 *flagp |= SIMPLE;
830247a4 7308 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7309 goto finish_meta_pat;
4a2d328f 7310 case 'C':
f33976b4
DB
7311 ret = reg_node(pRExC_state, CANY);
7312 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7313 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7314 goto finish_meta_pat;
a0ed51b3 7315 case 'X':
830247a4 7316 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7317 *flagp |= HASWIDTH;
ee9b8eae 7318 goto finish_meta_pat;
a0d0e21e 7319 case 'w':
a12cf05f
KW
7320 if (LOC) {
7321 ret = reg_node(pRExC_state, (U8)(ALNUML));
7322 } else {
7323 ret = reg_node(pRExC_state, (U8)(ALNUM));
7324 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7325 }
a0d0e21e 7326 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7327 goto finish_meta_pat;
a0d0e21e 7328 case 'W':
a12cf05f
KW
7329 if (LOC) {
7330 ret = reg_node(pRExC_state, (U8)(NALNUML));
7331 } else {
7332 ret = reg_node(pRExC_state, (U8)(NALNUM));
7333 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7334 }
a0d0e21e 7335 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7336 goto finish_meta_pat;
a0d0e21e 7337 case 'b':
830247a4
IZ
7338 RExC_seen_zerolen++;
7339 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7340 if (LOC) {
7341 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7342 } else {
7343 ret = reg_node(pRExC_state, (U8)(BOUND));
7344 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7345 }
a0d0e21e 7346 *flagp |= SIMPLE;
ee9b8eae 7347 goto finish_meta_pat;
a0d0e21e 7348 case 'B':
830247a4
IZ
7349 RExC_seen_zerolen++;
7350 RExC_seen |= REG_SEEN_LOOKBEHIND;
a12cf05f
KW
7351 if (LOC) {
7352 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7353 } else {
7354 ret = reg_node(pRExC_state, (U8)(NBOUND));
7355 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7356 }
a0d0e21e 7357 *flagp |= SIMPLE;
ee9b8eae 7358 goto finish_meta_pat;
a0d0e21e 7359 case 's':
a12cf05f
KW
7360 if (LOC) {
7361 ret = reg_node(pRExC_state, (U8)(SPACEL));
7362 } else {
7363 ret = reg_node(pRExC_state, (U8)(SPACE));
7364 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7365 }
a0d0e21e 7366 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7367 goto finish_meta_pat;
a0d0e21e 7368 case 'S':
a12cf05f
KW
7369 if (LOC) {
7370 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7371 } else {
7372 ret = reg_node(pRExC_state, (U8)(NSPACE));
7373 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7374 }
a0d0e21e 7375 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7376 goto finish_meta_pat;
a0d0e21e 7377 case 'd':
ffc61ed2 7378 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7379 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7380 goto finish_meta_pat;
a0d0e21e 7381 case 'D':
ffc61ed2 7382 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7383 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7384 goto finish_meta_pat;
e1d1eefb
YO
7385 case 'R':
7386 ret = reg_node(pRExC_state, LNBREAK);
7387 *flagp |= HASWIDTH|SIMPLE;
7388 goto finish_meta_pat;
7389 case 'h':
7390 ret = reg_node(pRExC_state, HORIZWS);
7391 *flagp |= HASWIDTH|SIMPLE;
7392 goto finish_meta_pat;
7393 case 'H':
7394 ret = reg_node(pRExC_state, NHORIZWS);
7395 *flagp |= HASWIDTH|SIMPLE;
7396 goto finish_meta_pat;
ee9b8eae 7397 case 'v':
e1d1eefb
YO
7398 ret = reg_node(pRExC_state, VERTWS);
7399 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7400 goto finish_meta_pat;
7401 case 'V':
e1d1eefb
YO
7402 ret = reg_node(pRExC_state, NVERTWS);
7403 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7404 finish_meta_pat:
830247a4 7405 nextchar(pRExC_state);
fac92740 7406 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7407 break;
a14b48bc
LW
7408 case 'p':
7409 case 'P':
3568d838 7410 {
3dab1dad 7411 char* const oldregxend = RExC_end;
d008bc60 7412#ifdef DEBUGGING
ccb2c380 7413 char* parse_start = RExC_parse - 2;
d008bc60 7414#endif
a14b48bc 7415
830247a4 7416 if (RExC_parse[1] == '{') {
3568d838 7417 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7418 RExC_end = strchr(RExC_parse, '}');
7419 if (!RExC_end) {
3dab1dad 7420 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7421 RExC_parse += 2;
7422 RExC_end = oldregxend;
0da60cf5 7423 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7424 }
830247a4 7425 RExC_end++;
a14b48bc 7426 }
af6f566e 7427 else {
830247a4 7428 RExC_end = RExC_parse + 2;
af6f566e
HS
7429 if (RExC_end > oldregxend)
7430 RExC_end = oldregxend;
7431 }
830247a4 7432 RExC_parse--;
a14b48bc 7433
3dab1dad 7434 ret = regclass(pRExC_state,depth+1);
a14b48bc 7435
830247a4
IZ
7436 RExC_end = oldregxend;
7437 RExC_parse--;
ccb2c380
MP
7438
7439 Set_Node_Offset(ret, parse_start + 2);
7440 Set_Node_Cur_Length(ret);
830247a4 7441 nextchar(pRExC_state);
a14b48bc
LW
7442 *flagp |= HASWIDTH|SIMPLE;
7443 }
7444 break;
fc8cd66c 7445 case 'N':
afefe6bf 7446 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7447 multicharacter. join_exact() will join them up later on.
7448 Also this makes sure that things like /\N{BLAH}+/ and
7449 \N{BLAH} being multi char Just Happen. dmq*/
7450 ++RExC_parse;
afefe6bf 7451 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7452 break;
0a4db386 7453 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7454 parse_named_seq:
81714fb9
YO
7455 {
7456 char ch= RExC_parse[1];
1f1031fe
YO
7457 if (ch != '<' && ch != '\'' && ch != '{') {
7458 RExC_parse++;
7459 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7460 } else {
1f1031fe
YO
7461 /* this pretty much dupes the code for (?P=...) in reg(), if
7462 you change this make sure you change that */
81714fb9 7463 char* name_start = (RExC_parse += 2);
2eccd3b2 7464 U32 num = 0;
0a4db386
YO
7465 SV *sv_dat = reg_scan_name(pRExC_state,
7466 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7467 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7468 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7469 vFAIL2("Sequence %.3s... not terminated",parse_start);
7470
7471 if (!SIZE_ONLY) {
7472 num = add_data( pRExC_state, 1, "S" );
7473 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7474 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7475 }
7476
81714fb9
YO
7477 RExC_sawback = 1;
7478 ret = reganode(pRExC_state,
7479 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7480 num);
7481 *flagp |= HASWIDTH;
1f1031fe 7482
81714fb9
YO
7483 /* override incorrect value set in reganode MJD */
7484 Set_Node_Offset(ret, parse_start+1);
7485 Set_Node_Cur_Length(ret); /* MJD */
7486 nextchar(pRExC_state);
1f1031fe 7487
81714fb9
YO
7488 }
7489 break;
1f1031fe 7490 }
2bf803e2 7491 case 'g':
a0d0e21e
LW
7492 case '1': case '2': case '3': case '4':
7493 case '5': case '6': case '7': case '8': case '9':
7494 {
c74340f9 7495 I32 num;
2bf803e2
YO
7496 bool isg = *RExC_parse == 'g';
7497 bool isrel = 0;
7498 bool hasbrace = 0;
7499 if (isg) {
c74340f9 7500 RExC_parse++;
2bf803e2
YO
7501 if (*RExC_parse == '{') {
7502 RExC_parse++;
7503 hasbrace = 1;
7504 }
7505 if (*RExC_parse == '-') {
7506 RExC_parse++;
7507 isrel = 1;
7508 }
1f1031fe
YO
7509 if (hasbrace && !isDIGIT(*RExC_parse)) {
7510 if (isrel) RExC_parse--;
7511 RExC_parse -= 2;
7512 goto parse_named_seq;
7513 } }
c74340f9 7514 num = atoi(RExC_parse);
b72d83b2
RGS
7515 if (isg && num == 0)
7516 vFAIL("Reference to invalid group 0");
c74340f9 7517 if (isrel) {
5624f11d 7518 num = RExC_npar - num;
c74340f9
YO
7519 if (num < 1)
7520 vFAIL("Reference to nonexistent or unclosed group");
7521 }
2bf803e2 7522 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7523 goto defchar;
7524 else {
3dab1dad 7525 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7526 while (isDIGIT(*RExC_parse))
7527 RExC_parse++;
1f1031fe
YO
7528 if (parse_start == RExC_parse - 1)
7529 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7530 if (hasbrace) {
7531 if (*RExC_parse != '}')
7532 vFAIL("Unterminated \\g{...} pattern");
7533 RExC_parse++;
7534 }
c74340f9
YO
7535 if (!SIZE_ONLY) {
7536 if (num > (I32)RExC_rx->nparens)
7537 vFAIL("Reference to nonexistent group");
c74340f9 7538 }
830247a4 7539 RExC_sawback = 1;
eb160463
GS
7540 ret = reganode(pRExC_state,
7541 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7542 num);
a0d0e21e 7543 *flagp |= HASWIDTH;
2af232bd 7544
fac92740 7545 /* override incorrect value set in reganode MJD */
2af232bd 7546 Set_Node_Offset(ret, parse_start+1);
fac92740 7547 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7548 RExC_parse--;
7549 nextchar(pRExC_state);
a0d0e21e
LW
7550 }
7551 }
7552 break;
7553 case '\0':
830247a4 7554 if (RExC_parse >= RExC_end)
b45f050a 7555 FAIL("Trailing \\");
a0d0e21e
LW
7556 /* FALL THROUGH */
7557 default:
a0288114 7558 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7559 back into the quick-grab loop below */
45948336 7560 parse_start--;
a0d0e21e
LW
7561 goto defchar;
7562 }
7563 break;
4633a7c4
LW
7564
7565 case '#':
bbe252da 7566 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7567 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7568 goto tryagain;
7569 }
7570 /* FALL THROUGH */
7571
f9a79580
RGS
7572 default:
7573 outer_default:{
ba210ebe 7574 register STRLEN len;
58ae7d3f 7575 register UV ender;
a0d0e21e 7576 register char *p;
3dab1dad 7577 char *s;
80aecb99 7578 STRLEN foldlen;
89ebb4a3 7579 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7580
7581 parse_start = RExC_parse - 1;
a0d0e21e 7582
830247a4 7583 RExC_parse++;
a0d0e21e
LW
7584
7585 defchar:
58ae7d3f 7586 ender = 0;
eb160463
GS
7587 ret = reg_node(pRExC_state,
7588 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7589 s = STRING(ret);
830247a4
IZ
7590 for (len = 0, p = RExC_parse - 1;
7591 len < 127 && p < RExC_end;
a0d0e21e
LW
7592 len++)
7593 {
3dab1dad 7594 char * const oldp = p;
5b5a24f7 7595
bbe252da 7596 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7597 p = regwhite( pRExC_state, p );
f9a79580
RGS
7598 switch ((U8)*p) {
7599 case 0xDF:
7600 case 0xC3:
7601 case 0xCE:
56d400ed 7602 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7603 goto normal_default;
a0d0e21e
LW
7604 case '^':
7605 case '$':
7606 case '.':
7607 case '[':
7608 case '(':
7609 case ')':
7610 case '|':
7611 goto loopdone;
7612 case '\\':
ee9b8eae
YO
7613 /* Literal Escapes Switch
7614
7615 This switch is meant to handle escape sequences that
7616 resolve to a literal character.
7617
7618 Every escape sequence that represents something
7619 else, like an assertion or a char class, is handled
7620 in the switch marked 'Special Escapes' above in this
7621 routine, but also has an entry here as anything that
7622 isn't explicitly mentioned here will be treated as
7623 an unescaped equivalent literal.
7624 */
7625
a0a388a1 7626 switch ((U8)*++p) {
ee9b8eae 7627 /* These are all the special escapes. */
a0a388a1
YO
7628 case 0xDF:
7629 case 0xC3:
7630 case 0xCE:
7631 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7632 goto normal_default;
ee9b8eae
YO
7633 case 'A': /* Start assertion */
7634 case 'b': case 'B': /* Word-boundary assertion*/
7635 case 'C': /* Single char !DANGEROUS! */
7636 case 'd': case 'D': /* digit class */
7637 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7638 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7639 case 'k': case 'K': /* named backref, keep marker */
7640 case 'N': /* named char sequence */
38a44b82 7641 case 'p': case 'P': /* Unicode property */
e1d1eefb 7642 case 'R': /* LNBREAK */
ee9b8eae 7643 case 's': case 'S': /* space class */
e1d1eefb 7644 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7645 case 'w': case 'W': /* word class */
7646 case 'X': /* eXtended Unicode "combining character sequence" */
7647 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7648 --p;
7649 goto loopdone;
ee9b8eae
YO
7650
7651 /* Anything after here is an escape that resolves to a
7652 literal. (Except digits, which may or may not)
7653 */
a0d0e21e
LW
7654 case 'n':
7655 ender = '\n';
7656 p++;
a687059c 7657 break;
a0d0e21e
LW
7658 case 'r':
7659 ender = '\r';
7660 p++;
a687059c 7661 break;
a0d0e21e
LW
7662 case 't':
7663 ender = '\t';
7664 p++;
a687059c 7665 break;
a0d0e21e
LW
7666 case 'f':
7667 ender = '\f';
7668 p++;
a687059c 7669 break;
a0d0e21e 7670 case 'e':
c7f1f016 7671 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7672 p++;
a687059c 7673 break;
a0d0e21e 7674 case 'a':
c7f1f016 7675 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7676 p++;
a687059c 7677 break;
f0a2b745
KW
7678 case 'o':
7679 {
7680 STRLEN brace_len = len;
00c0cb6d 7681 UV result;
454155d9
KW
7682 const char* error_msg;
7683
7684 bool valid = grok_bslash_o(p,
7685 &result,
7686 &brace_len,
7687 &error_msg,
7688 1);
7689 p += brace_len;
7690 if (! valid) {
7691 RExC_parse = p; /* going to die anyway; point
7692 to exact spot of failure */
f0a2b745
KW
7693 vFAIL(error_msg);
7694 }
00c0cb6d
DG
7695 else
7696 {
7697 ender = result;
7698 }
f0a2b745
KW
7699 if (PL_encoding && ender < 0x100) {
7700 goto recode_encoding;
7701 }
7702 if (ender > 0xff) {
62fed28b 7703 REQUIRE_UTF8;
f0a2b745
KW
7704 }
7705 break;
7706 }
a0d0e21e 7707 case 'x':
a0ed51b3 7708 if (*++p == '{') {
1df70142 7709 char* const e = strchr(p, '}');
b81d288d 7710
b45f050a 7711 if (!e) {
830247a4 7712 RExC_parse = p + 1;
b45f050a
JF
7713 vFAIL("Missing right brace on \\x{}");
7714 }
de5f0749 7715 else {
a4c04bdc
NC
7716 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7717 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7718 STRLEN numlen = e - p - 1;
53305cf1 7719 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 7720 if (ender > 0xff)
62fed28b 7721 REQUIRE_UTF8;
a0ed51b3
LW
7722 p = e + 1;
7723 }
a0ed51b3
LW
7724 }
7725 else {
a4c04bdc 7726 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7727 STRLEN numlen = 2;
53305cf1 7728 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7729 p += numlen;
7730 }
9e08bc66
TS
7731 if (PL_encoding && ender < 0x100)
7732 goto recode_encoding;
a687059c 7733 break;
a0d0e21e
LW
7734 case 'c':
7735 p++;
f9d13529 7736 ender = grok_bslash_c(*p++, SIZE_ONLY);
a687059c 7737 break;
a0d0e21e
LW
7738 case '0': case '1': case '2': case '3':case '4':
7739 case '5': case '6': case '7': case '8':case '9':
7740 if (*p == '0' ||
ca67da41 7741 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
7742 {
7743 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 7744 STRLEN numlen = 3;
53305cf1 7745 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 7746 if (ender > 0xff) {
62fed28b 7747 REQUIRE_UTF8;
609122bd 7748 }
a0d0e21e
LW
7749 p += numlen;
7750 }
7751 else {
7752 --p;
7753 goto loopdone;
a687059c 7754 }
9e08bc66
TS
7755 if (PL_encoding && ender < 0x100)
7756 goto recode_encoding;
7757 break;
7758 recode_encoding:
7759 {
7760 SV* enc = PL_encoding;
7761 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7762 if (!enc && SIZE_ONLY)
7763 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 7764 REQUIRE_UTF8;
9e08bc66 7765 }
a687059c 7766 break;
a0d0e21e 7767 case '\0':
830247a4 7768 if (p >= RExC_end)
b45f050a 7769 FAIL("Trailing \\");
a687059c 7770 /* FALL THROUGH */
a0d0e21e 7771 default:
668c081a
NC
7772 if (!SIZE_ONLY&& isALPHA(*p))
7773 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7774 goto normal_default;
a0d0e21e
LW
7775 }
7776 break;
a687059c 7777 default:
a0ed51b3 7778 normal_default:
fd400ab9 7779 if (UTF8_IS_START(*p) && UTF) {
1df70142 7780 STRLEN numlen;
5e12f4fb 7781 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7782 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7783 p += numlen;
7784 }
7785 else
7786 ender = *p++;
a0d0e21e 7787 break;
a687059c 7788 }
bcdf7404
YO
7789 if ( RExC_flags & RXf_PMf_EXTENDED)
7790 p = regwhite( pRExC_state, p );
60a8b682
JH
7791 if (UTF && FOLD) {
7792 /* Prime the casefolded buffer. */
ac7e0132 7793 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7794 }
bcdf7404 7795 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7796 if (len)
7797 p = oldp;
16ea2a2e 7798 else if (UTF) {
80aecb99 7799 if (FOLD) {
60a8b682 7800 /* Emit all the Unicode characters. */
1df70142 7801 STRLEN numlen;
80aecb99
JH
7802 for (foldbuf = tmpbuf;
7803 foldlen;
7804 foldlen -= numlen) {
7805 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7806 if (numlen > 0) {
71207a34 7807 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7808 s += unilen;
7809 len += unilen;
7810 /* In EBCDIC the numlen
7811 * and unilen can differ. */
9dc45d57 7812 foldbuf += numlen;
47654450
JH
7813 if (numlen >= foldlen)
7814 break;
9dc45d57
JH
7815 }
7816 else
7817 break; /* "Can't happen." */
80aecb99
JH
7818 }
7819 }
7820 else {
71207a34 7821 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7822 if (unilen > 0) {
0ebc6274
JH
7823 s += unilen;
7824 len += unilen;
9dc45d57 7825 }
80aecb99 7826 }
a0ed51b3 7827 }
a0d0e21e
LW
7828 else {
7829 len++;
eb160463 7830 REGC((char)ender, s++);
a0d0e21e
LW
7831 }
7832 break;
a687059c 7833 }
16ea2a2e 7834 if (UTF) {
80aecb99 7835 if (FOLD) {
60a8b682 7836 /* Emit all the Unicode characters. */
1df70142 7837 STRLEN numlen;
80aecb99
JH
7838 for (foldbuf = tmpbuf;
7839 foldlen;
7840 foldlen -= numlen) {
7841 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7842 if (numlen > 0) {
71207a34 7843 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7844 len += unilen;
7845 s += unilen;
7846 /* In EBCDIC the numlen
7847 * and unilen can differ. */
9dc45d57 7848 foldbuf += numlen;
47654450
JH
7849 if (numlen >= foldlen)
7850 break;
9dc45d57
JH
7851 }
7852 else
7853 break;
80aecb99
JH
7854 }
7855 }
7856 else {
71207a34 7857 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7858 if (unilen > 0) {
0ebc6274
JH
7859 s += unilen;
7860 len += unilen;
9dc45d57 7861 }
80aecb99
JH
7862 }
7863 len--;
a0ed51b3
LW
7864 }
7865 else
eb160463 7866 REGC((char)ender, s++);
a0d0e21e
LW
7867 }
7868 loopdone:
830247a4 7869 RExC_parse = p - 1;
fac92740 7870 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7871 nextchar(pRExC_state);
793db0cb
JH
7872 {
7873 /* len is STRLEN which is unsigned, need to copy to signed */
7874 IV iv = len;
7875 if (iv < 0)
7876 vFAIL("Internal disaster");
7877 }
a0d0e21e
LW
7878 if (len > 0)
7879 *flagp |= HASWIDTH;
090f7165 7880 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7881 *flagp |= SIMPLE;
3dab1dad 7882
cd439c50 7883 if (SIZE_ONLY)
830247a4 7884 RExC_size += STR_SZ(len);
3dab1dad
YO
7885 else {
7886 STR_LEN(ret) = len;
830247a4 7887 RExC_emit += STR_SZ(len);
07be1b83 7888 }
3dab1dad 7889 }
a0d0e21e
LW
7890 break;
7891 }
a687059c 7892
a0d0e21e 7893 return(ret);
a687059c
LW
7894}
7895
873ef191 7896STATIC char *
bcdf7404 7897S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7898{
bcdf7404 7899 const char *e = RExC_end;
7918f24d
NC
7900
7901 PERL_ARGS_ASSERT_REGWHITE;
7902
5b5a24f7
CS
7903 while (p < e) {
7904 if (isSPACE(*p))
7905 ++p;
7906 else if (*p == '#') {
bcdf7404 7907 bool ended = 0;
5b5a24f7 7908 do {
bcdf7404
YO
7909 if (*p++ == '\n') {
7910 ended = 1;
7911 break;
7912 }
7913 } while (p < e);
7914 if (!ended)
7915 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7916 }
7917 else
7918 break;
7919 }
7920 return p;
7921}
7922
b8c5462f
JH
7923/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7924 Character classes ([:foo:]) can also be negated ([:^foo:]).
7925 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7926 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7927 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7928
7929#define POSIXCC_DONE(c) ((c) == ':')
7930#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7931#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7932
b8c5462f 7933STATIC I32
830247a4 7934S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7935{
97aff369 7936 dVAR;
936ed897 7937 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7938
7918f24d
NC
7939 PERL_ARGS_ASSERT_REGPPOSIXCC;
7940
830247a4 7941 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7942 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7943 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7944 const char c = UCHARAT(RExC_parse);
097eb12c 7945 char* const s = RExC_parse++;
b81d288d 7946
9a86a77b 7947 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7948 RExC_parse++;
7949 if (RExC_parse == RExC_end)
620e46c5 7950 /* Grandfather lone [:, [=, [. */
830247a4 7951 RExC_parse = s;
620e46c5 7952 else {
3dab1dad 7953 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7954 assert(*t == c);
7955
9a86a77b 7956 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7957 const char *posixcc = s + 1;
830247a4 7958 RExC_parse++; /* skip over the ending ] */
3dab1dad 7959
b8c5462f 7960 if (*s == ':') {
1df70142
AL
7961 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7962 const I32 skip = t - posixcc;
80916619
NC
7963
7964 /* Initially switch on the length of the name. */
7965 switch (skip) {
7966 case 4:
3dab1dad
YO
7967 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7968 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7969 break;
80916619
NC
7970 case 5:
7971 /* Names all of length 5. */
7972 /* alnum alpha ascii blank cntrl digit graph lower
7973 print punct space upper */
7974 /* Offset 4 gives the best switch position. */
7975 switch (posixcc[4]) {
7976 case 'a':
3dab1dad
YO
7977 if (memEQ(posixcc, "alph", 4)) /* alpha */
7978 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7979 break;
7980 case 'e':
3dab1dad
YO
7981 if (memEQ(posixcc, "spac", 4)) /* space */
7982 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7983 break;
7984 case 'h':
3dab1dad
YO
7985 if (memEQ(posixcc, "grap", 4)) /* graph */
7986 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7987 break;
7988 case 'i':
3dab1dad
YO
7989 if (memEQ(posixcc, "asci", 4)) /* ascii */
7990 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7991 break;
7992 case 'k':
3dab1dad
YO
7993 if (memEQ(posixcc, "blan", 4)) /* blank */
7994 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7995 break;
7996 case 'l':
3dab1dad
YO
7997 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7998 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7999 break;
8000 case 'm':
3dab1dad
YO
8001 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8002 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
8003 break;
8004 case 'r':
3dab1dad
YO
8005 if (memEQ(posixcc, "lowe", 4)) /* lower */
8006 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8007 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8008 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
8009 break;
8010 case 't':
3dab1dad
YO
8011 if (memEQ(posixcc, "digi", 4)) /* digit */
8012 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8013 else if (memEQ(posixcc, "prin", 4)) /* print */
8014 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8015 else if (memEQ(posixcc, "punc", 4)) /* punct */
8016 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 8017 break;
b8c5462f
JH
8018 }
8019 break;
80916619 8020 case 6:
3dab1dad
YO
8021 if (memEQ(posixcc, "xdigit", 6))
8022 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
8023 break;
8024 }
80916619
NC
8025
8026 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
8027 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8028 t - s - 1, s + 1);
80916619
NC
8029 assert (posixcc[skip] == ':');
8030 assert (posixcc[skip+1] == ']');
b45f050a 8031 } else if (!SIZE_ONLY) {
b8c5462f 8032 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 8033
830247a4 8034 /* adjust RExC_parse so the warning shows after
b45f050a 8035 the class closes */
9a86a77b 8036 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 8037 RExC_parse++;
b45f050a
JF
8038 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8039 }
b8c5462f
JH
8040 } else {
8041 /* Maternal grandfather:
8042 * "[:" ending in ":" but not in ":]" */
830247a4 8043 RExC_parse = s;
767d463e 8044 }
620e46c5
JH
8045 }
8046 }
8047
b8c5462f
JH
8048 return namedclass;
8049}
8050
8051STATIC void
830247a4 8052S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 8053{
97aff369 8054 dVAR;
7918f24d
NC
8055
8056 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8057
3dab1dad 8058 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
8059 const char *s = RExC_parse;
8060 const char c = *s++;
b8c5462f 8061
3dab1dad 8062 while (isALNUM(*s))
b8c5462f
JH
8063 s++;
8064 if (*s && c == *s && s[1] == ']') {
668c081a
NC
8065 ckWARN3reg(s+2,
8066 "POSIX syntax [%c %c] belongs inside character classes",
8067 c, c);
b45f050a
JF
8068
8069 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 8070 if (POSIXCC_NOTYET(c)) {
830247a4 8071 /* adjust RExC_parse so the error shows after
b45f050a 8072 the class closes */
9a86a77b 8073 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 8074 NOOP;
b45f050a
JF
8075 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8076 }
b8c5462f
JH
8077 }
8078 }
620e46c5
JH
8079}
8080
0399b215 8081/* No locale test */
e1d1eefb
YO
8082#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8083ANYOF_##NAME: \
8084 for (value = 0; value < 256; value++) \
8085 if (TEST) \
2283d326 8086 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8087 yesno = '+'; \
8088 what = WORD; \
8089 break; \
8090case ANYOF_N##NAME: \
8091 for (value = 0; value < 256; value++) \
8092 if (!TEST) \
2283d326 8093 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
e1d1eefb
YO
8094 yesno = '!'; \
8095 what = WORD; \
8096 break
89836f1f 8097
a12cf05f
KW
8098/* Like the above, but there are differences if we are in uni-8-bit or not, so
8099 * there are two tests passed in, to use depending on that. There aren't any
8100 * cases where the label is different from the name, so no need for that
8101 * parameter */
0399b215 8102#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
a12cf05f
KW
8103ANYOF_##NAME: \
8104 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8105 else if (UNI_SEMANTICS) { \
8106 for (value = 0; value < 256; value++) { \
2283d326
KW
8107 if (TEST_8) stored += \
8108 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8109 } \
8110 } \
8111 else { \
8112 for (value = 0; value < 256; value++) { \
2283d326
KW
8113 if (TEST_7) stored += \
8114 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8115 } \
8116 } \
8117 yesno = '+'; \
8118 what = WORD; \
8119 break; \
8120case ANYOF_N##NAME: \
8121 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8122 else if (UNI_SEMANTICS) { \
8123 for (value = 0; value < 256; value++) { \
2283d326
KW
8124 if (! TEST_8) stored += \
8125 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8126 } \
8127 } \
8128 else { \
8129 for (value = 0; value < 256; value++) { \
2283d326
KW
8130 if (! TEST_7) stored += \
8131 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
a12cf05f
KW
8132 } \
8133 } \
8134 yesno = '!'; \
8135 what = WORD; \
8136 break
8137
da7fcca4
YO
8138/*
8139 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8140 so that it is possible to override the option here without having to
8141 rebuild the entire core. as we are required to do if we change regcomp.h
8142 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8143*/
8144#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8145#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8146#endif
8147
8148#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8149#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8150#else
8151#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8152#endif
8153
2283d326
KW
8154STATIC U8
8155S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8156{
8157
8158 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8159 * Locale folding is done at run-time, so this function should not be
8160 * called for nodes that are for locales.
8161 *
8162 * This function simply sets the bit corresponding to the fold of the input
8163 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8164 * 'F' is 'f'.
8165 *
8166 * It also sets any necessary flags, and returns the number of bits that
8167 * actually changed from 0 to 1 */
8168
8169 U8 stored = 0;
8170 SV *sv;
8171 U8 fold;
8172
8173 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8174 : PL_fold[value];
8175
8176 /* It assumes the bit for 'value' has already been set */
8177 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8178 ANYOF_BITMAP_SET(node, fold);
8179 stored++;
8180 }
8181
8182 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8183 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8184 * semantics are called for */
8185 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8186 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8187 }
8188 else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
8189 || (! UNI_SEMANTICS
8190 && ! isASCII(value)
8191 && PL_fold_latin1[value] != value))
8192 { /* A character that has a fold outside of Latin1 matches outside the
8193 bitmap, but only when the target string is utf8. Similarly when we
8194 don't have unicode semantics for the above ASCII Latin-1 characters,
8195 and they have a fold, they should match if the target is utf8, and
8196 not otherwise */
8197 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8198 }
8199
8200 return stored;
8201}
8202
8203
8204PERL_STATIC_INLINE U8
8205S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
8206{
8207 /* This inline function sets a bit in the bitmap if not already set, and if
8208 * appropriate, its fold, returning the number of bits that actually
8209 * changed from 0 to 1 */
8210
8211 U8 stored;
8212
8213 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8214 return 0;
8215 }
8216
8217 ANYOF_BITMAP_SET(node, value);
8218 stored = 1;
8219
8220 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8221 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8222 }
8223
8224 return stored;
8225}
8226
7f6f358c
YO
8227/*
8228 parse a class specification and produce either an ANYOF node that
89836f1f
YO
8229 matches the pattern or if the pattern matches a single char only and
8230 that char is < 256 and we are case insensitive then we produce an
8231 EXACT node instead.
7f6f358c 8232*/
89836f1f 8233
76e3520e 8234STATIC regnode *
3dab1dad 8235S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 8236{
97aff369 8237 dVAR;
9a86a77b 8238 register UV nextvalue;
3568d838 8239 register IV prevvalue = OOB_UNICODE;
ffc61ed2 8240 register IV range = 0;
e1d1eefb 8241 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 8242 register regnode *ret;
ba210ebe 8243 STRLEN numlen;
ffc61ed2 8244 IV namedclass;
cbbf8932 8245 char *rangebegin = NULL;
936ed897 8246 bool need_class = 0;
c445ea15 8247 SV *listsv = NULL;
ffc61ed2 8248 UV n;
9e55ce06 8249 bool optimize_invert = TRUE;
cbbf8932 8250 AV* unicode_alternate = NULL;
1b2d223b
JH
8251#ifdef EBCDIC
8252 UV literal_endpoint = 0;
8253#endif
2c63ecad 8254 UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
ffc61ed2 8255
3dab1dad 8256 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 8257 case we need to change the emitted regop to an EXACT. */
07be1b83 8258 const char * orig_parse = RExC_parse;
72f13be8 8259 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8260
8261 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
8262#ifndef DEBUGGING
8263 PERL_UNUSED_ARG(depth);
8264#endif
72f13be8 8265
3dab1dad 8266 DEBUG_PARSE("clas");
7f6f358c
YO
8267
8268 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
8269 ret = reganode(pRExC_state, ANYOF, 0);
8270
8271 if (!SIZE_ONLY)
8272 ANYOF_FLAGS(ret) = 0;
8273
9a86a77b 8274 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
8275 RExC_naughty++;
8276 RExC_parse++;
8277 if (!SIZE_ONLY)
8278 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8279 }
a0d0e21e 8280
73060fc4 8281 if (SIZE_ONLY) {
830247a4 8282 RExC_size += ANYOF_SKIP;
73060fc4
JH
8283 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8284 }
936ed897 8285 else {
830247a4 8286 RExC_emit += ANYOF_SKIP;
936ed897
IZ
8287 if (FOLD)
8288 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8289 if (LOC)
8290 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 8291 ANYOF_BITMAP_ZERO(ret);
396482e1 8292 listsv = newSVpvs("# comment\n");
a0d0e21e 8293 }
b8c5462f 8294
9a86a77b
JH
8295 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8296
b938889d 8297 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 8298 checkposixcc(pRExC_state);
b8c5462f 8299
f064b6ad
HS
8300 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8301 if (UCHARAT(RExC_parse) == ']')
8302 goto charclassloop;
ffc61ed2 8303
fc8cd66c 8304parseit:
9a86a77b 8305 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
8306
8307 charclassloop:
8308
8309 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8310
73b437c8 8311 if (!range)
830247a4 8312 rangebegin = RExC_parse;
ffc61ed2 8313 if (UTF) {
5e12f4fb 8314 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 8315 RExC_end - RExC_parse,
9f7f3913 8316 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8317 RExC_parse += numlen;
8318 }
8319 else
8320 value = UCHARAT(RExC_parse++);
7f6f358c 8321
9a86a77b
JH
8322 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8323 if (value == '[' && POSIXCC(nextvalue))
830247a4 8324 namedclass = regpposixcc(pRExC_state, value);
620e46c5 8325 else if (value == '\\') {
ffc61ed2 8326 if (UTF) {
5e12f4fb 8327 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 8328 RExC_end - RExC_parse,
9f7f3913 8329 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
8330 RExC_parse += numlen;
8331 }
8332 else
8333 value = UCHARAT(RExC_parse++);
470c3474 8334 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 8335 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
8336 * be a problem later if we want switch on Unicode.
8337 * A similar issue a little bit later when switching on
8338 * namedclass. --jhi */
ffc61ed2 8339 switch ((I32)value) {
b8c5462f
JH
8340 case 'w': namedclass = ANYOF_ALNUM; break;
8341 case 'W': namedclass = ANYOF_NALNUM; break;
8342 case 's': namedclass = ANYOF_SPACE; break;
8343 case 'S': namedclass = ANYOF_NSPACE; break;
8344 case 'd': namedclass = ANYOF_DIGIT; break;
8345 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
8346 case 'v': namedclass = ANYOF_VERTWS; break;
8347 case 'V': namedclass = ANYOF_NVERTWS; break;
8348 case 'h': namedclass = ANYOF_HORIZWS; break;
8349 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
8350 case 'N': /* Handle \N{NAME} in class */
8351 {
8352 /* We only pay attention to the first char of
8353 multichar strings being returned. I kinda wonder
8354 if this makes sense as it does change the behaviour
8355 from earlier versions, OTOH that behaviour was broken
8356 as well. */
8357 UV v; /* value is register so we cant & it /grrr */
afefe6bf 8358 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8359 goto parseit;
8360 }
8361 value= v;
8362 }
8363 break;
ffc61ed2
JH
8364 case 'p':
8365 case 'P':
3dab1dad
YO
8366 {
8367 char *e;
af6f566e 8368 if (RExC_parse >= RExC_end)
2a4859cd 8369 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8370 if (*RExC_parse == '{') {
1df70142 8371 const U8 c = (U8)value;
ffc61ed2
JH
8372 e = strchr(RExC_parse++, '}');
8373 if (!e)
0da60cf5 8374 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8375 while (isSPACE(UCHARAT(RExC_parse)))
8376 RExC_parse++;
8377 if (e == RExC_parse)
0da60cf5 8378 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8379 n = e - RExC_parse;
ab13f0c7
JH
8380 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8381 n--;
ffc61ed2
JH
8382 }
8383 else {
8384 e = RExC_parse;
8385 n = 1;
8386 }
8387 if (!SIZE_ONLY) {
ab13f0c7
JH
8388 if (UCHARAT(RExC_parse) == '^') {
8389 RExC_parse++;
8390 n--;
8391 value = value == 'p' ? 'P' : 'p'; /* toggle */
8392 while (isSPACE(UCHARAT(RExC_parse))) {
8393 RExC_parse++;
8394 n--;
8395 }
8396 }
097eb12c
AL
8397 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8398 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8399 }
8400 RExC_parse = e + 1;
08fc12dd
KW
8401
8402 /* The \p could match something in the Latin1 range, hence
8403 * something that isn't utf8 */
3ff7ceb3 8404 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
f81125e2 8405 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8406 }
f81125e2 8407 break;
b8c5462f
JH
8408 case 'n': value = '\n'; break;
8409 case 'r': value = '\r'; break;
8410 case 't': value = '\t'; break;
8411 case 'f': value = '\f'; break;
8412 case 'b': value = '\b'; break;
c7f1f016
NIS
8413 case 'e': value = ASCII_TO_NATIVE('\033');break;
8414 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
8415 case 'o':
8416 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
8417 {
8418 const char* error_msg;
8419 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
8420 &value,
8421 &numlen,
454155d9
KW
8422 &error_msg,
8423 SIZE_ONLY);
8424 RExC_parse += numlen;
8425 if (! valid) {
8426 vFAIL(error_msg);
8427 }
f0a2b745 8428 }
f0a2b745
KW
8429 if (PL_encoding && value < 0x100) {
8430 goto recode_encoding;
8431 }
8432 break;
b8c5462f 8433 case 'x':
ffc61ed2 8434 if (*RExC_parse == '{') {
a4c04bdc
NC
8435 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8436 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8437 char * const e = strchr(RExC_parse++, '}');
b81d288d 8438 if (!e)
ffc61ed2 8439 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8440
8441 numlen = e - RExC_parse;
8442 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8443 RExC_parse = e + 1;
8444 }
8445 else {
a4c04bdc 8446 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8447 numlen = 2;
8448 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8449 RExC_parse += numlen;
8450 }
9e08bc66
TS
8451 if (PL_encoding && value < 0x100)
8452 goto recode_encoding;
b8c5462f
JH
8453 break;
8454 case 'c':
f9d13529 8455 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
b8c5462f
JH
8456 break;
8457 case '0': case '1': case '2': case '3': case '4':
c99e91e9 8458 case '5': case '6': case '7':
9e08bc66 8459 {
c99e91e9
KW
8460 /* Take 1-3 octal digits */
8461 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
8462 numlen = 3;
8463 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8464 RExC_parse += numlen;
8465 if (PL_encoding && value < 0x100)
8466 goto recode_encoding;
8467 break;
8468 }
8469 recode_encoding:
8470 {
8471 SV* enc = PL_encoding;
8472 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8473 if (!enc && SIZE_ONLY)
8474 ckWARNreg(RExC_parse,
8475 "Invalid escape in the specified encoding");
9e08bc66
TS
8476 break;
8477 }
1028017a 8478 default:
c99e91e9
KW
8479 /* Allow \_ to not give an error */
8480 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
8481 ckWARN2reg(RExC_parse,
8482 "Unrecognized escape \\%c in character class passed through",
8483 (int)value);
c99e91e9 8484 }
1028017a 8485 break;
b8c5462f 8486 }
ffc61ed2 8487 } /* end of \blah */
1b2d223b
JH
8488#ifdef EBCDIC
8489 else
8490 literal_endpoint++;
8491#endif
ffc61ed2
JH
8492
8493 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8494
2c63ecad
KW
8495 /* What matches in a locale is not known until runtime, so need to
8496 * (one time per class) allocate extra space to pass to regexec.
8497 * The space will contain a bit for each named class that is to be
8498 * matched against. This isn't needed for \p{} and pseudo-classes,
8499 * as they are not affected by locale, and hence are dealt with
8500 * separately */
8501 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8502 need_class = 1;
8503 if (SIZE_ONLY) {
8504 RExC_size += ANYOF_CLASS_ADD_SKIP;
8505 }
8506 else {
8507 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8508 ANYOF_CLASS_ZERO(ret);
8509 }
9051cfd9 8510 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 8511 }
ffc61ed2
JH
8512
8513 /* a bad range like a-\d, a-[:digit:] ? */
8514 if (range) {
73b437c8 8515 if (!SIZE_ONLY) {
668c081a
NC
8516 const int w =
8517 RExC_parse >= rangebegin ?
8518 RExC_parse - rangebegin : 0;
8519 ckWARN4reg(RExC_parse,
b45f050a 8520 "False [] range \"%*.*s\"",
097eb12c 8521 w, w, rangebegin);
668c081a 8522
3568d838 8523 if (prevvalue < 256) {
2283d326
KW
8524 stored +=
8525 S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8526 stored +=
8527 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
ffc61ed2
JH
8528 }
8529 else {
ef87b810 8530 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
ffc61ed2 8531 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8532 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8533 }
b8c5462f 8534 }
ffc61ed2
JH
8535
8536 range = 0; /* this was not a true range */
73b437c8 8537 }
ffc61ed2 8538
89836f1f
YO
8539
8540
73b437c8 8541 if (!SIZE_ONLY) {
c49a72a9
NC
8542 const char *what = NULL;
8543 char yesno = 0;
8544
3568d838
JH
8545 if (namedclass > OOB_NAMEDCLASS)
8546 optimize_invert = FALSE;
e2962f66
JH
8547 /* Possible truncation here but in some 64-bit environments
8548 * the compiler gets heartburn about switch on 64-bit values.
8549 * A similar issue a little earlier when switching on value.
98f323fa 8550 * --jhi */
e2962f66 8551 switch ((I32)namedclass) {
da7fcca4 8552
0399b215
KW
8553 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8554 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8555 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8556 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8557 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8558 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8559 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8560 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8561 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8562 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
da7fcca4 8563#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
a12cf05f 8564 /* \s, \w match all unicode if utf8. */
0399b215
KW
8565 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8566 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
da7fcca4 8567#else
a12cf05f 8568 /* \s, \w match ascii and locale only */
0399b215
KW
8569 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8570 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
da7fcca4 8571#endif
0399b215 8572 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
e1d1eefb
YO
8573 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8574 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8575 case ANYOF_ASCII:
8576 if (LOC)
936ed897 8577 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8578 else {
c7f1f016 8579#ifndef EBCDIC
1ba5c669 8580 for (value = 0; value < 128; value++)
2283d326
KW
8581 stored +=
8582 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
1ba5c669 8583#else /* EBCDIC */
ffbc6a93 8584 for (value = 0; value < 256; value++) {
3a3c4447 8585 if (isASCII(value))
2283d326 8586 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffbc6a93 8587 }
1ba5c669 8588#endif /* EBCDIC */
73b437c8 8589 }
c49a72a9
NC
8590 yesno = '+';
8591 what = "ASCII";
73b437c8
JH
8592 break;
8593 case ANYOF_NASCII:
8594 if (LOC)
936ed897 8595 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8596 else {
c7f1f016 8597#ifndef EBCDIC
1ba5c669 8598 for (value = 128; value < 256; value++)
2283d326
KW
8599 stored +=
8600 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
1ba5c669 8601#else /* EBCDIC */
ffbc6a93 8602 for (value = 0; value < 256; value++) {
3a3c4447 8603 if (!isASCII(value))
2283d326 8604 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffbc6a93 8605 }
1ba5c669 8606#endif /* EBCDIC */
73b437c8 8607 }
c49a72a9
NC
8608 yesno = '!';
8609 what = "ASCII";
89836f1f 8610 break;
ffc61ed2
JH
8611 case ANYOF_DIGIT:
8612 if (LOC)
8613 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8614 else {
8615 /* consecutive digits assumed */
8616 for (value = '0'; value <= '9'; value++)
2283d326
KW
8617 stored +=
8618 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8619 }
c49a72a9 8620 yesno = '+';
da7fcca4 8621 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8622 break;
8623 case ANYOF_NDIGIT:
8624 if (LOC)
8625 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8626 else {
8627 /* consecutive digits assumed */
8628 for (value = 0; value < '0'; value++)
2283d326
KW
8629 stored +=
8630 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8631 for (value = '9' + 1; value < 256; value++)
2283d326
KW
8632 stored +=
8633 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
ffc61ed2 8634 }
c49a72a9 8635 yesno = '!';
da7fcca4 8636 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8637 break;
f81125e2
JP
8638 case ANYOF_MAX:
8639 /* this is to handle \p and \P */
8640 break;
73b437c8 8641 default:
b45f050a 8642 vFAIL("Invalid [::] class");
73b437c8 8643 break;
b8c5462f 8644 }
c49a72a9
NC
8645 if (what) {
8646 /* Strings such as "+utf8::isWord\n" */
8647 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8648 }
2c63ecad 8649 stored+=2; /* can't optimize this class */
ef87b810
KW
8650
8651 /* All but ASCII can match Unicode characters, but all the ones
8652 * that aren't in utf8 are in the bitmap */
8653 if (namedclass != ANYOF_ASCII) {
8654 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8655 }
73b437c8 8656 continue;
a0d0e21e 8657 }
ffc61ed2
JH
8658 } /* end of namedclass \blah */
8659
a0d0e21e 8660 if (range) {
eb160463 8661 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8662 const int w = RExC_parse - rangebegin;
8663 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8664 range = 0; /* not a valid range */
73b437c8 8665 }
a0d0e21e
LW
8666 }
8667 else {
3568d838 8668 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8669 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8670 RExC_parse[1] != ']') {
8671 RExC_parse++;
ffc61ed2
JH
8672
8673 /* a bad range like \w-, [:word:]- ? */
8674 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8675 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8676 const int w =
afd78fd5
JH
8677 RExC_parse >= rangebegin ?
8678 RExC_parse - rangebegin : 0;
830247a4 8679 vWARN4(RExC_parse,
b45f050a 8680 "False [] range \"%*.*s\"",
097eb12c 8681 w, w, rangebegin);
afd78fd5 8682 }
73b437c8 8683 if (!SIZE_ONLY)
2283d326
KW
8684 stored +=
8685 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
73b437c8 8686 } else
ffc61ed2
JH
8687 range = 1; /* yeah, it's a range! */
8688 continue; /* but do it the next time */
a0d0e21e 8689 }
a687059c 8690 }
ffc61ed2 8691
93733859 8692 /* now is the next time */
07be1b83 8693 /*stored += (value - prevvalue + 1);*/
ae5c130c 8694 if (!SIZE_ONLY) {
3568d838 8695 if (prevvalue < 256) {
1df70142 8696 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8697 IV i;
3568d838 8698#ifdef EBCDIC
1b2d223b
JH
8699 /* In EBCDIC [\x89-\x91] should include
8700 * the \x8e but [i-j] should not. */
8701 if (literal_endpoint == 2 &&
8702 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8703 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8704 {
3568d838
JH
8705 if (isLOWER(prevvalue)) {
8706 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8707 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8708 stored +=
8709 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8710 }
ffc61ed2 8711 } else {
3568d838 8712 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 8713 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8714 stored +=
8715 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
2670d666 8716 }
ffc61ed2 8717 }
8ada0baa 8718 }
ffc61ed2 8719 else
8ada0baa 8720#endif
07be1b83
YO
8721 for (i = prevvalue; i <= ceilvalue; i++) {
8722 if (!ANYOF_BITMAP_TEST(ret,i)) {
2283d326
KW
8723 stored +=
8724 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
07be1b83
YO
8725 }
8726 }
3568d838 8727 }
a5961de5 8728 if (value > 255 || UTF) {
1df70142
AL
8729 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8730 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8731 stored+=2; /* can't optimize this class */
ef87b810
KW
8732
8733 /* If the code point requires utf8 to represent, and we are not
8734 * folding, it can't match unless the target is in utf8. Only
8735 * a few code points above 255 fold to below it, so XXX an
8736 * optimization would be to know which ones and set the flag
8737 * appropriately. */
8738 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8739 ? ANYOF_NONBITMAP
8740 : ANYOF_UTF8;
b08decb7 8741 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8742 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8743 prevnatvalue, natvalue);
8744 }
8745 else if (prevnatvalue == natvalue) {
8746 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8747 if (FOLD) {
89ebb4a3 8748 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8749 STRLEN foldlen;
1df70142 8750 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8751
e294cc5d
JH
8752#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8753 if (RExC_precomp[0] == ':' &&
8754 RExC_precomp[1] == '[' &&
8755 (f == 0xDF || f == 0x92)) {
8756 f = NATIVE_TO_UNI(f);
8757 }
8758#endif
c840d2a2
JH
8759 /* If folding and foldable and a single
8760 * character, insert also the folded version
8761 * to the charclass. */
9e55ce06 8762 if (f != value) {
e294cc5d
JH
8763#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8764 if ((RExC_precomp[0] == ':' &&
8765 RExC_precomp[1] == '[' &&
8766 (f == 0xA2 &&
8767 (value == 0xFB05 || value == 0xFB06))) ?
8768 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8769 foldlen == (STRLEN)UNISKIP(f) )
8770#else
eb160463 8771 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8772#endif
9e55ce06
JH
8773 Perl_sv_catpvf(aTHX_ listsv,
8774 "%04"UVxf"\n", f);
8775 else {
8776 /* Any multicharacter foldings
8777 * require the following transform:
8778 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8779 * where E folds into "pq" and F folds
8780 * into "rst", all other characters
8781 * fold to single characters. We save
8782 * away these multicharacter foldings,
8783 * to be later saved as part of the
8784 * additional "s" data. */
8785 SV *sv;
8786
8787 if (!unicode_alternate)
8788 unicode_alternate = newAV();
740cce10
NC
8789 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8790 TRUE);
9e55ce06
JH
8791 av_push(unicode_alternate, sv);
8792 }
8793 }
254ba52a 8794
60a8b682
JH
8795 /* If folding and the value is one of the Greek
8796 * sigmas insert a few more sigmas to make the
8797 * folding rules of the sigmas to work right.
8798 * Note that not all the possible combinations
8799 * are handled here: some of them are handled
9e55ce06
JH
8800 * by the standard folding rules, and some of
8801 * them (literal or EXACTF cases) are handled
8802 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8803 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8804 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8805 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8806 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8807 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8808 }
8809 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8810 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8811 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8812 }
8813 }
ffc61ed2 8814 }
1b2d223b
JH
8815#ifdef EBCDIC
8816 literal_endpoint = 0;
8817#endif
8ada0baa 8818 }
ffc61ed2
JH
8819
8820 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8821 }
ffc61ed2 8822
ffc61ed2 8823
7f6f358c
YO
8824
8825 if (SIZE_ONLY)
8826 return ret;
8827 /****** !SIZE_ONLY AFTER HERE *********/
8828
3b57cd43 8829 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8830 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8831 ) {
1d5eda9d
KW
8832 /* optimize single char class to an EXACT node but *only* when its not
8833 * a UTF/high char. Note that the information needed to decide to do
8834 * this optimization is not currently available until the 2nd pass, and
8835 * that the actually used EXACT node takes less space than the
8836 * calculated ANYOF node, and hence the amount of space calculated in
8837 * the first pass is larger than actually used. Currently we don't
8838 * keep track of enough information to do this for nodes which contain
8839 * matches outside the bitmap */
07be1b83
YO
8840 const char * cur_parse= RExC_parse;
8841 RExC_emit = (regnode *)orig_emit;
8842 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8843 ret = reg_node(pRExC_state,
8844 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8845 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8846 *STRING(ret)= (char)value;
8847 STR_LEN(ret)= 1;
8848 RExC_emit += STR_SZ(1);
ef8d46e8 8849 SvREFCNT_dec(listsv);
7f6f358c
YO
8850 return ret;
8851 }
ae5c130c 8852 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8853 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8854 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8855 ) {
a0ed51b3 8856 for (value = 0; value < 256; ++value) {
936ed897 8857 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8858 UV fold = PL_fold[value];
ffc61ed2
JH
8859
8860 if (fold != value)
8861 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8862 }
8863 }
936ed897 8864 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8865 }
ffc61ed2 8866
ae5c130c 8867 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8868 if (optimize_invert &&
ffc61ed2
JH
8869 /* If the only flag is inversion. */
8870 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8871 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8872 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8873 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8874 }
7f6f358c 8875 {
097eb12c 8876 AV * const av = newAV();
ffc61ed2 8877 SV *rv;
9e55ce06 8878 /* The 0th element stores the character class description
6a0407ee 8879 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8880 * to initialize the appropriate swash (which gets stored in
8881 * the 1st element), and also useful for dumping the regnode.
8882 * The 2nd element stores the multicharacter foldings,
6a0407ee 8883 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8884 av_store(av, 0, listsv);
8885 av_store(av, 1, NULL);
ad64d0ec
NC
8886 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8887 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8888 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8889 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8890 ARG_SET(ret, n);
a0ed51b3 8891 }
a0ed51b3
LW
8892 return ret;
8893}
89836f1f
YO
8894#undef _C_C_T_
8895
a0ed51b3 8896
bcdf7404
YO
8897/* reg_skipcomment()
8898
8899 Absorbs an /x style # comments from the input stream.
8900 Returns true if there is more text remaining in the stream.
8901 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8902 terminates the pattern without including a newline.
8903
8904 Note its the callers responsibility to ensure that we are
8905 actually in /x mode
8906
8907*/
8908
8909STATIC bool
8910S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8911{
8912 bool ended = 0;
7918f24d
NC
8913
8914 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8915
bcdf7404
YO
8916 while (RExC_parse < RExC_end)
8917 if (*RExC_parse++ == '\n') {
8918 ended = 1;
8919 break;
8920 }
8921 if (!ended) {
8922 /* we ran off the end of the pattern without ending
8923 the comment, so we have to add an \n when wrapping */
8924 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8925 return 0;
8926 } else
8927 return 1;
8928}
8929
8930/* nextchar()
8931
3b753521 8932 Advances the parse position, and optionally absorbs
bcdf7404
YO
8933 "whitespace" from the inputstream.
8934
8935 Without /x "whitespace" means (?#...) style comments only,
8936 with /x this means (?#...) and # comments and whitespace proper.
8937
8938 Returns the RExC_parse point from BEFORE the scan occurs.
8939
8940 This is the /x friendly way of saying RExC_parse++.
8941*/
8942
76e3520e 8943STATIC char*
830247a4 8944S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8945{
097eb12c 8946 char* const retval = RExC_parse++;
a0d0e21e 8947
7918f24d
NC
8948 PERL_ARGS_ASSERT_NEXTCHAR;
8949
4633a7c4 8950 for (;;) {
830247a4
IZ
8951 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8952 RExC_parse[2] == '#') {
e994fd66
AE
8953 while (*RExC_parse != ')') {
8954 if (RExC_parse == RExC_end)
8955 FAIL("Sequence (?#... not terminated");
830247a4 8956 RExC_parse++;
e994fd66 8957 }
830247a4 8958 RExC_parse++;
4633a7c4
LW
8959 continue;
8960 }
bbe252da 8961 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8962 if (isSPACE(*RExC_parse)) {
8963 RExC_parse++;
748a9306
LW
8964 continue;
8965 }
830247a4 8966 else if (*RExC_parse == '#') {
bcdf7404
YO
8967 if ( reg_skipcomment( pRExC_state ) )
8968 continue;
748a9306 8969 }
748a9306 8970 }
4633a7c4 8971 return retval;
a0d0e21e 8972 }
a687059c
LW
8973}
8974
8975/*
c277df42 8976- reg_node - emit a node
a0d0e21e 8977*/
76e3520e 8978STATIC regnode * /* Location. */
830247a4 8979S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8980{
97aff369 8981 dVAR;
c277df42 8982 register regnode *ptr;
504618e9 8983 regnode * const ret = RExC_emit;
07be1b83 8984 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8985
7918f24d
NC
8986 PERL_ARGS_ASSERT_REG_NODE;
8987
c277df42 8988 if (SIZE_ONLY) {
830247a4
IZ
8989 SIZE_ALIGN(RExC_size);
8990 RExC_size += 1;
a0d0e21e
LW
8991 return(ret);
8992 }
3b57cd43
YO
8993 if (RExC_emit >= RExC_emit_bound)
8994 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8995
c277df42 8996 NODE_ALIGN_FILL(ret);
a0d0e21e 8997 ptr = ret;
c277df42 8998 FILL_ADVANCE_NODE(ptr, op);
7122b237 8999#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9000 if (RExC_offsets) { /* MJD */
07be1b83 9001 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 9002 "reg_node", __LINE__,
13d6edb4 9003 PL_reg_name[op],
07be1b83
YO
9004 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9005 ? "Overwriting end of array!\n" : "OK",
9006 (UV)(RExC_emit - RExC_emit_start),
9007 (UV)(RExC_parse - RExC_start),
9008 (UV)RExC_offsets[0]));
ccb2c380 9009 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 9010 }
7122b237 9011#endif
830247a4 9012 RExC_emit = ptr;
a0d0e21e 9013 return(ret);
a687059c
LW
9014}
9015
9016/*
a0d0e21e
LW
9017- reganode - emit a node with an argument
9018*/
76e3520e 9019STATIC regnode * /* Location. */
830247a4 9020S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 9021{
97aff369 9022 dVAR;
c277df42 9023 register regnode *ptr;
504618e9 9024 regnode * const ret = RExC_emit;
07be1b83 9025 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 9026
7918f24d
NC
9027 PERL_ARGS_ASSERT_REGANODE;
9028
c277df42 9029 if (SIZE_ONLY) {
830247a4
IZ
9030 SIZE_ALIGN(RExC_size);
9031 RExC_size += 2;
6bda09f9
YO
9032 /*
9033 We can't do this:
9034
9035 assert(2==regarglen[op]+1);
9036
9037 Anything larger than this has to allocate the extra amount.
9038 If we changed this to be:
9039
9040 RExC_size += (1 + regarglen[op]);
9041
9042 then it wouldn't matter. Its not clear what side effect
9043 might come from that so its not done so far.
9044 -- dmq
9045 */
a0d0e21e
LW
9046 return(ret);
9047 }
3b57cd43
YO
9048 if (RExC_emit >= RExC_emit_bound)
9049 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9050
c277df42 9051 NODE_ALIGN_FILL(ret);
a0d0e21e 9052 ptr = ret;
c277df42 9053 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 9054#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9055 if (RExC_offsets) { /* MJD */
07be1b83 9056 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9057 "reganode",
ccb2c380 9058 __LINE__,
13d6edb4 9059 PL_reg_name[op],
07be1b83 9060 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 9061 "Overwriting end of array!\n" : "OK",
07be1b83
YO
9062 (UV)(RExC_emit - RExC_emit_start),
9063 (UV)(RExC_parse - RExC_start),
9064 (UV)RExC_offsets[0]));
ccb2c380 9065 Set_Cur_Node_Offset;
fac92740 9066 }
7122b237 9067#endif
830247a4 9068 RExC_emit = ptr;
a0d0e21e 9069 return(ret);
fe14fcc3
LW
9070}
9071
9072/*
cd439c50 9073- reguni - emit (if appropriate) a Unicode character
a0ed51b3 9074*/
71207a34
AL
9075STATIC STRLEN
9076S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 9077{
97aff369 9078 dVAR;
7918f24d
NC
9079
9080 PERL_ARGS_ASSERT_REGUNI;
9081
71207a34 9082 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
9083}
9084
9085/*
a0d0e21e
LW
9086- reginsert - insert an operator in front of already-emitted operand
9087*
9088* Means relocating the operand.
9089*/
76e3520e 9090STATIC void
6bda09f9 9091S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 9092{
97aff369 9093 dVAR;
c277df42
IZ
9094 register regnode *src;
9095 register regnode *dst;
9096 register regnode *place;
504618e9 9097 const int offset = regarglen[(U8)op];
6bda09f9 9098 const int size = NODE_STEP_REGNODE + offset;
07be1b83 9099 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9100
9101 PERL_ARGS_ASSERT_REGINSERT;
def51078 9102 PERL_UNUSED_ARG(depth);
22c35a8c 9103/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 9104 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 9105 if (SIZE_ONLY) {
6bda09f9 9106 RExC_size += size;
a0d0e21e
LW
9107 return;
9108 }
a687059c 9109
830247a4 9110 src = RExC_emit;
6bda09f9 9111 RExC_emit += size;
830247a4 9112 dst = RExC_emit;
40d049e4 9113 if (RExC_open_parens) {
6bda09f9 9114 int paren;
3b57cd43 9115 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 9116 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 9117 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 9118 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
9119 RExC_open_parens[paren] += size;
9120 } else {
3b57cd43 9121 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
9122 }
9123 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 9124 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
9125 RExC_close_parens[paren] += size;
9126 } else {
3b57cd43 9127 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
9128 }
9129 }
6bda09f9 9130 }
40d049e4 9131
fac92740 9132 while (src > opnd) {
c277df42 9133 StructCopy(--src, --dst, regnode);
7122b237 9134#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9135 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 9136 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 9137 "reg_insert",
ccb2c380 9138 __LINE__,
13d6edb4 9139 PL_reg_name[op],
07be1b83
YO
9140 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9141 ? "Overwriting end of array!\n" : "OK",
9142 (UV)(src - RExC_emit_start),
9143 (UV)(dst - RExC_emit_start),
9144 (UV)RExC_offsets[0]));
ccb2c380
MP
9145 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9146 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 9147 }
7122b237 9148#endif
fac92740
MJD
9149 }
9150
a0d0e21e
LW
9151
9152 place = opnd; /* Op node, where operand used to be. */
7122b237 9153#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 9154 if (RExC_offsets) { /* MJD */
07be1b83 9155 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 9156 "reginsert",
ccb2c380 9157 __LINE__,
13d6edb4 9158 PL_reg_name[op],
07be1b83 9159 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 9160 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
9161 (UV)(place - RExC_emit_start),
9162 (UV)(RExC_parse - RExC_start),
786e8c11 9163 (UV)RExC_offsets[0]));
ccb2c380 9164 Set_Node_Offset(place, RExC_parse);
45948336 9165 Set_Node_Length(place, 1);
fac92740 9166 }
7122b237 9167#endif
c277df42
IZ
9168 src = NEXTOPER(place);
9169 FILL_ADVANCE_NODE(place, op);
9170 Zero(src, offset, regnode);
a687059c
LW
9171}
9172
9173/*
c277df42 9174- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 9175- SEE ALSO: regtail_study
a0d0e21e 9176*/
097eb12c 9177/* TODO: All three parms should be const */
76e3520e 9178STATIC void
3dab1dad 9179S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 9180{
97aff369 9181 dVAR;
c277df42 9182 register regnode *scan;
72f13be8 9183 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9184
9185 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
9186#ifndef DEBUGGING
9187 PERL_UNUSED_ARG(depth);
9188#endif
a0d0e21e 9189
c277df42 9190 if (SIZE_ONLY)
a0d0e21e
LW
9191 return;
9192
9193 /* Find last node. */
9194 scan = p;
9195 for (;;) {
504618e9 9196 regnode * const temp = regnext(scan);
3dab1dad
YO
9197 DEBUG_PARSE_r({
9198 SV * const mysv=sv_newmortal();
9199 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9200 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
9201 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9202 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9203 (temp == NULL ? "->" : ""),
13d6edb4 9204 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 9205 );
3dab1dad
YO
9206 });
9207 if (temp == NULL)
9208 break;
9209 scan = temp;
9210 }
9211
9212 if (reg_off_by_arg[OP(scan)]) {
9213 ARG_SET(scan, val - scan);
9214 }
9215 else {
9216 NEXT_OFF(scan) = val - scan;
9217 }
9218}
9219
07be1b83 9220#ifdef DEBUGGING
3dab1dad
YO
9221/*
9222- regtail_study - set the next-pointer at the end of a node chain of p to val.
9223- Look for optimizable sequences at the same time.
9224- currently only looks for EXACT chains.
07be1b83
YO
9225
9226This is expermental code. The idea is to use this routine to perform
9227in place optimizations on branches and groups as they are constructed,
9228with the long term intention of removing optimization from study_chunk so
9229that it is purely analytical.
9230
9231Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9232to control which is which.
9233
3dab1dad
YO
9234*/
9235/* TODO: All four parms should be const */
07be1b83 9236
3dab1dad
YO
9237STATIC U8
9238S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9239{
9240 dVAR;
9241 register regnode *scan;
07be1b83
YO
9242 U8 exact = PSEUDO;
9243#ifdef EXPERIMENTAL_INPLACESCAN
9244 I32 min = 0;
9245#endif
3dab1dad
YO
9246 GET_RE_DEBUG_FLAGS_DECL;
9247
7918f24d
NC
9248 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9249
07be1b83 9250
3dab1dad
YO
9251 if (SIZE_ONLY)
9252 return exact;
9253
9254 /* Find last node. */
9255
9256 scan = p;
9257 for (;;) {
9258 regnode * const temp = regnext(scan);
07be1b83
YO
9259#ifdef EXPERIMENTAL_INPLACESCAN
9260 if (PL_regkind[OP(scan)] == EXACT)
9261 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9262 return EXACT;
9263#endif
3dab1dad
YO
9264 if ( exact ) {
9265 switch (OP(scan)) {
9266 case EXACT:
9267 case EXACTF:
9268 case EXACTFL:
9269 if( exact == PSEUDO )
9270 exact= OP(scan);
07be1b83
YO
9271 else if ( exact != OP(scan) )
9272 exact= 0;
3dab1dad
YO
9273 case NOTHING:
9274 break;
9275 default:
9276 exact= 0;
9277 }
9278 }
9279 DEBUG_PARSE_r({
9280 SV * const mysv=sv_newmortal();
9281 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9282 regprop(RExC_rx, mysv, scan);
eaf3ca90 9283 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 9284 SvPV_nolen_const(mysv),
eaf3ca90 9285 REG_NODE_NUM(scan),
13d6edb4 9286 PL_reg_name[exact]);
3dab1dad 9287 });
a0d0e21e
LW
9288 if (temp == NULL)
9289 break;
9290 scan = temp;
9291 }
07be1b83
YO
9292 DEBUG_PARSE_r({
9293 SV * const mysv_val=sv_newmortal();
9294 DEBUG_PARSE_MSG("");
9295 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
9296 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9297 SvPV_nolen_const(mysv_val),
9298 (IV)REG_NODE_NUM(val),
9299 (IV)(val - scan)
07be1b83
YO
9300 );
9301 });
c277df42
IZ
9302 if (reg_off_by_arg[OP(scan)]) {
9303 ARG_SET(scan, val - scan);
a0ed51b3
LW
9304 }
9305 else {
c277df42
IZ
9306 NEXT_OFF(scan) = val - scan;
9307 }
3dab1dad
YO
9308
9309 return exact;
a687059c 9310}
07be1b83 9311#endif
a687059c
LW
9312
9313/*
fd181c75 9314 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 9315 */
f7819f85 9316#ifdef DEBUGGING
c33269f7 9317static void
7918f24d
NC
9318S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9319{
f7819f85
A
9320 int bit;
9321 int set=0;
7918f24d 9322
f7819f85
A
9323 for (bit=0; bit<32; bit++) {
9324 if (flags & (1<<bit)) {
9325 if (!set++ && lead)
9326 PerlIO_printf(Perl_debug_log, "%s",lead);
9327 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9328 }
9329 }
9330 if (lead) {
9331 if (set)
9332 PerlIO_printf(Perl_debug_log, "\n");
9333 else
9334 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9335 }
9336}
9337#endif
9338
a687059c 9339void
097eb12c 9340Perl_regdump(pTHX_ const regexp *r)
a687059c 9341{
35ff7856 9342#ifdef DEBUGGING
97aff369 9343 dVAR;
c445ea15 9344 SV * const sv = sv_newmortal();
ab3bbdeb 9345 SV *dsv= sv_newmortal();
f8fc2ecf 9346 RXi_GET_DECL(r,ri);
f7819f85 9347 GET_RE_DEBUG_FLAGS_DECL;
a687059c 9348
7918f24d
NC
9349 PERL_ARGS_ASSERT_REGDUMP;
9350
f8fc2ecf 9351 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
9352
9353 /* Header fields of interest. */
ab3bbdeb
YO
9354 if (r->anchored_substr) {
9355 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9356 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 9357 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9358 "anchored %s%s at %"IVdf" ",
9359 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 9360 (IV)r->anchored_offset);
ab3bbdeb
YO
9361 } else if (r->anchored_utf8) {
9362 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9363 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 9364 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9365 "anchored utf8 %s%s at %"IVdf" ",
9366 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 9367 (IV)r->anchored_offset);
ab3bbdeb
YO
9368 }
9369 if (r->float_substr) {
9370 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9371 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 9372 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9373 "floating %s%s at %"IVdf"..%"UVuf" ",
9374 s, RE_SV_TAIL(r->float_substr),
7b0972df 9375 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
9376 } else if (r->float_utf8) {
9377 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9378 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 9379 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
9380 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9381 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 9382 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 9383 }
33b8afdf 9384 if (r->check_substr || r->check_utf8)
b81d288d 9385 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
9386 (const char *)
9387 (r->check_substr == r->float_substr
9388 && r->check_utf8 == r->float_utf8
9389 ? "(checking floating" : "(checking anchored"));
bbe252da 9390 if (r->extflags & RXf_NOSCAN)
c277df42 9391 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9392 if (r->extflags & RXf_CHECK_ALL)
c277df42 9393 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9394 if (r->check_substr || r->check_utf8)
c277df42
IZ
9395 PerlIO_printf(Perl_debug_log, ") ");
9396
f8fc2ecf
YO
9397 if (ri->regstclass) {
9398 regprop(r, sv, ri->regstclass);
1de06328 9399 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9400 }
bbe252da 9401 if (r->extflags & RXf_ANCH) {
774d564b 9402 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9403 if (r->extflags & RXf_ANCH_BOL)
774d564b 9404 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9405 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9406 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9407 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9408 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9409 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9410 PerlIO_printf(Perl_debug_log, "(GPOS)");
9411 PerlIO_putc(Perl_debug_log, ' ');
9412 }
bbe252da 9413 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9414 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9415 if (r->intflags & PREGf_SKIP)
760ac839 9416 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9417 if (r->intflags & PREGf_IMPLICIT)
760ac839 9418 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9419 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9420 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9421 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9422 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9423 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9424#else
7918f24d 9425 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9426 PERL_UNUSED_CONTEXT;
65e66c80 9427 PERL_UNUSED_ARG(r);
17c3b450 9428#endif /* DEBUGGING */
a687059c
LW
9429}
9430
9431/*
a0d0e21e
LW
9432- regprop - printable representation of opcode
9433*/
3339dfd8
YO
9434#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9435STMT_START { \
9436 if (do_sep) { \
9437 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9438 if (flags & ANYOF_INVERT) \
9439 /*make sure the invert info is in each */ \
9440 sv_catpvs(sv, "^"); \
9441 do_sep = 0; \
9442 } \
9443} STMT_END
9444
46fc3d4c 9445void
32fc9b6a 9446Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9447{
35ff7856 9448#ifdef DEBUGGING
97aff369 9449 dVAR;
9b155405 9450 register int k;
f8fc2ecf 9451 RXi_GET_DECL(prog,progi);
1de06328 9452 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9453
7918f24d 9454 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9455
76f68e9b 9456 sv_setpvs(sv, "");
8aa23a47 9457
03363afd 9458 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9459 /* It would be nice to FAIL() here, but this may be called from
9460 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9461 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9462 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9463
3dab1dad 9464 k = PL_regkind[OP(o)];
9b155405 9465
2a782b5b 9466 if (k == EXACT) {
f92a2122 9467 sv_catpvs(sv, " ");
ab3bbdeb
YO
9468 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9469 * is a crude hack but it may be the best for now since
9470 * we have no flag "this EXACTish node was UTF-8"
9471 * --jhi */
f92a2122
NC
9472 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9473 PERL_PV_ESCAPE_UNI_DETECT |
9474 PERL_PV_PRETTY_ELLIPSES |
9475 PERL_PV_PRETTY_LTGT |
9476 PERL_PV_PRETTY_NOCLEAR
9477 );
bb263b4e 9478 } else if (k == TRIE) {
3dab1dad 9479 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9480 * progi->data isn't available here */
1de06328 9481 const char op = OP(o);
647f639f 9482 const U32 n = ARG(o);
1de06328 9483 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9484 (reg_ac_data *)progi->data->data[n] :
1de06328 9485 NULL;
3251b653
NC
9486 const reg_trie_data * const trie
9487 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9488
13d6edb4 9489 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9490 DEBUG_TRIE_COMPILE_r(
9491 Perl_sv_catpvf(aTHX_ sv,
9492 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9493 (UV)trie->startstate,
1e2e3d02 9494 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9495 (UV)trie->wordcount,
9496 (UV)trie->minlen,
9497 (UV)trie->maxlen,
9498 (UV)TRIE_CHARCOUNT(trie),
9499 (UV)trie->uniquecharcount
9500 )
9501 );
9502 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9503 int i;
9504 int rangestart = -1;
f46cb337 9505 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9506 sv_catpvs(sv, "[");
1de06328
YO
9507 for (i = 0; i <= 256; i++) {
9508 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9509 if (rangestart == -1)
9510 rangestart = i;
9511 } else if (rangestart != -1) {
9512 if (i <= rangestart + 3)
9513 for (; rangestart < i; rangestart++)
9514 put_byte(sv, rangestart);
9515 else {
9516 put_byte(sv, rangestart);
9517 sv_catpvs(sv, "-");
9518 put_byte(sv, i - 1);
9519 }
9520 rangestart = -1;
9521 }
9522 }
f3a2811a 9523 sv_catpvs(sv, "]");
1de06328
YO
9524 }
9525
a3621e74 9526 } else if (k == CURLY) {
cb434fcc 9527 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9528 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9529 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9530 }
2c2d71f5
JH
9531 else if (k == WHILEM && o->flags) /* Ordinal/of */
9532 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9533 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9534 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9535 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9536 if ( k != REF || OP(o) < NREF) {
502c6561 9537 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9538 SV **name= av_fetch(list, ARG(o), 0 );
9539 if (name)
9540 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9541 }
9542 else {
502c6561 9543 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9544 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9545 I32 *nums=(I32*)SvPVX(sv_dat);
9546 SV **name= av_fetch(list, nums[0], 0 );
9547 I32 n;
9548 if (name) {
9549 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9550 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9551 (n ? "," : ""), (IV)nums[n]);
9552 }
9553 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9554 }
1f1031fe 9555 }
ee9b8eae 9556 }
1f1031fe 9557 } else if (k == GOSUB)
6bda09f9 9558 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9559 else if (k == VERB) {
9560 if (!o->flags)
9561 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9562 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9563 } else if (k == LOGICAL)
04ebc1ab 9564 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9565 else if (k == FOLDCHAR)
df44d732 9566 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9567 else if (k == ANYOF) {
9568 int i, rangestart = -1;
2d03de9c 9569 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9570 int do_sep = 0;
0bd48802
AL
9571
9572 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9573 static const char * const anyofs[] = {
653099ff
GS
9574 "\\w",
9575 "\\W",
9576 "\\s",
9577 "\\S",
9578 "\\d",
9579 "\\D",
9580 "[:alnum:]",
9581 "[:^alnum:]",
9582 "[:alpha:]",
9583 "[:^alpha:]",
9584 "[:ascii:]",
9585 "[:^ascii:]",
24d786f4
YO
9586 "[:cntrl:]",
9587 "[:^cntrl:]",
653099ff
GS
9588 "[:graph:]",
9589 "[:^graph:]",
9590 "[:lower:]",
9591 "[:^lower:]",
9592 "[:print:]",
9593 "[:^print:]",
9594 "[:punct:]",
9595 "[:^punct:]",
9596 "[:upper:]",
aaa51d5e 9597 "[:^upper:]",
653099ff 9598 "[:xdigit:]",
aaa51d5e
JF
9599 "[:^xdigit:]",
9600 "[:space:]",
9601 "[:^space:]",
9602 "[:blank:]",
9603 "[:^blank:]"
653099ff
GS
9604 };
9605
19860706 9606 if (flags & ANYOF_LOCALE)
396482e1 9607 sv_catpvs(sv, "{loc}");
19860706 9608 if (flags & ANYOF_FOLD)
396482e1 9609 sv_catpvs(sv, "{i}");
653099ff 9610 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9611 if (flags & ANYOF_INVERT)
396482e1 9612 sv_catpvs(sv, "^");
3339dfd8
YO
9613
9614 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9615 for (i = 0; i <= 256; i++) {
9616 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9617 if (rangestart == -1)
9618 rangestart = i;
9619 } else if (rangestart != -1) {
9620 if (i <= rangestart + 3)
9621 for (; rangestart < i; rangestart++)
653099ff 9622 put_byte(sv, rangestart);
ffc61ed2
JH
9623 else {
9624 put_byte(sv, rangestart);
396482e1 9625 sv_catpvs(sv, "-");
ffc61ed2 9626 put_byte(sv, i - 1);
653099ff 9627 }
24d786f4 9628 do_sep = 1;
ffc61ed2 9629 rangestart = -1;
653099ff 9630 }
847a199f 9631 }
3339dfd8
YO
9632
9633 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9634 /* output any special charclass tests (used mostly under use locale) */
cefafd73 9635 if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 9636 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9637 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9638 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9639 do_sep = 1;
9640 }
9641
3339dfd8
YO
9642 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9643
9644 /* output information about the unicode matching */
ef87b810 9645 if (flags & ANYOF_UNICODE_ALL)
396482e1 9646 sv_catpvs(sv, "{unicode_all}");
ef87b810
KW
9647 else if (flags & ANYOF_UTF8)
9648 sv_catpvs(sv, "{unicode}");
9649 else if (flags & ANYOF_NONBITMAP)
9650 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2
JH
9651
9652 {
9653 SV *lv;
32fc9b6a 9654 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9655
ffc61ed2
JH
9656 if (lv) {
9657 if (sw) {
89ebb4a3 9658 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9659
ffc61ed2 9660 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9661 uvchr_to_utf8(s, i);
ffc61ed2 9662
3568d838 9663 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9664 if (rangestart == -1)
9665 rangestart = i;
9666 } else if (rangestart != -1) {
ffc61ed2
JH
9667 if (i <= rangestart + 3)
9668 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9669 const U8 * const e = uvchr_to_utf8(s,rangestart);
9670 U8 *p;
9671 for(p = s; p < e; p++)
ffc61ed2
JH
9672 put_byte(sv, *p);
9673 }
9674 else {
2d03de9c
AL
9675 const U8 *e = uvchr_to_utf8(s,rangestart);
9676 U8 *p;
9677 for (p = s; p < e; p++)
ffc61ed2 9678 put_byte(sv, *p);
396482e1 9679 sv_catpvs(sv, "-");
2d03de9c
AL
9680 e = uvchr_to_utf8(s, i-1);
9681 for (p = s; p < e; p++)
1df70142 9682 put_byte(sv, *p);
ffc61ed2
JH
9683 }
9684 rangestart = -1;
9685 }
19860706 9686 }
ffc61ed2 9687
396482e1 9688 sv_catpvs(sv, "..."); /* et cetera */
19860706 9689 }
fde631ed 9690
ffc61ed2 9691 {
2e0de35c 9692 char *s = savesvpv(lv);
c445ea15 9693 char * const origs = s;
b81d288d 9694
3dab1dad
YO
9695 while (*s && *s != '\n')
9696 s++;
b81d288d 9697
ffc61ed2 9698 if (*s == '\n') {
2d03de9c 9699 const char * const t = ++s;
ffc61ed2
JH
9700
9701 while (*s) {
9702 if (*s == '\n')
9703 *s = ' ';
9704 s++;
9705 }
9706 if (s[-1] == ' ')
9707 s[-1] = 0;
9708
9709 sv_catpv(sv, t);
fde631ed 9710 }
b81d288d 9711
ffc61ed2 9712 Safefree(origs);
fde631ed
JH
9713 }
9714 }
653099ff 9715 }
ffc61ed2 9716
653099ff
GS
9717 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9718 }
9b155405 9719 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9720 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9721#else
96a5add6 9722 PERL_UNUSED_CONTEXT;
65e66c80
SP
9723 PERL_UNUSED_ARG(sv);
9724 PERL_UNUSED_ARG(o);
f9049ba1 9725 PERL_UNUSED_ARG(prog);
17c3b450 9726#endif /* DEBUGGING */
35ff7856 9727}
a687059c 9728
cad2e5aa 9729SV *
288b8c02 9730Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9731{ /* Assume that RE_INTUIT is set */
97aff369 9732 dVAR;
288b8c02 9733 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9734 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9735
9736 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9737 PERL_UNUSED_CONTEXT;
9738
a3621e74 9739 DEBUG_COMPILE_r(
cfd0369c 9740 {
2d03de9c 9741 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9742 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9743
9744 if (!PL_colorset) reginitcolors();
9745 PerlIO_printf(Perl_debug_log,
a0288114 9746 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9747 PL_colors[4],
9748 prog->check_substr ? "" : "utf8 ",
9749 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9750 s,
9751 PL_colors[1],
9752 (strlen(s) > 60 ? "..." : ""));
9753 } );
9754
33b8afdf 9755 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9756}
9757
84da74a7 9758/*
f8149455 9759 pregfree()
84da74a7 9760
f8149455
YO
9761 handles refcounting and freeing the perl core regexp structure. When
9762 it is necessary to actually free the structure the first thing it
3b753521 9763 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
9764 the regexp, allowing the handling of the void *pprivate; member
9765 first. (This routine is not overridable by extensions, which is why
9766 the extensions free is called first.)
9767
9768 See regdupe and regdupe_internal if you change anything here.
84da74a7 9769*/
f8149455 9770#ifndef PERL_IN_XSUB_RE
2b69d0c2 9771void
84679df5 9772Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9773{
288b8c02
NC
9774 SvREFCNT_dec(r);
9775}
9776
9777void
9778Perl_pregfree2(pTHX_ REGEXP *rx)
9779{
27da23d5 9780 dVAR;
288b8c02 9781 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9782 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9783
7918f24d
NC
9784 PERL_ARGS_ASSERT_PREGFREE2;
9785
28d8d7f4
YO
9786 if (r->mother_re) {
9787 ReREFCNT_dec(r->mother_re);
9788 } else {
288b8c02 9789 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9790 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9791 }
9792 if (r->substrs) {
ef8d46e8
VP
9793 SvREFCNT_dec(r->anchored_substr);
9794 SvREFCNT_dec(r->anchored_utf8);
9795 SvREFCNT_dec(r->float_substr);
9796 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9797 Safefree(r->substrs);
9798 }
288b8c02 9799 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9800#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9801 SvREFCNT_dec(r->saved_copy);
ed252734 9802#endif
f0ab9afb 9803 Safefree(r->offs);
f8149455 9804}
28d8d7f4
YO
9805
9806/* reg_temp_copy()
9807
9808 This is a hacky workaround to the structural issue of match results
9809 being stored in the regexp structure which is in turn stored in
9810 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9811 could be PL_curpm in multiple contexts, and could require multiple
9812 result sets being associated with the pattern simultaneously, such
9813 as when doing a recursive match with (??{$qr})
9814
9815 The solution is to make a lightweight copy of the regexp structure
9816 when a qr// is returned from the code executed by (??{$qr}) this
9817 lightweight copy doesnt actually own any of its data except for
9818 the starp/end and the actual regexp structure itself.
9819
9820*/
9821
9822
84679df5 9823REGEXP *
f0826785 9824Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9825{
f0826785 9826 struct regexp *ret;
288b8c02 9827 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9828 register const I32 npar = r->nparens+1;
7918f24d
NC
9829
9830 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9831
f0826785
BM
9832 if (!ret_x)
9833 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9834 ret = (struct regexp *)SvANY(ret_x);
9835
288b8c02 9836 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9837 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9838 by pointing directly at the buffer, but flagging that the allocated
9839 space in the copy is zero. As we've just done a struct copy, it's now
9840 a case of zero-ing that, rather than copying the current length. */
9841 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9842 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9843 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9844 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9845 SvLEN_set(ret_x, 0);
b9ad13ac 9846 SvSTASH_set(ret_x, NULL);
703c388d 9847 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9848 Newx(ret->offs, npar, regexp_paren_pair);
9849 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9850 if (r->substrs) {
28d8d7f4 9851 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9852 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9853
9854 SvREFCNT_inc_void(ret->anchored_substr);
9855 SvREFCNT_inc_void(ret->anchored_utf8);
9856 SvREFCNT_inc_void(ret->float_substr);
9857 SvREFCNT_inc_void(ret->float_utf8);
9858
9859 /* check_substr and check_utf8, if non-NULL, point to either their
9860 anchored or float namesakes, and don't hold a second reference. */
486913e4 9861 }
288b8c02 9862 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9863#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9864 ret->saved_copy = NULL;
28d8d7f4 9865#endif
288b8c02 9866 ret->mother_re = rx;
28d8d7f4 9867
288b8c02 9868 return ret_x;
28d8d7f4 9869}
f8149455
YO
9870#endif
9871
9872/* regfree_internal()
9873
9874 Free the private data in a regexp. This is overloadable by
9875 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 9876 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
9877 know about, however of course we have to handle the
9878 regexp_internal structure when no extension is in use.
9879
9880 Note this is called before freeing anything in the regexp
9881 structure.
9882 */
9883
9884void
288b8c02 9885Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9886{
9887 dVAR;
288b8c02 9888 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9889 RXi_GET_DECL(r,ri);
9890 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9891
9892 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9893
f8149455
YO
9894 DEBUG_COMPILE_r({
9895 if (!PL_colorset)
9896 reginitcolors();
9897 {
9898 SV *dsv= sv_newmortal();
3c8556c3 9899 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9900 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9901 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9902 PL_colors[4],PL_colors[5],s);
9903 }
9904 });
7122b237
YO
9905#ifdef RE_TRACK_PATTERN_OFFSETS
9906 if (ri->u.offsets)
9907 Safefree(ri->u.offsets); /* 20010421 MJD */
9908#endif
f8fc2ecf
YO
9909 if (ri->data) {
9910 int n = ri->data->count;
f3548bdc
DM
9911 PAD* new_comppad = NULL;
9912 PAD* old_comppad;
4026c95a 9913 PADOFFSET refcnt;
dfad63ad 9914
c277df42 9915 while (--n >= 0) {
261faec3 9916 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9917 switch (ri->data->what[n]) {
af534a04 9918 case 'a':
c277df42 9919 case 's':
81714fb9 9920 case 'S':
55eed653 9921 case 'u':
ad64d0ec 9922 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9923 break;
653099ff 9924 case 'f':
f8fc2ecf 9925 Safefree(ri->data->data[n]);
653099ff 9926 break;
dfad63ad 9927 case 'p':
502c6561 9928 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9929 break;
c277df42 9930 case 'o':
dfad63ad 9931 if (new_comppad == NULL)
cea2e8a9 9932 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9933 PAD_SAVE_LOCAL(old_comppad,
9934 /* Watch out for global destruction's random ordering. */
c445ea15 9935 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9936 );
b34c0dd4 9937 OP_REFCNT_LOCK;
f8fc2ecf 9938 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9939 OP_REFCNT_UNLOCK;
9940 if (!refcnt)
f8fc2ecf 9941 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9942
f3548bdc 9943 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9944 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9945 new_comppad = NULL;
c277df42
IZ
9946 break;
9947 case 'n':
9e55ce06 9948 break;
07be1b83 9949 case 'T':
be8e71aa
YO
9950 { /* Aho Corasick add-on structure for a trie node.
9951 Used in stclass optimization only */
07be1b83 9952 U32 refcount;
f8fc2ecf 9953 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9954 OP_REFCNT_LOCK;
9955 refcount = --aho->refcount;
9956 OP_REFCNT_UNLOCK;
9957 if ( !refcount ) {
446bd890
NC
9958 PerlMemShared_free(aho->states);
9959 PerlMemShared_free(aho->fail);
446bd890
NC
9960 /* do this last!!!! */
9961 PerlMemShared_free(ri->data->data[n]);
9962 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9963 }
9964 }
9965 break;
a3621e74 9966 case 't':
07be1b83 9967 {
be8e71aa 9968 /* trie structure. */
07be1b83 9969 U32 refcount;
f8fc2ecf 9970 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9971 OP_REFCNT_LOCK;
9972 refcount = --trie->refcount;
9973 OP_REFCNT_UNLOCK;
9974 if ( !refcount ) {
446bd890 9975 PerlMemShared_free(trie->charmap);
446bd890
NC
9976 PerlMemShared_free(trie->states);
9977 PerlMemShared_free(trie->trans);
07be1b83 9978 if (trie->bitmap)
446bd890 9979 PerlMemShared_free(trie->bitmap);
786e8c11 9980 if (trie->jump)
446bd890 9981 PerlMemShared_free(trie->jump);
2e64971a 9982 PerlMemShared_free(trie->wordinfo);
446bd890
NC
9983 /* do this last!!!! */
9984 PerlMemShared_free(ri->data->data[n]);
a3621e74 9985 }
07be1b83
YO
9986 }
9987 break;
c277df42 9988 default:
f8fc2ecf 9989 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9990 }
9991 }
f8fc2ecf
YO
9992 Safefree(ri->data->what);
9993 Safefree(ri->data);
a0d0e21e 9994 }
28d8d7f4 9995
f8fc2ecf 9996 Safefree(ri);
a687059c 9997}
c277df42 9998
a09252eb
NC
9999#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10000#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
10001#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10002
10003/*
32cd70f6 10004 re_dup - duplicate a regexp.
84da74a7 10005
8233f606
DM
10006 This routine is expected to clone a given regexp structure. It is only
10007 compiled under USE_ITHREADS.
32cd70f6 10008
f8149455
YO
10009 After all of the core data stored in struct regexp is duplicated
10010 the regexp_engine.dupe method is used to copy any private data
10011 stored in the *pprivate pointer. This allows extensions to handle
10012 any duplication it needs to do.
10013
10014 See pregfree() and regfree_internal() if you change anything here.
84da74a7 10015*/
a3c0e9ca 10016#if defined(USE_ITHREADS)
f8149455 10017#ifndef PERL_IN_XSUB_RE
288b8c02
NC
10018void
10019Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 10020{
84da74a7 10021 dVAR;
a86a1ca7 10022 I32 npar;
288b8c02
NC
10023 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10024 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 10025
7918f24d
NC
10026 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10027
84da74a7 10028 npar = r->nparens+1;
f0ab9afb
NC
10029 Newx(ret->offs, npar, regexp_paren_pair);
10030 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 10031 if(ret->swap) {
28d8d7f4 10032 /* no need to copy these */
f0ab9afb 10033 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 10034 }
84da74a7 10035
6057429f 10036 if (ret->substrs) {
32cd70f6
NC
10037 /* Do it this way to avoid reading from *r after the StructCopy().
10038 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10039 cache, it doesn't matter. */
66b1de87
NC
10040 const bool anchored = r->check_substr
10041 ? r->check_substr == r->anchored_substr
10042 : r->check_utf8 == r->anchored_utf8;
785a26d5 10043 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
10044 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10045
32cd70f6
NC
10046 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10047 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10048 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10049 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 10050
32cd70f6
NC
10051 /* check_substr and check_utf8, if non-NULL, point to either their
10052 anchored or float namesakes, and don't hold a second reference. */
10053
10054 if (ret->check_substr) {
10055 if (anchored) {
10056 assert(r->check_utf8 == r->anchored_utf8);
10057 ret->check_substr = ret->anchored_substr;
10058 ret->check_utf8 = ret->anchored_utf8;
10059 } else {
10060 assert(r->check_substr == r->float_substr);
10061 assert(r->check_utf8 == r->float_utf8);
10062 ret->check_substr = ret->float_substr;
10063 ret->check_utf8 = ret->float_utf8;
10064 }
66b1de87
NC
10065 } else if (ret->check_utf8) {
10066 if (anchored) {
10067 ret->check_utf8 = ret->anchored_utf8;
10068 } else {
10069 ret->check_utf8 = ret->float_utf8;
10070 }
32cd70f6 10071 }
6057429f 10072 }
f8149455 10073
5daac39c 10074 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 10075
6057429f 10076 if (ret->pprivate)
288b8c02 10077 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 10078
288b8c02 10079 if (RX_MATCH_COPIED(dstr))
6057429f 10080 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
10081 else
10082 ret->subbeg = NULL;
10083#ifdef PERL_OLD_COPY_ON_WRITE
10084 ret->saved_copy = NULL;
10085#endif
6057429f 10086
c2123ae3
NC
10087 if (ret->mother_re) {
10088 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10089 /* Our storage points directly to our mother regexp, but that's
10090 1: a buffer in a different thread
10091 2: something we no longer hold a reference on
10092 so we need to copy it locally. */
10093 /* Note we need to sue SvCUR() on our mother_re, because it, in
10094 turn, may well be pointing to its own mother_re. */
10095 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10096 SvCUR(ret->mother_re)+1));
10097 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10098 }
10099 ret->mother_re = NULL;
10100 }
6057429f 10101 ret->gofs = 0;
f8149455
YO
10102}
10103#endif /* PERL_IN_XSUB_RE */
10104
10105/*
10106 regdupe_internal()
10107
10108 This is the internal complement to regdupe() which is used to copy
10109 the structure pointed to by the *pprivate pointer in the regexp.
10110 This is the core version of the extension overridable cloning hook.
10111 The regexp structure being duplicated will be copied by perl prior
10112 to this and will be provided as the regexp *r argument, however
10113 with the /old/ structures pprivate pointer value. Thus this routine
10114 may override any copying normally done by perl.
10115
10116 It returns a pointer to the new regexp_internal structure.
10117*/
10118
10119void *
288b8c02 10120Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
10121{
10122 dVAR;
288b8c02 10123 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
10124 regexp_internal *reti;
10125 int len, npar;
10126 RXi_GET_DECL(r,ri);
7918f24d
NC
10127
10128 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
10129
10130 npar = r->nparens+1;
7122b237 10131 len = ProgLen(ri);
f8149455 10132
45cf4570 10133 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
10134 Copy(ri->program, reti->program, len+1, regnode);
10135
f8149455 10136
f8fc2ecf 10137 reti->regstclass = NULL;
bcdf7404 10138
f8fc2ecf 10139 if (ri->data) {
84da74a7 10140 struct reg_data *d;
f8fc2ecf 10141 const int count = ri->data->count;
84da74a7
YO
10142 int i;
10143
10144 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10145 char, struct reg_data);
10146 Newx(d->what, count, U8);
10147
10148 d->count = count;
10149 for (i = 0; i < count; i++) {
f8fc2ecf 10150 d->what[i] = ri->data->what[i];
84da74a7 10151 switch (d->what[i]) {
af534a04 10152 /* legal options are one of: sSfpontTua
84da74a7 10153 see also regcomp.h and pregfree() */
af534a04 10154 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 10155 case 's':
81714fb9 10156 case 'S':
0536c0a7 10157 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 10158 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 10159 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 10160 break;
84da74a7
YO
10161 case 'f':
10162 /* This is cheating. */
10163 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 10164 StructCopy(ri->data->data[i], d->data[i],
84da74a7 10165 struct regnode_charclass_class);
f8fc2ecf 10166 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
10167 break;
10168 case 'o':
bbe252da
YO
10169 /* Compiled op trees are readonly and in shared memory,
10170 and can thus be shared without duplication. */
84da74a7 10171 OP_REFCNT_LOCK;
f8fc2ecf 10172 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
10173 OP_REFCNT_UNLOCK;
10174 break;
23eab42c
NC
10175 case 'T':
10176 /* Trie stclasses are readonly and can thus be shared
10177 * without duplication. We free the stclass in pregfree
10178 * when the corresponding reg_ac_data struct is freed.
10179 */
10180 reti->regstclass= ri->regstclass;
10181 /* Fall through */
84da74a7 10182 case 't':
84da74a7 10183 OP_REFCNT_LOCK;
0536c0a7 10184 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 10185 OP_REFCNT_UNLOCK;
0536c0a7
NC
10186 /* Fall through */
10187 case 'n':
10188 d->data[i] = ri->data->data[i];
84da74a7 10189 break;
84da74a7 10190 default:
f8fc2ecf 10191 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
10192 }
10193 }
10194
f8fc2ecf 10195 reti->data = d;
84da74a7
YO
10196 }
10197 else
f8fc2ecf 10198 reti->data = NULL;
84da74a7 10199
cde0cee5
YO
10200 reti->name_list_idx = ri->name_list_idx;
10201
7122b237
YO
10202#ifdef RE_TRACK_PATTERN_OFFSETS
10203 if (ri->u.offsets) {
10204 Newx(reti->u.offsets, 2*len+1, U32);
10205 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10206 }
10207#else
10208 SetProgLen(reti,len);
10209#endif
10210
f8149455 10211 return (void*)reti;
84da74a7 10212}
f8149455
YO
10213
10214#endif /* USE_ITHREADS */
84da74a7 10215
f8149455 10216#ifndef PERL_IN_XSUB_RE
bcdf7404 10217
c277df42
IZ
10218/*
10219 - regnext - dig the "next" pointer out of a node
c277df42
IZ
10220 */
10221regnode *
864dbfa3 10222Perl_regnext(pTHX_ register regnode *p)
c277df42 10223{
97aff369 10224 dVAR;
c277df42
IZ
10225 register I32 offset;
10226
f8fc2ecf 10227 if (!p)
c277df42
IZ
10228 return(NULL);
10229
35db910f
KW
10230 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10231 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10232 }
10233
c277df42
IZ
10234 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10235 if (offset == 0)
10236 return(NULL);
10237
c277df42 10238 return(p+offset);
c277df42 10239}
76234dfb 10240#endif
c277df42 10241
01f988be 10242STATIC void
cea2e8a9 10243S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
10244{
10245 va_list args;
10246 STRLEN l1 = strlen(pat1);
10247 STRLEN l2 = strlen(pat2);
10248 char buf[512];
06bf62c7 10249 SV *msv;
73d840c0 10250 const char *message;
c277df42 10251
7918f24d
NC
10252 PERL_ARGS_ASSERT_RE_CROAK2;
10253
c277df42
IZ
10254 if (l1 > 510)
10255 l1 = 510;
10256 if (l1 + l2 > 510)
10257 l2 = 510 - l1;
10258 Copy(pat1, buf, l1 , char);
10259 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
10260 buf[l1 + l2] = '\n';
10261 buf[l1 + l2 + 1] = '\0';
8736538c
AS
10262#ifdef I_STDARG
10263 /* ANSI variant takes additional second argument */
c277df42 10264 va_start(args, pat2);
8736538c
AS
10265#else
10266 va_start(args);
10267#endif
5a844595 10268 msv = vmess(buf, &args);
c277df42 10269 va_end(args);
cfd0369c 10270 message = SvPV_const(msv,l1);
c277df42
IZ
10271 if (l1 > 512)
10272 l1 = 512;
10273 Copy(message, buf, l1 , char);
197cf9b9 10274 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 10275 Perl_croak(aTHX_ "%s", buf);
c277df42 10276}
a0ed51b3
LW
10277
10278/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10279
76234dfb 10280#ifndef PERL_IN_XSUB_RE
a0ed51b3 10281void
864dbfa3 10282Perl_save_re_context(pTHX)
b81d288d 10283{
97aff369 10284 dVAR;
1ade1aa1
NC
10285
10286 struct re_save_state *state;
10287
10288 SAVEVPTR(PL_curcop);
10289 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10290
10291 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10292 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 10293 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 10294
46ab3289 10295 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 10296
a0ed51b3 10297 PL_reg_start_tmp = 0;
a0ed51b3 10298 PL_reg_start_tmpl = 0;
c445ea15 10299 PL_reg_oldsaved = NULL;
a5db57d6 10300 PL_reg_oldsavedlen = 0;
a5db57d6 10301 PL_reg_maxiter = 0;
a5db57d6 10302 PL_reg_leftiter = 0;
c445ea15 10303 PL_reg_poscache = NULL;
a5db57d6 10304 PL_reg_poscache_size = 0;
1ade1aa1
NC
10305#ifdef PERL_OLD_COPY_ON_WRITE
10306 PL_nrs = NULL;
10307#endif
ada6e8a9 10308
c445ea15
AL
10309 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10310 if (PL_curpm) {
10311 const REGEXP * const rx = PM_GETRE(PL_curpm);
10312 if (rx) {
1df70142 10313 U32 i;
07bc277f 10314 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 10315 char digits[TYPE_CHARS(long)];
d9fad198 10316 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
10317 GV *const *const gvp
10318 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10319
b37c2d43
AL
10320 if (gvp) {
10321 GV * const gv = *gvp;
10322 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10323 save_scalar(gv);
49f27e4b 10324 }
ada6e8a9
AMS
10325 }
10326 }
10327 }
a0ed51b3 10328}
76234dfb 10329#endif
51371543 10330
51371543 10331static void
acfe0abc 10332clear_re(pTHX_ void *r)
51371543 10333{
97aff369 10334 dVAR;
84679df5 10335 ReREFCNT_dec((REGEXP *)r);
51371543 10336}
ffbc6a93 10337
a28509cc
AL
10338#ifdef DEBUGGING
10339
10340STATIC void
10341S_put_byte(pTHX_ SV *sv, int c)
10342{
7918f24d
NC
10343 PERL_ARGS_ASSERT_PUT_BYTE;
10344
7fddd944
NC
10345 /* Our definition of isPRINT() ignores locales, so only bytes that are
10346 not part of UTF-8 are considered printable. I assume that the same
10347 holds for UTF-EBCDIC.
10348 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10349 which Wikipedia says:
10350
10351 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10352 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10353 identical, to the ASCII delete (DEL) or rubout control character.
10354 ) So the old condition can be simplified to !isPRINT(c) */
10355 if (!isPRINT(c))
a28509cc 10356 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 10357 else {
88c9ea1e 10358 const char string = c;
5e7aa789
NC
10359 if (c == '-' || c == ']' || c == '\\' || c == '^')
10360 sv_catpvs(sv, "\\");
10361 sv_catpvn(sv, &string, 1);
10362 }
a28509cc
AL
10363}
10364
786e8c11 10365
3dab1dad
YO
10366#define CLEAR_OPTSTART \
10367 if (optstart) STMT_START { \
70685ca0 10368 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
10369 optstart=NULL; \
10370 } STMT_END
10371
786e8c11 10372#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 10373
b5a2f8d8
NC
10374STATIC const regnode *
10375S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
10376 const regnode *last, const regnode *plast,
10377 SV* sv, I32 indent, U32 depth)
a28509cc 10378{
97aff369 10379 dVAR;
786e8c11 10380 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 10381 register const regnode *next;
3dab1dad 10382 const regnode *optstart= NULL;
1f1031fe 10383
f8fc2ecf 10384 RXi_GET_DECL(r,ri);
3dab1dad 10385 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10386
10387 PERL_ARGS_ASSERT_DUMPUNTIL;
10388
786e8c11
YO
10389#ifdef DEBUG_DUMPUNTIL
10390 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10391 last ? last-start : 0,plast ? plast-start : 0);
10392#endif
10393
10394 if (plast && plast < last)
10395 last= plast;
10396
10397 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 10398 /* While that wasn't END last time... */
a28509cc
AL
10399 NODE_ALIGN(node);
10400 op = OP(node);
de734bd5 10401 if (op == CLOSE || op == WHILEM)
786e8c11 10402 indent--;
b5a2f8d8 10403 next = regnext((regnode *)node);
1f1031fe 10404
a28509cc 10405 /* Where, what. */
8e11feef 10406 if (OP(node) == OPTIMIZED) {
e68ec53f 10407 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10408 optstart = node;
3dab1dad 10409 else
8e11feef 10410 goto after_print;
3dab1dad
YO
10411 } else
10412 CLEAR_OPTSTART;
1f1031fe 10413
32fc9b6a 10414 regprop(r, sv, node);
a28509cc 10415 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10416 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10417
10418 if (OP(node) != OPTIMIZED) {
10419 if (next == NULL) /* Next ptr. */
10420 PerlIO_printf(Perl_debug_log, " (0)");
10421 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10422 PerlIO_printf(Perl_debug_log, " (FAIL)");
10423 else
10424 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10425 (void)PerlIO_putc(Perl_debug_log, '\n');
10426 }
10427
a28509cc
AL
10428 after_print:
10429 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10430 assert(next);
10431 {
10432 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10433 ? regnext((regnode *)next)
10434 : next);
be8e71aa
YO
10435 if (last && nnode > last)
10436 nnode = last;
786e8c11 10437 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10438 }
a28509cc
AL
10439 }
10440 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10441 assert(next);
786e8c11 10442 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10443 }
10444 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10445 const regnode *this_trie = node;
1de06328 10446 const char op = OP(node);
647f639f 10447 const U32 n = ARG(node);
1de06328 10448 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10449 (reg_ac_data *)ri->data->data[n] :
1de06328 10450 NULL;
3251b653
NC
10451 const reg_trie_data * const trie =
10452 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10453#ifdef DEBUGGING
502c6561 10454 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10455#endif
786e8c11 10456 const regnode *nextbranch= NULL;
a28509cc 10457 I32 word_idx;
76f68e9b 10458 sv_setpvs(sv, "");
786e8c11 10459 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10460 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10461
10462 PerlIO_printf(Perl_debug_log, "%*s%s ",
10463 (int)(2*(indent+3)), "",
10464 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10465 PL_colors[0], PL_colors[1],
10466 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10467 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10468 PERL_PV_PRETTY_LTGT
786e8c11
YO
10469 )
10470 : "???"
10471 );
10472 if (trie->jump) {
40d049e4 10473 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10474 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10475 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10476 if (dist) {
10477 if (!nextbranch)
24b23f37 10478 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10479 DUMPUNTIL(this_trie + dist, nextbranch);
10480 }
786e8c11
YO
10481 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10482 nextbranch= regnext((regnode *)nextbranch);
10483 } else {
10484 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10485 }
786e8c11
YO
10486 }
10487 if (last && next > last)
10488 node= last;
10489 else
10490 node= next;
a28509cc 10491 }
786e8c11
YO
10492 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10493 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10494 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10495 }
10496 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10497 assert(next);
786e8c11 10498 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10499 }
10500 else if ( op == PLUS || op == STAR) {
786e8c11 10501 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10502 }
10503 else if (op == ANYOF) {
10504 /* arglen 1 + class block */
4a3ee7a8 10505 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
10506 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10507 node = NEXTOPER(node);
10508 }
10509 else if (PL_regkind[(U8)op] == EXACT) {
10510 /* Literal string, where present. */
10511 node += NODE_SZ_STR(node) - 1;
10512 node = NEXTOPER(node);
10513 }
10514 else {
10515 node = NEXTOPER(node);
10516 node += regarglen[(U8)op];
10517 }
10518 if (op == CURLYX || op == OPEN)
786e8c11 10519 indent++;
a28509cc 10520 }
3dab1dad 10521 CLEAR_OPTSTART;
786e8c11 10522#ifdef DEBUG_DUMPUNTIL
70685ca0 10523 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10524#endif
1de06328 10525 return node;
a28509cc
AL
10526}
10527
10528#endif /* DEBUGGING */
10529
241d1a3b
NC
10530/*
10531 * Local variables:
10532 * c-indentation-style: bsd
10533 * c-basic-offset: 4
10534 * indent-tabs-mode: t
10535 * End:
10536 *
37442d52
RGS
10537 * ex: set ts=8 sts=4 sw=4 noet:
10538 */