This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Eliminated unitialised clone_params value bug accidentally added with
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
3dab1dad
YO
801 dump_trie(trie)
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
816 dump_trie(trie)
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
819*/
820
821STATIC void
822S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823{
824 U32 state;
ab3bbdeb
YO
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
902*/
903STATIC void
904S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905{
906 U32 state;
ab3bbdeb
YO
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
3dab1dad
YO
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
929 if ( tmp ) {
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
931 colwidth,
ddc5bc0f 932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
936 ) ,
1e2e3d02
YO
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 );
940 if (!(charid % 10))
664e119d
RGS
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
1e2e3d02 943 }
ab3bbdeb
YO
944 }
945 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
946 }
947}
948
949/*
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
958{
959 U32 state;
960 U16 charid;
ab3bbdeb
YO
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
963 GET_RE_DEBUG_FLAGS_DECL;
964
965 /*
966 print out the table precompression so that we can do a visual check
967 that they are identical.
968 */
969
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
971
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 974 if ( tmp ) {
ab3bbdeb
YO
975 PerlIO_printf( Perl_debug_log, "%*s",
976 colwidth,
ddc5bc0f 977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
981 )
982 );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
987
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n" );
993
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
995
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1002 if (v)
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1004 else
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1006 }
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1009 } else {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1012 }
1013 }
07be1b83 1014}
3dab1dad
YO
1015
1016#endif
1017
786e8c11
YO
1018/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
3dab1dad 1028
786e8c11 1029Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1030
786e8c11
YO
1031A trie is an N'ary tree where the branches are determined by digital
1032decomposition of the key. IE, at the root node you look up the 1st character and
1033follow that branch repeat until you find the end of the branches. Nodes can be
1034marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1035
786e8c11 1036 /he|she|his|hers/
72f13be8 1037
786e8c11
YO
1038would convert into the following structure. Numbers represent states, letters
1039following numbers represent valid transitions on the letter from that state, if
1040the number is in square brackets it represents an accepting state, otherwise it
1041will be in parenthesis.
07be1b83 1042
786e8c11
YO
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1044 | |
1045 | (2)
1046 | |
1047 (1) +-i->(6)-+-s->[7]
1048 |
1049 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1050
786e8c11
YO
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1052
1053This shows that when matching against the string 'hers' we will begin at state 1
1054read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057single traverse. We store a mapping from accepting to state to which word was
1058matched, and then when we have multiple possibilities we try to complete the
1059rest of the regex in the order in which they occured in the alternation.
1060
1061The only prior NFA like behaviour that would be changed by the TRIE support is
1062the silent ignoring of duplicate alternations which are of the form:
1063
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1065
1066Thus EVAL blocks follwing a trie may be called a different number of times with
1067and without the optimisation. With the optimisations dupes will be silently
1068ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069the following demonstrates:
1070
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1072
1073which prints out 'word' three times, but
1074
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1076
1077which doesnt print it out at all. This is due to other optimisations kicking in.
1078
1079Example of what happens on a structural level:
1080
1081The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1082
1083 1: CURLYM[1] {1,32767}(18)
1084 5: BRANCH(8)
1085 6: EXACT <ac>(16)
1086 8: BRANCH(11)
1087 9: EXACT <ad>(16)
1088 11: BRANCH(14)
1089 12: EXACT <ab>(16)
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095and should turn into:
1096
1097 1: CURLYM[1] {1,32767}(18)
1098 5: TRIE(16)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1100 <ac>
1101 <ad>
1102 <ab>
1103 16: SUCCEED(0)
1104 17: NOTHING(18)
1105 18: END(0)
1106
1107Cases where tail != last would be like /(?foo|bar)baz/:
1108
1109 1: BRANCH(4)
1110 2: EXACT <foo>(8)
1111 4: BRANCH(7)
1112 5: EXACT <bar>(8)
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118and would end up looking like:
1119
1120 1: TRIE(8)
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1122 <foo>
1123 <bar>
1124 7: TAIL(8)
1125 8: EXACT <baz>(10)
1126 10: END(0)
1127
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1129
1130is the recommended Unicode-aware way of saying
1131
1132 *(d++) = uv;
1133*/
1134
1e2e3d02 1135#define TRIE_STORE_REVCHAR \
786e8c11 1136 STMT_START { \
1e2e3d02 1137 SV *tmp = newSVpvs(""); \
786e8c11 1138 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 } STMT_END
1142
1143#define TRIE_READ_CHAR STMT_START { \
1144 wordlen++; \
1145 if ( UTF ) { \
1146 if ( folder ) { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1149 foldlen -= len; \
1150 scan += len; \
1151 len = 0; \
1152 } else { \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1157 } \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 } \
1161 } else { \
1162 uvc = (U32)*uc; \
1163 len = 1; \
1164 } \
1165} STMT_END
1166
1167
1168
1169#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1173 } \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1177} STMT_END
07be1b83 1178
786e8c11
YO
1179#define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1184} STMT_END
07be1b83 1185
786e8c11
YO
1186#define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1189 \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1192 DEBUG_r({ \
1193 /* store the word for dumping */ \
1194 SV* tmp; \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1197 else \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1201 }); \
1202 \
1203 curword++; \
1204 \
1205 if ( noper_next < tail ) { \
1206 if (!trie->jump) \
446bd890 1207 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1208 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1209 if (!jumper) \
1210 jumper = noper_next; \
1211 if (!nextbranch) \
1212 nextbranch= regnext(cur); \
1213 } \
1214 \
1215 if ( dupe ) { \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
446bd890
NC
1221 trie->nextword = \
1222 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1223 while ( trie->nextword[dupe] ) \
1224 dupe= trie->nextword[dupe]; \
1225 trie->nextword[dupe]= curword; \
1226 } else { \
1227 /* we haven't inserted this word yet. */ \
1228 trie->states[ state ].wordnum = curword; \
1229 } \
1230} STMT_END
07be1b83 1231
3dab1dad 1232
786e8c11
YO
1233#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1234 ( ( base + charid >= ucharcount \
1235 && base + charid < ubound \
1236 && state == trie->trans[ base - ucharcount + charid ].check \
1237 && trie->trans[ base - ucharcount + charid ].next ) \
1238 ? trie->trans[ base - ucharcount + charid ].next \
1239 : ( state==1 ? special : 0 ) \
1240 )
3dab1dad 1241
786e8c11
YO
1242#define MADE_TRIE 1
1243#define MADE_JUMP_TRIE 2
1244#define MADE_EXACT_TRIE 4
3dab1dad 1245
a3621e74 1246STATIC I32
786e8c11 1247S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1248{
27da23d5 1249 dVAR;
a3621e74
YO
1250 /* first pass, loop through and scan words */
1251 reg_trie_data *trie;
1252 regnode *cur;
9f7f3913 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1254 STRLEN len = 0;
1255 UV uvc = 0;
1256 U16 curword = 0;
1257 U32 next_alloc = 0;
786e8c11
YO
1258 regnode *jumper = NULL;
1259 regnode *nextbranch = NULL;
7f69552c 1260 regnode *convert = NULL;
a3621e74 1261 /* we just use folder as a flag in utf8 */
e1ec3a88 1262 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1263 ? PL_fold
1264 : ( flags == EXACTFL
1265 ? PL_fold_locale
1266 : NULL
1267 )
1268 );
1269
e1ec3a88 1270 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1271 SV *re_trie_maxbuff;
3dab1dad
YO
1272#ifndef DEBUGGING
1273 /* these are only used during construction but are useful during
8e11feef 1274 * debugging so we store them in the struct when debugging.
8e11feef 1275 */
3dab1dad 1276 STRLEN trie_charcount=0;
3dab1dad
YO
1277 AV *trie_revcharmap;
1278#endif
a3621e74 1279 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1280#ifndef DEBUGGING
1281 PERL_UNUSED_ARG(depth);
1282#endif
a3621e74 1283
446bd890 1284 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1285 trie->refcount = 1;
3dab1dad 1286 trie->startstate = 1;
786e8c11 1287 trie->wordcount = word_count;
f8fc2ecf 1288 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1289 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1290 if (!(UTF && folder))
446bd890 1291 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74
YO
1292 DEBUG_r({
1293 trie->words = newAV();
a3621e74 1294 });
3dab1dad 1295 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1296
0111c4fd 1297 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1298 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1299 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1300 }
3dab1dad
YO
1301 DEBUG_OPTIMISE_r({
1302 PerlIO_printf( Perl_debug_log,
786e8c11 1303 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1304 (int)depth * 2 + 2, "",
1305 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1306 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1307 (int)depth);
3dab1dad 1308 });
7f69552c
YO
1309
1310 /* Find the node we are going to overwrite */
1311 if ( first == startbranch && OP( last ) != BRANCH ) {
1312 /* whole branch chain */
1313 convert = first;
1314 } else {
1315 /* branch sub-chain */
1316 convert = NEXTOPER( first );
1317 }
1318
a3621e74
YO
1319 /* -- First loop and Setup --
1320
1321 We first traverse the branches and scan each word to determine if it
1322 contains widechars, and how many unique chars there are, this is
1323 important as we have to build a table with at least as many columns as we
1324 have unique chars.
1325
1326 We use an array of integers to represent the character codes 0..255
1327 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1328 native representation of the character value as the key and IV's for the
1329 coded index.
1330
1331 *TODO* If we keep track of how many times each character is used we can
1332 remap the columns so that the table compression later on is more
1333 efficient in terms of memory by ensuring most common value is in the
1334 middle and the least common are on the outside. IMO this would be better
1335 than a most to least common mapping as theres a decent chance the most
1336 common letter will share a node with the least common, meaning the node
1337 will not be compressable. With a middle is most common approach the worst
1338 case is when we have the least common nodes twice.
1339
1340 */
1341
a3621e74 1342 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1343 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1344 const U8 *uc = (U8*)STRING( noper );
a28509cc 1345 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1346 STRLEN foldlen = 0;
1347 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1348 const U8 *scan = (U8*)NULL;
07be1b83 1349 U32 wordlen = 0; /* required init */
3dab1dad 1350 STRLEN chars=0;
a3621e74 1351
3dab1dad
YO
1352 if (OP(noper) == NOTHING) {
1353 trie->minlen= 0;
1354 continue;
1355 }
1356 if (trie->bitmap) {
1357 TRIE_BITMAP_SET(trie,*uc);
1358 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 }
a3621e74 1360 for ( ; uc < e ; uc += len ) {
3dab1dad 1361 TRIE_CHARCOUNT(trie)++;
a3621e74 1362 TRIE_READ_CHAR;
3dab1dad 1363 chars++;
a3621e74
YO
1364 if ( uvc < 256 ) {
1365 if ( !trie->charmap[ uvc ] ) {
1366 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 if ( folder )
1368 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 } else {
1372 SV** svpp;
1373 if ( !trie->widecharmap )
1374 trie->widecharmap = newHV();
1375
1376 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1377
1378 if ( !svpp )
e4584336 1379 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1380
1381 if ( !SvTRUE( *svpp ) ) {
1382 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1383 TRIE_STORE_REVCHAR;
a3621e74
YO
1384 }
1385 }
1386 }
3dab1dad
YO
1387 if( cur == first ) {
1388 trie->minlen=chars;
1389 trie->maxlen=chars;
1390 } else if (chars < trie->minlen) {
1391 trie->minlen=chars;
1392 } else if (chars > trie->maxlen) {
1393 trie->maxlen=chars;
1394 }
1395
a3621e74
YO
1396 } /* end first pass */
1397 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1398 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1399 (int)depth * 2 + 2,"",
85c3142d 1400 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1401 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1402 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1403 );
446bd890 1404 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1405
1406 /*
1407 We now know what we are dealing with in terms of unique chars and
1408 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1409 representation using a flat table will take. If it's over a reasonable
1410 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1411 conservative but potentially much slower representation using an array
1412 of lists.
1413
1414 At the end we convert both representations into the same compressed
1415 form that will be used in regexec.c for matching with. The latter
1416 is a form that cannot be used to construct with but has memory
1417 properties similar to the list form and access properties similar
1418 to the table form making it both suitable for fast searches and
1419 small enough that its feasable to store for the duration of a program.
1420
1421 See the comment in the code where the compressed table is produced
1422 inplace from the flat tabe representation for an explanation of how
1423 the compression works.
1424
1425 */
1426
1427
3dab1dad 1428 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1429 /*
1430 Second Pass -- Array Of Lists Representation
1431
1432 Each state will be represented by a list of charid:state records
1433 (reg_trie_trans_le) the first such element holds the CUR and LEN
1434 points of the allocated array. (See defines above).
1435
1436 We build the initial structure using the lists, and then convert
1437 it into the compressed table form which allows faster lookups
1438 (but cant be modified once converted).
a3621e74
YO
1439 */
1440
a3621e74
YO
1441 STRLEN transcount = 1;
1442
1e2e3d02
YO
1443 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1444 "%*sCompiling trie using list compiler\n",
1445 (int)depth * 2 + 2, ""));
446bd890
NC
1446
1447 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1448 sizeof(reg_trie_state) );
a3621e74
YO
1449 TRIE_LIST_NEW(1);
1450 next_alloc = 2;
1451
1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453
c445ea15
AL
1454 regnode * const noper = NEXTOPER( cur );
1455 U8 *uc = (U8*)STRING( noper );
1456 const U8 * const e = uc + STR_LEN( noper );
1457 U32 state = 1; /* required init */
1458 U16 charid = 0; /* sanity init */
1459 U8 *scan = (U8*)NULL; /* sanity init */
1460 STRLEN foldlen = 0; /* required init */
07be1b83 1461 U32 wordlen = 0; /* required init */
c445ea15
AL
1462 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463
3dab1dad 1464 if (OP(noper) != NOTHING) {
786e8c11 1465 for ( ; uc < e ; uc += len ) {
c445ea15 1466
786e8c11 1467 TRIE_READ_CHAR;
c445ea15 1468
786e8c11
YO
1469 if ( uvc < 256 ) {
1470 charid = trie->charmap[ uvc ];
c445ea15 1471 } else {
786e8c11
YO
1472 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1473 if ( !svpp ) {
1474 charid = 0;
1475 } else {
1476 charid=(U16)SvIV( *svpp );
1477 }
c445ea15 1478 }
786e8c11
YO
1479 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1480 if ( charid ) {
a3621e74 1481
786e8c11
YO
1482 U16 check;
1483 U32 newstate = 0;
a3621e74 1484
786e8c11
YO
1485 charid--;
1486 if ( !trie->states[ state ].trans.list ) {
1487 TRIE_LIST_NEW( state );
c445ea15 1488 }
786e8c11
YO
1489 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1490 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1491 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 break;
1493 }
1494 }
1495 if ( ! newstate ) {
1496 newstate = next_alloc++;
1497 TRIE_LIST_PUSH( state, charid, newstate );
1498 transcount++;
1499 }
1500 state = newstate;
1501 } else {
1502 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1503 }
a28509cc 1504 }
c445ea15 1505 }
3dab1dad 1506 TRIE_HANDLE_WORD(state);
a3621e74
YO
1507
1508 } /* end second pass */
1509
1e2e3d02
YO
1510 /* next alloc is the NEXT state to be allocated */
1511 trie->statecount = next_alloc;
446bd890
NC
1512 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1513 * sizeof(reg_trie_state) );
a3621e74 1514
3dab1dad
YO
1515 /* and now dump it out before we compress it */
1516 DEBUG_TRIE_COMPILE_MORE_r(
1517 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1518 );
a3621e74 1519
446bd890
NC
1520 trie->trans
1521 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1522 {
1523 U32 state;
a3621e74
YO
1524 U32 tp = 0;
1525 U32 zp = 0;
1526
1527
1528 for( state=1 ; state < next_alloc ; state ++ ) {
1529 U32 base=0;
1530
1531 /*
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1534 );
1535 */
1536
1537 if (trie->states[state].trans.list) {
1538 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1539 U16 maxid=minid;
a28509cc 1540 U16 idx;
a3621e74
YO
1541
1542 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1543 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1544 if ( forid < minid ) {
1545 minid=forid;
1546 } else if ( forid > maxid ) {
1547 maxid=forid;
1548 }
a3621e74
YO
1549 }
1550 if ( transcount < tp + maxid - minid + 1) {
1551 transcount *= 2;
446bd890
NC
1552 trie->trans
1553 = PerlMemShared_realloc( trie->trans,
1554 transcount
1555 * sizeof(reg_trie_trans) );
a3621e74
YO
1556 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1557 }
1558 base = trie->uniquecharcount + tp - minid;
1559 if ( maxid == minid ) {
1560 U32 set = 0;
1561 for ( ; zp < tp ; zp++ ) {
1562 if ( ! trie->trans[ zp ].next ) {
1563 base = trie->uniquecharcount + zp - minid;
1564 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ zp ].check = state;
1566 set = 1;
1567 break;
1568 }
1569 }
1570 if ( !set ) {
1571 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1572 trie->trans[ tp ].check = state;
1573 tp++;
1574 zp = tp;
1575 }
1576 } else {
1577 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1578 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1579 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1580 trie->trans[ tid ].check = state;
1581 }
1582 tp += ( maxid - minid + 1 );
1583 }
1584 Safefree(trie->states[ state ].trans.list);
1585 }
1586 /*
1587 DEBUG_TRIE_COMPILE_MORE_r(
1588 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1589 );
1590 */
1591 trie->states[ state ].trans.base=base;
1592 }
cc601c31 1593 trie->lasttrans = tp + 1;
a3621e74
YO
1594 }
1595 } else {
1596 /*
1597 Second Pass -- Flat Table Representation.
1598
1599 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1600 We know that we will need Charcount+1 trans at most to store the data
1601 (one row per char at worst case) So we preallocate both structures
1602 assuming worst case.
1603
1604 We then construct the trie using only the .next slots of the entry
1605 structs.
1606
1607 We use the .check field of the first entry of the node temporarily to
1608 make compression both faster and easier by keeping track of how many non
1609 zero fields are in the node.
1610
1611 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1612 transition.
1613
1614 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1615 number representing the first entry of the node, and state as a
1616 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1617 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1618 are 2 entrys per node. eg:
1619
1620 A B A B
1621 1. 2 4 1. 3 7
1622 2. 0 3 3. 0 5
1623 3. 0 0 5. 0 0
1624 4. 0 0 7. 0 0
1625
1626 The table is internally in the right hand, idx form. However as we also
1627 have to deal with the states array which is indexed by nodenum we have to
1628 use TRIE_NODENUM() to convert.
1629
1630 */
1e2e3d02
YO
1631 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1632 "%*sCompiling trie using table compiler\n",
1633 (int)depth * 2 + 2, ""));
3dab1dad 1634
446bd890
NC
1635 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1636 * trie->uniquecharcount + 1,
1637 sizeof(reg_trie_trans) );
1638 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1639 sizeof(reg_trie_state) );
a3621e74
YO
1640 next_alloc = trie->uniquecharcount + 1;
1641
3dab1dad 1642
a3621e74
YO
1643 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1644
c445ea15 1645 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1646 const U8 *uc = (U8*)STRING( noper );
1647 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1648
1649 U32 state = 1; /* required init */
1650
1651 U16 charid = 0; /* sanity init */
1652 U32 accept_state = 0; /* sanity init */
1653 U8 *scan = (U8*)NULL; /* sanity init */
1654
1655 STRLEN foldlen = 0; /* required init */
07be1b83 1656 U32 wordlen = 0; /* required init */
a3621e74
YO
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658
3dab1dad 1659 if ( OP(noper) != NOTHING ) {
786e8c11 1660 for ( ; uc < e ; uc += len ) {
a3621e74 1661
786e8c11 1662 TRIE_READ_CHAR;
a3621e74 1663
786e8c11
YO
1664 if ( uvc < 256 ) {
1665 charid = trie->charmap[ uvc ];
1666 } else {
1667 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1668 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1669 }
786e8c11
YO
1670 if ( charid ) {
1671 charid--;
1672 if ( !trie->trans[ state + charid ].next ) {
1673 trie->trans[ state + charid ].next = next_alloc;
1674 trie->trans[ state ].check++;
1675 next_alloc += trie->uniquecharcount;
1676 }
1677 state = trie->trans[ state + charid ].next;
1678 } else {
1679 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1680 }
1681 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1682 }
a3621e74 1683 }
3dab1dad
YO
1684 accept_state = TRIE_NODENUM( state );
1685 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1686
1687 } /* end second pass */
1688
3dab1dad
YO
1689 /* and now dump it out before we compress it */
1690 DEBUG_TRIE_COMPILE_MORE_r(
1691 dump_trie_interim_table(trie,next_alloc,depth+1)
1692 );
a3621e74 1693
a3621e74
YO
1694 {
1695 /*
1696 * Inplace compress the table.*
1697
1698 For sparse data sets the table constructed by the trie algorithm will
1699 be mostly 0/FAIL transitions or to put it another way mostly empty.
1700 (Note that leaf nodes will not contain any transitions.)
1701
1702 This algorithm compresses the tables by eliminating most such
1703 transitions, at the cost of a modest bit of extra work during lookup:
1704
1705 - Each states[] entry contains a .base field which indicates the
1706 index in the state[] array wheres its transition data is stored.
1707
1708 - If .base is 0 there are no valid transitions from that node.
1709
1710 - If .base is nonzero then charid is added to it to find an entry in
1711 the trans array.
1712
1713 -If trans[states[state].base+charid].check!=state then the
1714 transition is taken to be a 0/Fail transition. Thus if there are fail
1715 transitions at the front of the node then the .base offset will point
1716 somewhere inside the previous nodes data (or maybe even into a node
1717 even earlier), but the .check field determines if the transition is
1718 valid.
1719
786e8c11 1720 XXX - wrong maybe?
a3621e74
YO
1721 The following process inplace converts the table to the compressed
1722 table: We first do not compress the root node 1,and mark its all its
1723 .check pointers as 1 and set its .base pointer as 1 as well. This
1724 allows to do a DFA construction from the compressed table later, and
1725 ensures that any .base pointers we calculate later are greater than
1726 0.
1727
1728 - We set 'pos' to indicate the first entry of the second node.
1729
1730 - We then iterate over the columns of the node, finding the first and
1731 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1732 and set the .check pointers accordingly, and advance pos
1733 appropriately and repreat for the next node. Note that when we copy
1734 the next pointers we have to convert them from the original
1735 NODEIDX form to NODENUM form as the former is not valid post
1736 compression.
1737
1738 - If a node has no transitions used we mark its base as 0 and do not
1739 advance the pos pointer.
1740
1741 - If a node only has one transition we use a second pointer into the
1742 structure to fill in allocated fail transitions from other states.
1743 This pointer is independent of the main pointer and scans forward
1744 looking for null transitions that are allocated to a state. When it
1745 finds one it writes the single transition into the "hole". If the
786e8c11 1746 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1747
1748 - Once compressed we can Renew/realloc the structures to release the
1749 excess space.
1750
1751 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1752 specifically Fig 3.47 and the associated pseudocode.
1753
1754 demq
1755 */
a3b680e6 1756 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1757 U32 state, charid;
a3621e74 1758 U32 pos = 0, zp=0;
1e2e3d02 1759 trie->statecount = laststate;
a3621e74
YO
1760
1761 for ( state = 1 ; state < laststate ; state++ ) {
1762 U8 flag = 0;
a28509cc
AL
1763 const U32 stateidx = TRIE_NODEIDX( state );
1764 const U32 o_used = trie->trans[ stateidx ].check;
1765 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1766 trie->trans[ stateidx ].check = 0;
1767
1768 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1769 if ( flag || trie->trans[ stateidx + charid ].next ) {
1770 if ( trie->trans[ stateidx + charid ].next ) {
1771 if (o_used == 1) {
1772 for ( ; zp < pos ; zp++ ) {
1773 if ( ! trie->trans[ zp ].next ) {
1774 break;
1775 }
1776 }
1777 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1778 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ zp ].check = state;
1780 if ( ++zp > pos ) pos = zp;
1781 break;
1782 }
1783 used--;
1784 }
1785 if ( !flag ) {
1786 flag = 1;
1787 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1788 }
1789 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ pos ].check = state;
1791 pos++;
1792 }
1793 }
1794 }
cc601c31 1795 trie->lasttrans = pos + 1;
446bd890
NC
1796 trie->states = PerlMemShared_realloc( trie->states, laststate
1797 * sizeof(reg_trie_state) );
a3621e74 1798 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1799 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1800 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1801 (int)depth * 2 + 2,"",
1802 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1803 (IV)next_alloc,
1804 (IV)pos,
a3621e74
YO
1805 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1806 );
1807
1808 } /* end table compress */
1809 }
1e2e3d02
YO
1810 DEBUG_TRIE_COMPILE_MORE_r(
1811 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1812 (int)depth * 2 + 2, "",
1813 (UV)trie->statecount,
1814 (UV)trie->lasttrans)
1815 );
cc601c31 1816 /* resize the trans array to remove unused space */
446bd890
NC
1817 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1818 * sizeof(reg_trie_trans) );
a3621e74 1819
3dab1dad
YO
1820 /* and now dump out the compressed format */
1821 DEBUG_TRIE_COMPILE_r(
1822 dump_trie(trie,depth+1)
1823 );
07be1b83 1824
3dab1dad 1825 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1826 U8 nodetype =(U8)(flags & 0xFF);
1827 char *str=NULL;
786e8c11 1828
07be1b83 1829#ifdef DEBUGGING
e62cc96a 1830 regnode *optimize = NULL;
b57a0404
JH
1831 U32 mjd_offset = 0;
1832 U32 mjd_nodelen = 0;
07be1b83 1833#endif
a3621e74 1834 /*
3dab1dad
YO
1835 This means we convert either the first branch or the first Exact,
1836 depending on whether the thing following (in 'last') is a branch
1837 or not and whther first is the startbranch (ie is it a sub part of
1838 the alternation or is it the whole thing.)
1839 Assuming its a sub part we conver the EXACT otherwise we convert
1840 the whole branch sequence, including the first.
a3621e74 1841 */
3dab1dad 1842 /* Find the node we are going to overwrite */
7f69552c 1843 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1844 /* branch sub-chain */
3dab1dad 1845 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1846 DEBUG_r({
1847 mjd_offset= Node_Offset((convert));
1848 mjd_nodelen= Node_Length((convert));
1849 });
7f69552c
YO
1850 /* whole branch chain */
1851 } else {
1852 DEBUG_r({
1853 const regnode *nop = NEXTOPER( convert );
1854 mjd_offset= Node_Offset((nop));
1855 mjd_nodelen= Node_Length((nop));
1856 });
07be1b83 1857 }
7f69552c 1858
07be1b83
YO
1859 DEBUG_OPTIMISE_r(
1860 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1861 (int)depth * 2 + 2, "",
786e8c11 1862 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1863 );
a3621e74 1864
3dab1dad
YO
1865 /* But first we check to see if there is a common prefix we can
1866 split out as an EXACT and put in front of the TRIE node. */
1867 trie->startstate= 1;
786e8c11 1868 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1869 U32 state;
1e2e3d02 1870 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1871 U32 ofs = 0;
8e11feef
RGS
1872 I32 idx = -1;
1873 U32 count = 0;
1874 const U32 base = trie->states[ state ].trans.base;
a3621e74 1875
3dab1dad 1876 if ( trie->states[state].wordnum )
8e11feef 1877 count = 1;
a3621e74 1878
8e11feef 1879 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1880 if ( ( base + ofs >= trie->uniquecharcount ) &&
1881 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1882 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1883 {
3dab1dad 1884 if ( ++count > 1 ) {
8e11feef 1885 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1886 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887 if ( state == 1 ) break;
3dab1dad
YO
1888 if ( count == 2 ) {
1889 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1890 DEBUG_OPTIMISE_r(
8e11feef
RGS
1891 PerlIO_printf(Perl_debug_log,
1892 "%*sNew Start State=%"UVuf" Class: [",
1893 (int)depth * 2 + 2, "",
786e8c11 1894 (UV)state));
be8e71aa
YO
1895 if (idx >= 0) {
1896 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898
3dab1dad 1899 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1900 if ( folder )
1901 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1902 DEBUG_OPTIMISE_r(
07be1b83 1903 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1904 );
8e11feef
RGS
1905 }
1906 }
1907 TRIE_BITMAP_SET(trie,*ch);
1908 if ( folder )
1909 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1910 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1911 }
1912 idx = ofs;
1913 }
3dab1dad
YO
1914 }
1915 if ( count == 1 ) {
1916 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1917 char *ch = SvPV_nolen( *tmp );
1918 DEBUG_OPTIMISE_r({
1919 SV *sv=sv_newmortal();
8e11feef
RGS
1920 PerlIO_printf( Perl_debug_log,
1921 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1922 (int)depth * 2 + 2, "",
de734bd5
A
1923 (UV)state, (UV)idx,
1924 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1925 PL_colors[0], PL_colors[1],
1926 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1927 PERL_PV_ESCAPE_FIRSTCHAR
1928 )
1929 );
1930 });
3dab1dad
YO
1931 if ( state==1 ) {
1932 OP( convert ) = nodetype;
1933 str=STRING(convert);
1934 STR_LEN(convert)=0;
1935 }
de734bd5
A
1936 while (*ch) {
1937 *str++ = *ch++;
1938 STR_LEN(convert)++;
1939 }
1940
8e11feef 1941 } else {
f9049ba1 1942#ifdef DEBUGGING
8e11feef
RGS
1943 if (state>1)
1944 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1945#endif
8e11feef
RGS
1946 break;
1947 }
1948 }
3dab1dad 1949 if (str) {
8e11feef 1950 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1951 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1952 trie->startstate = state;
07be1b83
YO
1953 trie->minlen -= (state - 1);
1954 trie->maxlen -= (state - 1);
1955 DEBUG_r({
1956 regnode *fix = convert;
de734bd5 1957 U32 word = trie->wordcount;
07be1b83
YO
1958 mjd_nodelen++;
1959 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1960 while( ++fix < n ) {
1961 Set_Node_Offset_Length(fix, 0, 0);
1962 }
de734bd5
A
1963 while (word--) {
1964 SV ** const tmp = av_fetch( trie->words, word, 0 );
1965 if (tmp) {
1966 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1967 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1968 else
1969 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1970 }
1971 }
07be1b83 1972 });
8e11feef
RGS
1973 if (trie->maxlen) {
1974 convert = n;
1975 } else {
3dab1dad 1976 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1977 DEBUG_r(optimize= n);
3dab1dad
YO
1978 }
1979 }
1980 }
a5ca303d
YO
1981 if (!jumper)
1982 jumper = last;
3dab1dad 1983 if ( trie->maxlen ) {
8e11feef
RGS
1984 NEXT_OFF( convert ) = (U16)(tail - convert);
1985 ARG_SET( convert, data_slot );
786e8c11
YO
1986 /* Store the offset to the first unabsorbed branch in
1987 jump[0], which is otherwise unused by the jump logic.
1988 We use this when dumping a trie and during optimisation. */
1989 if (trie->jump)
7f69552c 1990 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1991
786e8c11
YO
1992 /* XXXX */
1993 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1994 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1995 {
1996 OP( convert ) = TRIEC;
1997 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 1998 PerlMemShared_free(trie->bitmap);
786e8c11
YO
1999 trie->bitmap= NULL;
2000 } else
2001 OP( convert ) = TRIE;
a3621e74 2002
3dab1dad
YO
2003 /* store the type in the flags */
2004 convert->flags = nodetype;
a5ca303d
YO
2005 DEBUG_r({
2006 optimize = convert
2007 + NODE_STEP_REGNODE
2008 + regarglen[ OP( convert ) ];
2009 });
2010 /* XXX We really should free up the resource in trie now,
2011 as we won't use them - (which resources?) dmq */
3dab1dad 2012 }
a3621e74 2013 /* needed for dumping*/
e62cc96a 2014 DEBUG_r(if (optimize) {
07be1b83 2015 regnode *opt = convert;
e62cc96a 2016 while ( ++opt < optimize) {
07be1b83
YO
2017 Set_Node_Offset_Length(opt,0,0);
2018 }
786e8c11
YO
2019 /*
2020 Try to clean up some of the debris left after the
2021 optimisation.
a3621e74 2022 */
786e8c11 2023 while( optimize < jumper ) {
07be1b83 2024 mjd_nodelen += Node_Length((optimize));
a3621e74 2025 OP( optimize ) = OPTIMIZED;
07be1b83 2026 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2027 optimize++;
2028 }
07be1b83 2029 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2030 });
2031 } /* end node insert */
07be1b83 2032#ifndef DEBUGGING
6e8b4190 2033 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2034#endif
786e8c11
YO
2035 return trie->jump
2036 ? MADE_JUMP_TRIE
2037 : trie->startstate>1
2038 ? MADE_EXACT_TRIE
2039 : MADE_TRIE;
2040}
2041
2042STATIC void
2043S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2044{
2045/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2046
2047 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2048 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2049 ISBN 0-201-10088-6
2050
2051 We find the fail state for each state in the trie, this state is the longest proper
2052 suffix of the current states 'word' that is also a proper prefix of another word in our
2053 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2054 the DFA not to have to restart after its tried and failed a word at a given point, it
2055 simply continues as though it had been matching the other word in the first place.
2056 Consider
2057 'abcdgu'=~/abcdefg|cdgu/
2058 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2059 fail, which would bring use to the state representing 'd' in the second word where we would
2060 try 'g' and succeed, prodceding to match 'cdgu'.
2061 */
2062 /* add a fail transition */
f8fc2ecf 2063 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[ARG(source)];
786e8c11
YO
2064 U32 *q;
2065 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2066 const U32 numstates = trie->statecount;
786e8c11
YO
2067 const U32 ubound = trie->lasttrans + ucharcount;
2068 U32 q_read = 0;
2069 U32 q_write = 0;
2070 U32 charid;
2071 U32 base = trie->states[ 1 ].trans.base;
2072 U32 *fail;
2073 reg_ac_data *aho;
2074 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2075 GET_RE_DEBUG_FLAGS_DECL;
2076#ifndef DEBUGGING
2077 PERL_UNUSED_ARG(depth);
2078#endif
2079
2080
2081 ARG_SET( stclass, data_slot );
446bd890 2082 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2083 RExC_rxi->data->data[ data_slot ] = (void*)aho;
786e8c11 2084 aho->trie=trie;
446bd890
NC
2085 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2086 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2087 Newxz( q, numstates, U32);
446bd890 2088 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2089 aho->refcount = 1;
2090 fail = aho->fail;
2091 /* initialize fail[0..1] to be 1 so that we always have
2092 a valid final fail state */
2093 fail[ 0 ] = fail[ 1 ] = 1;
2094
2095 for ( charid = 0; charid < ucharcount ; charid++ ) {
2096 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2097 if ( newstate ) {
2098 q[ q_write ] = newstate;
2099 /* set to point at the root */
2100 fail[ q[ q_write++ ] ]=1;
2101 }
2102 }
2103 while ( q_read < q_write) {
2104 const U32 cur = q[ q_read++ % numstates ];
2105 base = trie->states[ cur ].trans.base;
2106
2107 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2108 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2109 if (ch_state) {
2110 U32 fail_state = cur;
2111 U32 fail_base;
2112 do {
2113 fail_state = fail[ fail_state ];
2114 fail_base = aho->states[ fail_state ].trans.base;
2115 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2116
2117 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2118 fail[ ch_state ] = fail_state;
2119 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2120 {
2121 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2122 }
2123 q[ q_write++ % numstates] = ch_state;
2124 }
2125 }
2126 }
2127 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2128 when we fail in state 1, this allows us to use the
2129 charclass scan to find a valid start char. This is based on the principle
2130 that theres a good chance the string being searched contains lots of stuff
2131 that cant be a start char.
2132 */
2133 fail[ 0 ] = fail[ 1 ] = 0;
2134 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2135 PerlIO_printf(Perl_debug_log,
2136 "%*sStclass Failtable (%"UVuf" states): 0",
2137 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2138 );
786e8c11
YO
2139 for( q_read=1; q_read<numstates; q_read++ ) {
2140 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2141 }
2142 PerlIO_printf(Perl_debug_log, "\n");
2143 });
2144 Safefree(q);
2145 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2146}
2147
786e8c11 2148
a3621e74 2149/*
5d1c421c
JH
2150 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2151 * These need to be revisited when a newer toolchain becomes available.
2152 */
2153#if defined(__sparc64__) && defined(__GNUC__)
2154# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2155# undef SPARC64_GCC_WORKAROUND
2156# define SPARC64_GCC_WORKAROUND 1
2157# endif
2158#endif
2159
07be1b83 2160#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2161 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2162 SV * const mysv=sv_newmortal(); \
2163 regnode *Next = regnext(scan); \
2164 regprop(RExC_rx, mysv, scan); \
7f69552c 2165 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2166 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2167 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2168 }});
07be1b83 2169
1de06328
YO
2170
2171
2172
2173
07be1b83
YO
2174#define JOIN_EXACT(scan,min,flags) \
2175 if (PL_regkind[OP(scan)] == EXACT) \
2176 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2177
be8e71aa 2178STATIC U32
07be1b83
YO
2179S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2180 /* Merge several consecutive EXACTish nodes into one. */
2181 regnode *n = regnext(scan);
2182 U32 stringok = 1;
2183 regnode *next = scan + NODE_SZ_STR(scan);
2184 U32 merged = 0;
2185 U32 stopnow = 0;
2186#ifdef DEBUGGING
2187 regnode *stop = scan;
72f13be8 2188 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2189#else
d47053eb
RGS
2190 PERL_UNUSED_ARG(depth);
2191#endif
2192#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2193 PERL_UNUSED_ARG(flags);
2194 PERL_UNUSED_ARG(val);
07be1b83 2195#endif
07be1b83
YO
2196 DEBUG_PEEP("join",scan,depth);
2197
2198 /* Skip NOTHING, merge EXACT*. */
2199 while (n &&
2200 ( PL_regkind[OP(n)] == NOTHING ||
2201 (stringok && (OP(n) == OP(scan))))
2202 && NEXT_OFF(n)
2203 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2204
2205 if (OP(n) == TAIL || n > next)
2206 stringok = 0;
2207 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2208 DEBUG_PEEP("skip:",n,depth);
2209 NEXT_OFF(scan) += NEXT_OFF(n);
2210 next = n + NODE_STEP_REGNODE;
2211#ifdef DEBUGGING
2212 if (stringok)
2213 stop = n;
2214#endif
2215 n = regnext(n);
2216 }
2217 else if (stringok) {
786e8c11 2218 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2219 regnode * const nnext = regnext(n);
2220
2221 DEBUG_PEEP("merg",n,depth);
2222
2223 merged++;
2224 if (oldl + STR_LEN(n) > U8_MAX)
2225 break;
2226 NEXT_OFF(scan) += NEXT_OFF(n);
2227 STR_LEN(scan) += STR_LEN(n);
2228 next = n + NODE_SZ_STR(n);
2229 /* Now we can overwrite *n : */
2230 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2231#ifdef DEBUGGING
2232 stop = next - 1;
2233#endif
2234 n = nnext;
2235 if (stopnow) break;
2236 }
2237
d47053eb
RGS
2238#ifdef EXPERIMENTAL_INPLACESCAN
2239 if (flags && !NEXT_OFF(n)) {
2240 DEBUG_PEEP("atch", val, depth);
2241 if (reg_off_by_arg[OP(n)]) {
2242 ARG_SET(n, val - n);
2243 }
2244 else {
2245 NEXT_OFF(n) = val - n;
2246 }
2247 stopnow = 1;
2248 }
07be1b83
YO
2249#endif
2250 }
2251
2252 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2253 /*
2254 Two problematic code points in Unicode casefolding of EXACT nodes:
2255
2256 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2257 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2258
2259 which casefold to
2260
2261 Unicode UTF-8
2262
2263 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2264 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2265
2266 This means that in case-insensitive matching (or "loose matching",
2267 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2268 length of the above casefolded versions) can match a target string
2269 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2270 This would rather mess up the minimum length computation.
2271
2272 What we'll do is to look for the tail four bytes, and then peek
2273 at the preceding two bytes to see whether we need to decrease
2274 the minimum length by four (six minus two).
2275
2276 Thanks to the design of UTF-8, there cannot be false matches:
2277 A sequence of valid UTF-8 bytes cannot be a subsequence of
2278 another valid sequence of UTF-8 bytes.
2279
2280 */
2281 char * const s0 = STRING(scan), *s, *t;
2282 char * const s1 = s0 + STR_LEN(scan) - 1;
2283 char * const s2 = s1 - 4;
e294cc5d
JH
2284#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2285 const char t0[] = "\xaf\x49\xaf\x42";
2286#else
07be1b83 2287 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2288#endif
07be1b83
YO
2289 const char * const t1 = t0 + 3;
2290
2291 for (s = s0 + 2;
2292 s < s2 && (t = ninstr(s, s1, t0, t1));
2293 s = t + 4) {
e294cc5d
JH
2294#ifdef EBCDIC
2295 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2296 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2297#else
07be1b83
YO
2298 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2299 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2300#endif
07be1b83
YO
2301 *min -= 4;
2302 }
2303 }
2304
2305#ifdef DEBUGGING
2306 /* Allow dumping */
2307 n = scan + NODE_SZ_STR(scan);
2308 while (n <= stop) {
2309 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2310 OP(n) = OPTIMIZED;
2311 NEXT_OFF(n) = 0;
2312 }
2313 n++;
2314 }
2315#endif
2316 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2317 return stopnow;
2318}
2319
653099ff
GS
2320/* REx optimizer. Converts nodes into quickier variants "in place".
2321 Finds fixed substrings. */
2322
a0288114 2323/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2324 to the position after last scanned or to NULL. */
2325
40d049e4
YO
2326#define INIT_AND_WITHP \
2327 assert(!and_withp); \
2328 Newx(and_withp,1,struct regnode_charclass_class); \
2329 SAVEFREEPV(and_withp)
07be1b83 2330
b515a41d
YO
2331/* this is a chain of data about sub patterns we are processing that
2332 need to be handled seperately/specially in study_chunk. Its so
2333 we can simulate recursion without losing state. */
2334struct scan_frame;
2335typedef struct scan_frame {
2336 regnode *last; /* last node to process in this frame */
2337 regnode *next; /* next node to process when last is reached */
2338 struct scan_frame *prev; /*previous frame*/
2339 I32 stop; /* what stopparen do we use */
2340} scan_frame;
2341
76e3520e 2342STATIC I32
40d049e4 2343S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2344 I32 *minlenp, I32 *deltap,
40d049e4
YO
2345 regnode *last,
2346 scan_data_t *data,
2347 I32 stopparen,
2348 U8* recursed,
2349 struct regnode_charclass_class *and_withp,
2350 U32 flags, U32 depth)
c277df42
IZ
2351 /* scanp: Start here (read-write). */
2352 /* deltap: Write maxlen-minlen here. */
2353 /* last: Stop before this one. */
40d049e4
YO
2354 /* data: string data about the pattern */
2355 /* stopparen: treat close N as END */
2356 /* recursed: which subroutines have we recursed into */
2357 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2358{
97aff369 2359 dVAR;
c277df42
IZ
2360 I32 min = 0, pars = 0, code;
2361 regnode *scan = *scanp, *next;
2362 I32 delta = 0;
2363 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2364 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2365 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2366 scan_data_t data_fake;
a3621e74 2367 SV *re_trie_maxbuff = NULL;
786e8c11 2368 regnode *first_non_open = scan;
e2e6a0f1 2369 I32 stopmin = I32_MAX;
8aa23a47
YO
2370 scan_frame *frame = NULL;
2371
a3621e74 2372 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2373
13a24bad 2374#ifdef DEBUGGING
40d049e4 2375 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2376#endif
40d049e4 2377
786e8c11 2378 if ( depth == 0 ) {
40d049e4 2379 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2380 first_non_open=regnext(first_non_open);
2381 }
2382
b81d288d 2383
8aa23a47
YO
2384 fake_study_recurse:
2385 while ( scan && OP(scan) != END && scan < last ){
2386 /* Peephole optimizer: */
2387 DEBUG_STUDYDATA(data,depth);
2388 DEBUG_PEEP("Peep",scan,depth);
2389 JOIN_EXACT(scan,&min,0);
2390
2391 /* Follow the next-chain of the current node and optimize
2392 away all the NOTHINGs from it. */
2393 if (OP(scan) != CURLYX) {
2394 const int max = (reg_off_by_arg[OP(scan)]
2395 ? I32_MAX
2396 /* I32 may be smaller than U16 on CRAYs! */
2397 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2398 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2399 int noff;
2400 regnode *n = scan;
2401
2402 /* Skip NOTHING and LONGJMP. */
2403 while ((n = regnext(n))
2404 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2405 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2406 && off + noff < max)
2407 off += noff;
2408 if (reg_off_by_arg[OP(scan)])
2409 ARG(scan) = off;
2410 else
2411 NEXT_OFF(scan) = off;
2412 }
a3621e74 2413
c277df42 2414
8aa23a47
YO
2415
2416 /* The principal pseudo-switch. Cannot be a switch, since we
2417 look into several different things. */
2418 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2419 || OP(scan) == IFTHEN) {
2420 next = regnext(scan);
2421 code = OP(scan);
2422 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2423
2424 if (OP(next) == code || code == IFTHEN) {
2425 /* NOTE - There is similar code to this block below for handling
2426 TRIE nodes on a re-study. If you change stuff here check there
2427 too. */
2428 I32 max1 = 0, min1 = I32_MAX, num = 0;
2429 struct regnode_charclass_class accum;
2430 regnode * const startbranch=scan;
2431
2432 if (flags & SCF_DO_SUBSTR)
2433 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2434 if (flags & SCF_DO_STCLASS)
2435 cl_init_zero(pRExC_state, &accum);
2436
2437 while (OP(scan) == code) {
2438 I32 deltanext, minnext, f = 0, fake;
2439 struct regnode_charclass_class this_class;
2440
2441 num++;
2442 data_fake.flags = 0;
2443 if (data) {
2444 data_fake.whilem_c = data->whilem_c;
2445 data_fake.last_closep = data->last_closep;
2446 }
2447 else
2448 data_fake.last_closep = &fake;
58e23c8d
YO
2449
2450 data_fake.pos_delta = delta;
8aa23a47
YO
2451 next = regnext(scan);
2452 scan = NEXTOPER(scan);
2453 if (code != BRANCH)
c277df42 2454 scan = NEXTOPER(scan);
8aa23a47
YO
2455 if (flags & SCF_DO_STCLASS) {
2456 cl_init(pRExC_state, &this_class);
2457 data_fake.start_class = &this_class;
2458 f = SCF_DO_STCLASS_AND;
58e23c8d 2459 }
8aa23a47
YO
2460 if (flags & SCF_WHILEM_VISITED_POS)
2461 f |= SCF_WHILEM_VISITED_POS;
2462
2463 /* we suppose the run is continuous, last=next...*/
2464 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2465 next, &data_fake,
2466 stopparen, recursed, NULL, f,depth+1);
2467 if (min1 > minnext)
2468 min1 = minnext;
2469 if (max1 < minnext + deltanext)
2470 max1 = minnext + deltanext;
2471 if (deltanext == I32_MAX)
2472 is_inf = is_inf_internal = 1;
2473 scan = next;
2474 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2475 pars++;
2476 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2477 if ( stopmin > minnext)
2478 stopmin = min + min1;
2479 flags &= ~SCF_DO_SUBSTR;
2480 if (data)
2481 data->flags |= SCF_SEEN_ACCEPT;
2482 }
2483 if (data) {
2484 if (data_fake.flags & SF_HAS_EVAL)
2485 data->flags |= SF_HAS_EVAL;
2486 data->whilem_c = data_fake.whilem_c;
3dab1dad 2487 }
8aa23a47
YO
2488 if (flags & SCF_DO_STCLASS)
2489 cl_or(pRExC_state, &accum, &this_class);
2490 }
2491 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2492 min1 = 0;
2493 if (flags & SCF_DO_SUBSTR) {
2494 data->pos_min += min1;
2495 data->pos_delta += max1 - min1;
2496 if (max1 != min1 || is_inf)
2497 data->longest = &(data->longest_float);
2498 }
2499 min += min1;
2500 delta += max1 - min1;
2501 if (flags & SCF_DO_STCLASS_OR) {
2502 cl_or(pRExC_state, data->start_class, &accum);
2503 if (min1) {
2504 cl_and(data->start_class, and_withp);
2505 flags &= ~SCF_DO_STCLASS;
653099ff 2506 }
8aa23a47
YO
2507 }
2508 else if (flags & SCF_DO_STCLASS_AND) {
2509 if (min1) {
2510 cl_and(data->start_class, &accum);
2511 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2512 }
8aa23a47
YO
2513 else {
2514 /* Switch to OR mode: cache the old value of
2515 * data->start_class */
2516 INIT_AND_WITHP;
2517 StructCopy(data->start_class, and_withp,
2518 struct regnode_charclass_class);
2519 flags &= ~SCF_DO_STCLASS_AND;
2520 StructCopy(&accum, data->start_class,
2521 struct regnode_charclass_class);
2522 flags |= SCF_DO_STCLASS_OR;
2523 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2524 }
8aa23a47 2525 }
a3621e74 2526
8aa23a47
YO
2527 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2528 /* demq.
a3621e74 2529
8aa23a47
YO
2530 Assuming this was/is a branch we are dealing with: 'scan' now
2531 points at the item that follows the branch sequence, whatever
2532 it is. We now start at the beginning of the sequence and look
2533 for subsequences of
a3621e74 2534
8aa23a47
YO
2535 BRANCH->EXACT=>x1
2536 BRANCH->EXACT=>x2
2537 tail
a3621e74 2538
8aa23a47 2539 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2540
8aa23a47
YO
2541 If we can find such a subseqence we need to turn the first
2542 element into a trie and then add the subsequent branch exact
2543 strings to the trie.
a3621e74 2544
8aa23a47 2545 We have two cases
a3621e74 2546
8aa23a47 2547 1. patterns where the whole set of branch can be converted.
a3621e74 2548
8aa23a47 2549 2. patterns where only a subset can be converted.
a3621e74 2550
8aa23a47
YO
2551 In case 1 we can replace the whole set with a single regop
2552 for the trie. In case 2 we need to keep the start and end
2553 branchs so
a3621e74 2554
8aa23a47
YO
2555 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2556 becomes BRANCH TRIE; BRANCH X;
786e8c11 2557
8aa23a47
YO
2558 There is an additional case, that being where there is a
2559 common prefix, which gets split out into an EXACT like node
2560 preceding the TRIE node.
a3621e74 2561
8aa23a47
YO
2562 If x(1..n)==tail then we can do a simple trie, if not we make
2563 a "jump" trie, such that when we match the appropriate word
2564 we "jump" to the appopriate tail node. Essentailly we turn
2565 a nested if into a case structure of sorts.
b515a41d 2566
8aa23a47
YO
2567 */
2568
2569 int made=0;
2570 if (!re_trie_maxbuff) {
2571 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2572 if (!SvIOK(re_trie_maxbuff))
2573 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2574 }
2575 if ( SvIV(re_trie_maxbuff)>=0 ) {
2576 regnode *cur;
2577 regnode *first = (regnode *)NULL;
2578 regnode *last = (regnode *)NULL;
2579 regnode *tail = scan;
2580 U8 optype = 0;
2581 U32 count=0;
a3621e74
YO
2582
2583#ifdef DEBUGGING
8aa23a47 2584 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2585#endif
8aa23a47
YO
2586 /* var tail is used because there may be a TAIL
2587 regop in the way. Ie, the exacts will point to the
2588 thing following the TAIL, but the last branch will
2589 point at the TAIL. So we advance tail. If we
2590 have nested (?:) we may have to move through several
2591 tails.
2592 */
2593
2594 while ( OP( tail ) == TAIL ) {
2595 /* this is the TAIL generated by (?:) */
2596 tail = regnext( tail );
2597 }
a3621e74 2598
8aa23a47
YO
2599
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, tail );
2602 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2603 (int)depth * 2 + 2, "",
2604 "Looking for TRIE'able sequences. Tail node is: ",
2605 SvPV_nolen_const( mysv )
2606 );
2607 });
2608
2609 /*
2610
2611 step through the branches, cur represents each
2612 branch, noper is the first thing to be matched
2613 as part of that branch and noper_next is the
2614 regnext() of that node. if noper is an EXACT
2615 and noper_next is the same as scan (our current
2616 position in the regex) then the EXACT branch is
2617 a possible optimization target. Once we have
2618 two or more consequetive such branches we can
2619 create a trie of the EXACT's contents and stich
2620 it in place. If the sequence represents all of
2621 the branches we eliminate the whole thing and
2622 replace it with a single TRIE. If it is a
2623 subsequence then we need to stitch it in. This
2624 means the first branch has to remain, and needs
2625 to be repointed at the item on the branch chain
2626 following the last branch optimized. This could
2627 be either a BRANCH, in which case the
2628 subsequence is internal, or it could be the
2629 item following the branch sequence in which
2630 case the subsequence is at the end.
2631
2632 */
2633
2634 /* dont use tail as the end marker for this traverse */
2635 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2636 regnode * const noper = NEXTOPER( cur );
b515a41d 2637#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2638 regnode * const noper_next = regnext( noper );
b515a41d
YO
2639#endif
2640
8aa23a47
YO
2641 DEBUG_OPTIMISE_r({
2642 regprop(RExC_rx, mysv, cur);
2643 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2644 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2645
2646 regprop(RExC_rx, mysv, noper);
2647 PerlIO_printf( Perl_debug_log, " -> %s",
2648 SvPV_nolen_const(mysv));
2649
2650 if ( noper_next ) {
2651 regprop(RExC_rx, mysv, noper_next );
2652 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2653 SvPV_nolen_const(mysv));
2654 }
2655 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2656 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2657 });
2658 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2659 : PL_regkind[ OP( noper ) ] == EXACT )
2660 || OP(noper) == NOTHING )
786e8c11 2661#ifdef NOJUMPTRIE
8aa23a47 2662 && noper_next == tail
786e8c11 2663#endif
8aa23a47
YO
2664 && count < U16_MAX)
2665 {
2666 count++;
2667 if ( !first || optype == NOTHING ) {
2668 if (!first) first = cur;
2669 optype = OP( noper );
2670 } else {
2671 last = cur;
2672 }
2673 } else {
2674 if ( last ) {
2675 make_trie( pRExC_state,
2676 startbranch, first, cur, tail, count,
2677 optype, depth+1 );
2678 }
2679 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2680#ifdef NOJUMPTRIE
8aa23a47 2681 && noper_next == tail
786e8c11 2682#endif
8aa23a47
YO
2683 ){
2684 count = 1;
2685 first = cur;
2686 optype = OP( noper );
2687 } else {
2688 count = 0;
2689 first = NULL;
2690 optype = 0;
2691 }
2692 last = NULL;
2693 }
2694 }
2695 DEBUG_OPTIMISE_r({
2696 regprop(RExC_rx, mysv, cur);
2697 PerlIO_printf( Perl_debug_log,
2698 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2699 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2700
2701 });
2702 if ( last ) {
2703 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2704#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2705 if ( ((made == MADE_EXACT_TRIE &&
2706 startbranch == first)
2707 || ( first_non_open == first )) &&
2708 depth==0 ) {
2709 flags |= SCF_TRIE_RESTUDY;
2710 if ( startbranch == first
2711 && scan == tail )
2712 {
2713 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2714 }
2715 }
3dab1dad 2716#endif
8aa23a47
YO
2717 }
2718 }
2719
2720 } /* do trie */
2721
653099ff 2722 }
8aa23a47
YO
2723 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2724 scan = NEXTOPER(NEXTOPER(scan));
2725 } else /* single branch is optimized. */
2726 scan = NEXTOPER(scan);
2727 continue;
2728 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2729 scan_frame *newframe = NULL;
2730 I32 paren;
2731 regnode *start;
2732 regnode *end;
2733
2734 if (OP(scan) != SUSPEND) {
2735 /* set the pointer */
2736 if (OP(scan) == GOSUB) {
2737 paren = ARG(scan);
2738 RExC_recurse[ARG2L(scan)] = scan;
2739 start = RExC_open_parens[paren-1];
2740 end = RExC_close_parens[paren-1];
2741 } else {
2742 paren = 0;
f8fc2ecf 2743 start = RExC_rxi->program + 1;
8aa23a47
YO
2744 end = RExC_opend;
2745 }
2746 if (!recursed) {
2747 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2748 SAVEFREEPV(recursed);
2749 }
2750 if (!PAREN_TEST(recursed,paren+1)) {
2751 PAREN_SET(recursed,paren+1);
2752 Newx(newframe,1,scan_frame);
2753 } else {
2754 if (flags & SCF_DO_SUBSTR) {
2755 scan_commit(pRExC_state,data,minlenp);
2756 data->longest = &(data->longest_float);
2757 }
2758 is_inf = is_inf_internal = 1;
2759 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2760 cl_anything(pRExC_state, data->start_class);
2761 flags &= ~SCF_DO_STCLASS;
2762 }
2763 } else {
2764 Newx(newframe,1,scan_frame);
2765 paren = stopparen;
2766 start = scan+2;
2767 end = regnext(scan);
2768 }
2769 if (newframe) {
2770 assert(start);
2771 assert(end);
2772 SAVEFREEPV(newframe);
2773 newframe->next = regnext(scan);
2774 newframe->last = last;
2775 newframe->stop = stopparen;
2776 newframe->prev = frame;
2777
2778 frame = newframe;
2779 scan = start;
2780 stopparen = paren;
2781 last = end;
2782
2783 continue;
2784 }
2785 }
2786 else if (OP(scan) == EXACT) {
2787 I32 l = STR_LEN(scan);
2788 UV uc;
2789 if (UTF) {
2790 const U8 * const s = (U8*)STRING(scan);
2791 l = utf8_length(s, s + l);
2792 uc = utf8_to_uvchr(s, NULL);
2793 } else {
2794 uc = *((U8*)STRING(scan));
2795 }
2796 min += l;
2797 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2798 /* The code below prefers earlier match for fixed
2799 offset, later match for variable offset. */
2800 if (data->last_end == -1) { /* Update the start info. */
2801 data->last_start_min = data->pos_min;
2802 data->last_start_max = is_inf
2803 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2804 }
8aa23a47
YO
2805 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2806 if (UTF)
2807 SvUTF8_on(data->last_found);
2808 {
2809 SV * const sv = data->last_found;
2810 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2811 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2812 if (mg && mg->mg_len >= 0)
2813 mg->mg_len += utf8_length((U8*)STRING(scan),
2814 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2815 }
8aa23a47
YO
2816 data->last_end = data->pos_min + l;
2817 data->pos_min += l; /* As in the first entry. */
2818 data->flags &= ~SF_BEFORE_EOL;
2819 }
2820 if (flags & SCF_DO_STCLASS_AND) {
2821 /* Check whether it is compatible with what we know already! */
2822 int compat = 1;
2823
2824 if (uc >= 0x100 ||
2825 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2826 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2827 && (!(data->start_class->flags & ANYOF_FOLD)
2828 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2829 )
2830 compat = 0;
2831 ANYOF_CLASS_ZERO(data->start_class);
2832 ANYOF_BITMAP_ZERO(data->start_class);
2833 if (compat)
2834 ANYOF_BITMAP_SET(data->start_class, uc);
2835 data->start_class->flags &= ~ANYOF_EOS;
2836 if (uc < 0x100)
2837 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2838 }
2839 else if (flags & SCF_DO_STCLASS_OR) {
2840 /* false positive possible if the class is case-folded */
2841 if (uc < 0x100)
2842 ANYOF_BITMAP_SET(data->start_class, uc);
2843 else
2844 data->start_class->flags |= ANYOF_UNICODE_ALL;
2845 data->start_class->flags &= ~ANYOF_EOS;
2846 cl_and(data->start_class, and_withp);
2847 }
2848 flags &= ~SCF_DO_STCLASS;
2849 }
2850 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2851 I32 l = STR_LEN(scan);
2852 UV uc = *((U8*)STRING(scan));
2853
2854 /* Search for fixed substrings supports EXACT only. */
2855 if (flags & SCF_DO_SUBSTR) {
2856 assert(data);
2857 scan_commit(pRExC_state, data, minlenp);
2858 }
2859 if (UTF) {
2860 const U8 * const s = (U8 *)STRING(scan);
2861 l = utf8_length(s, s + l);
2862 uc = utf8_to_uvchr(s, NULL);
2863 }
2864 min += l;
2865 if (flags & SCF_DO_SUBSTR)
2866 data->pos_min += l;
2867 if (flags & SCF_DO_STCLASS_AND) {
2868 /* Check whether it is compatible with what we know already! */
2869 int compat = 1;
2870
2871 if (uc >= 0x100 ||
2872 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2873 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2874 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2875 compat = 0;
2876 ANYOF_CLASS_ZERO(data->start_class);
2877 ANYOF_BITMAP_ZERO(data->start_class);
2878 if (compat) {
2879 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2880 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2881 data->start_class->flags |= ANYOF_FOLD;
2882 if (OP(scan) == EXACTFL)
2883 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2884 }
8aa23a47
YO
2885 }
2886 else if (flags & SCF_DO_STCLASS_OR) {
2887 if (data->start_class->flags & ANYOF_FOLD) {
2888 /* false positive possible if the class is case-folded.
2889 Assume that the locale settings are the same... */
1aa99e6b
IH
2890 if (uc < 0x100)
2891 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2892 data->start_class->flags &= ~ANYOF_EOS;
2893 }
8aa23a47 2894 cl_and(data->start_class, and_withp);
653099ff 2895 }
8aa23a47
YO
2896 flags &= ~SCF_DO_STCLASS;
2897 }
2898 else if (strchr((const char*)PL_varies,OP(scan))) {
2899 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2900 I32 f = flags, pos_before = 0;
2901 regnode * const oscan = scan;
2902 struct regnode_charclass_class this_class;
2903 struct regnode_charclass_class *oclass = NULL;
2904 I32 next_is_eval = 0;
2905
2906 switch (PL_regkind[OP(scan)]) {
2907 case WHILEM: /* End of (?:...)* . */
2908 scan = NEXTOPER(scan);
2909 goto finish;
2910 case PLUS:
2911 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2912 next = NEXTOPER(scan);
2913 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2914 mincount = 1;
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2918 goto do_curly;
2919 }
2920 }
2921 if (flags & SCF_DO_SUBSTR)
2922 data->pos_min++;
2923 min++;
2924 /* Fall through. */
2925 case STAR:
2926 if (flags & SCF_DO_STCLASS) {
2927 mincount = 0;
2928 maxcount = REG_INFTY;
2929 next = regnext(scan);
2930 scan = NEXTOPER(scan);
2931 goto do_curly;
2932 }
2933 is_inf = is_inf_internal = 1;
2934 scan = regnext(scan);
c277df42 2935 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2936 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2937 data->longest = &(data->longest_float);
c277df42 2938 }
8aa23a47
YO
2939 goto optimize_curly_tail;
2940 case CURLY:
2941 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2942 && (scan->flags == stopparen))
2943 {
2944 mincount = 1;
2945 maxcount = 1;
2946 } else {
2947 mincount = ARG1(scan);
2948 maxcount = ARG2(scan);
653099ff 2949 }
8aa23a47
YO
2950 next = regnext(scan);
2951 if (OP(scan) == CURLYX) {
2952 I32 lp = (data ? *(data->last_closep) : 0);
2953 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2954 }
8aa23a47
YO
2955 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2956 next_is_eval = (OP(scan) == EVAL);
2957 do_curly:
2958 if (flags & SCF_DO_SUBSTR) {
2959 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2960 pos_before = data->pos_min;
b45f050a 2961 }
8aa23a47
YO
2962 if (data) {
2963 fl = data->flags;
2964 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2965 if (is_inf)
2966 data->flags |= SF_IS_INF;
2967 }
2968 if (flags & SCF_DO_STCLASS) {
2969 cl_init(pRExC_state, &this_class);
2970 oclass = data->start_class;
2971 data->start_class = &this_class;
2972 f |= SCF_DO_STCLASS_AND;
2973 f &= ~SCF_DO_STCLASS_OR;
2974 }
2975 /* These are the cases when once a subexpression
2976 fails at a particular position, it cannot succeed
2977 even after backtracking at the enclosing scope.
2978
2979 XXXX what if minimal match and we are at the
2980 initial run of {n,m}? */
2981 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2982 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2983
8aa23a47
YO
2984 /* This will finish on WHILEM, setting scan, or on NULL: */
2985 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2986 last, data, stopparen, recursed, NULL,
2987 (mincount == 0
2988 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2989
8aa23a47
YO
2990 if (flags & SCF_DO_STCLASS)
2991 data->start_class = oclass;
2992 if (mincount == 0 || minnext == 0) {
2993 if (flags & SCF_DO_STCLASS_OR) {
2994 cl_or(pRExC_state, data->start_class, &this_class);
2995 }
2996 else if (flags & SCF_DO_STCLASS_AND) {
2997 /* Switch to OR mode: cache the old value of
2998 * data->start_class */
2999 INIT_AND_WITHP;
3000 StructCopy(data->start_class, and_withp,
3001 struct regnode_charclass_class);
3002 flags &= ~SCF_DO_STCLASS_AND;
3003 StructCopy(&this_class, data->start_class,
3004 struct regnode_charclass_class);
3005 flags |= SCF_DO_STCLASS_OR;
3006 data->start_class->flags |= ANYOF_EOS;
3007 }
3008 } else { /* Non-zero len */
3009 if (flags & SCF_DO_STCLASS_OR) {
3010 cl_or(pRExC_state, data->start_class, &this_class);
3011 cl_and(data->start_class, and_withp);
3012 }
3013 else if (flags & SCF_DO_STCLASS_AND)
3014 cl_and(data->start_class, &this_class);
3015 flags &= ~SCF_DO_STCLASS;
3016 }
3017 if (!scan) /* It was not CURLYX, but CURLY. */
3018 scan = next;
3019 if ( /* ? quantifier ok, except for (?{ ... }) */
3020 (next_is_eval || !(mincount == 0 && maxcount == 1))
3021 && (minnext == 0) && (deltanext == 0)
3022 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3023 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3024 && ckWARN(WARN_REGEXP))
3025 {
3026 vWARN(RExC_parse,
3027 "Quantifier unexpected on zero-length expression");
3028 }
3029
3030 min += minnext * mincount;
3031 is_inf_internal |= ((maxcount == REG_INFTY
3032 && (minnext + deltanext) > 0)
3033 || deltanext == I32_MAX);
3034 is_inf |= is_inf_internal;
3035 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3036
3037 /* Try powerful optimization CURLYX => CURLYN. */
3038 if ( OP(oscan) == CURLYX && data
3039 && data->flags & SF_IN_PAR
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext && minnext == 1 ) {
3042 /* Try to optimize to CURLYN. */
3043 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3044 regnode * const nxt1 = nxt;
497b47a8 3045#ifdef DEBUGGING
8aa23a47 3046 regnode *nxt2;
497b47a8 3047#endif
c277df42 3048
8aa23a47
YO
3049 /* Skip open. */
3050 nxt = regnext(nxt);
3051 if (!strchr((const char*)PL_simple,OP(nxt))
3052 && !(PL_regkind[OP(nxt)] == EXACT
3053 && STR_LEN(nxt) == 1))
3054 goto nogo;
497b47a8 3055#ifdef DEBUGGING
8aa23a47 3056 nxt2 = nxt;
497b47a8 3057#endif
8aa23a47
YO
3058 nxt = regnext(nxt);
3059 if (OP(nxt) != CLOSE)
3060 goto nogo;
3061 if (RExC_open_parens) {
3062 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3063 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3064 }
3065 /* Now we know that nxt2 is the only contents: */
3066 oscan->flags = (U8)ARG(nxt);
3067 OP(oscan) = CURLYN;
3068 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3069
c277df42 3070#ifdef DEBUGGING
8aa23a47
YO
3071 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3074 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3075 OP(nxt + 1) = OPTIMIZED; /* was count. */
3076 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3077#endif
8aa23a47
YO
3078 }
3079 nogo:
3080
3081 /* Try optimization CURLYX => CURLYM. */
3082 if ( OP(oscan) == CURLYX && data
3083 && !(data->flags & SF_HAS_PAR)
3084 && !(data->flags & SF_HAS_EVAL)
3085 && !deltanext /* atom is fixed width */
3086 && minnext != 0 /* CURLYM can't handle zero width */
3087 ) {
3088 /* XXXX How to optimize if data == 0? */
3089 /* Optimize to a simpler form. */
3090 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3091 regnode *nxt2;
3092
3093 OP(oscan) = CURLYM;
3094 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3095 && (OP(nxt2) != WHILEM))
3096 nxt = nxt2;
3097 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3098 /* Need to optimize away parenths. */
3099 if (data->flags & SF_IN_PAR) {
3100 /* Set the parenth number. */
3101 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3102
3103 if (OP(nxt) != CLOSE)
3104 FAIL("Panic opt close");
3105 oscan->flags = (U8)ARG(nxt);
3106 if (RExC_open_parens) {
3107 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3108 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3109 }
8aa23a47
YO
3110 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3111 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3112
c277df42 3113#ifdef DEBUGGING
8aa23a47
YO
3114 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3115 OP(nxt + 1) = OPTIMIZED; /* was count. */
3116 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3117 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3118#endif
c277df42 3119#if 0
8aa23a47
YO
3120 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3121 regnode *nnxt = regnext(nxt1);
3122
3123 if (nnxt == nxt) {
3124 if (reg_off_by_arg[OP(nxt1)])
3125 ARG_SET(nxt1, nxt2 - nxt1);
3126 else if (nxt2 - nxt1 < U16_MAX)
3127 NEXT_OFF(nxt1) = nxt2 - nxt1;
3128 else
3129 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3130 }
8aa23a47 3131 nxt1 = nnxt;
c277df42 3132 }
5d1c421c 3133#endif
8aa23a47
YO
3134 /* Optimize again: */
3135 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3136 NULL, stopparen, recursed, NULL, 0,depth+1);
3137 }
3138 else
3139 oscan->flags = 0;
3140 }
3141 else if ((OP(oscan) == CURLYX)
3142 && (flags & SCF_WHILEM_VISITED_POS)
3143 /* See the comment on a similar expression above.
3144 However, this time it not a subexpression
3145 we care about, but the expression itself. */
3146 && (maxcount == REG_INFTY)
3147 && data && ++data->whilem_c < 16) {
3148 /* This stays as CURLYX, we can put the count/of pair. */
3149 /* Find WHILEM (as in regexec.c) */
3150 regnode *nxt = oscan + NEXT_OFF(oscan);
3151
3152 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3153 nxt += ARG(nxt);
3154 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3155 | (RExC_whilem_seen << 4)); /* On WHILEM */
3156 }
3157 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3158 pars++;
3159 if (flags & SCF_DO_SUBSTR) {
3160 SV *last_str = NULL;
3161 int counted = mincount != 0;
a0ed51b3 3162
8aa23a47
YO
3163 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3164#if defined(SPARC64_GCC_WORKAROUND)
3165 I32 b = 0;
3166 STRLEN l = 0;
3167 const char *s = NULL;
3168 I32 old = 0;
b515a41d 3169
8aa23a47
YO
3170 if (pos_before >= data->last_start_min)
3171 b = pos_before;
3172 else
3173 b = data->last_start_min;
b515a41d 3174
8aa23a47
YO
3175 l = 0;
3176 s = SvPV_const(data->last_found, l);
3177 old = b - data->last_start_min;
3178
3179#else
3180 I32 b = pos_before >= data->last_start_min
3181 ? pos_before : data->last_start_min;
3182 STRLEN l;
3183 const char * const s = SvPV_const(data->last_found, l);
3184 I32 old = b - data->last_start_min;
3185#endif
3186
3187 if (UTF)
3188 old = utf8_hop((U8*)s, old) - (U8*)s;
3189
3190 l -= old;
3191 /* Get the added string: */
3192 last_str = newSVpvn(s + old, l);
3193 if (UTF)
3194 SvUTF8_on(last_str);
3195 if (deltanext == 0 && pos_before == b) {
3196 /* What was added is a constant string */
3197 if (mincount > 1) {
3198 SvGROW(last_str, (mincount * l) + 1);
3199 repeatcpy(SvPVX(last_str) + l,
3200 SvPVX_const(last_str), l, mincount - 1);
3201 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3202 /* Add additional parts. */
3203 SvCUR_set(data->last_found,
3204 SvCUR(data->last_found) - l);
3205 sv_catsv(data->last_found, last_str);
3206 {
3207 SV * sv = data->last_found;
3208 MAGIC *mg =
3209 SvUTF8(sv) && SvMAGICAL(sv) ?
3210 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3211 if (mg && mg->mg_len >= 0)
3212 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3213 }
8aa23a47 3214 data->last_end += l * (mincount - 1);
b515a41d 3215 }
8aa23a47
YO
3216 } else {
3217 /* start offset must point into the last copy */
3218 data->last_start_min += minnext * (mincount - 1);
3219 data->last_start_max += is_inf ? I32_MAX
3220 : (maxcount - 1) * (minnext + data->pos_delta);
3221 }
c277df42 3222 }
8aa23a47
YO
3223 /* It is counted once already... */
3224 data->pos_min += minnext * (mincount - counted);
3225 data->pos_delta += - counted * deltanext +
3226 (minnext + deltanext) * maxcount - minnext * mincount;
3227 if (mincount != maxcount) {
3228 /* Cannot extend fixed substrings found inside
3229 the group. */
3230 scan_commit(pRExC_state,data,minlenp);
3231 if (mincount && last_str) {
3232 SV * const sv = data->last_found;
3233 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3234 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3235
3236 if (mg)
3237 mg->mg_len = -1;
3238 sv_setsv(sv, last_str);
3239 data->last_end = data->pos_min;
3240 data->last_start_min =
3241 data->pos_min - CHR_SVLEN(last_str);
3242 data->last_start_max = is_inf
3243 ? I32_MAX
3244 : data->pos_min + data->pos_delta
3245 - CHR_SVLEN(last_str);
3246 }
3247 data->longest = &(data->longest_float);
3248 }
3249 SvREFCNT_dec(last_str);
c277df42 3250 }
8aa23a47
YO
3251 if (data && (fl & SF_HAS_EVAL))
3252 data->flags |= SF_HAS_EVAL;
3253 optimize_curly_tail:
3254 if (OP(oscan) != CURLYX) {
3255 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3256 && NEXT_OFF(next))
3257 NEXT_OFF(oscan) += NEXT_OFF(next);
3258 }
3259 continue;
3260 default: /* REF and CLUMP only? */
3261 if (flags & SCF_DO_SUBSTR) {
3262 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3263 data->longest = &(data->longest_float);
3264 }
3265 is_inf = is_inf_internal = 1;
3266 if (flags & SCF_DO_STCLASS_OR)
3267 cl_anything(pRExC_state, data->start_class);
3268 flags &= ~SCF_DO_STCLASS;
3269 break;
c277df42 3270 }
8aa23a47
YO
3271 }
3272 else if (strchr((const char*)PL_simple,OP(scan))) {
3273 int value = 0;
653099ff 3274
8aa23a47
YO
3275 if (flags & SCF_DO_SUBSTR) {
3276 scan_commit(pRExC_state,data,minlenp);
3277 data->pos_min++;
3278 }
3279 min++;
3280 if (flags & SCF_DO_STCLASS) {
3281 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3282
8aa23a47
YO
3283 /* Some of the logic below assumes that switching
3284 locale on will only add false positives. */
3285 switch (PL_regkind[OP(scan)]) {
3286 case SANY:
3287 default:
3288 do_default:
3289 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3290 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3291 cl_anything(pRExC_state, data->start_class);
3292 break;
3293 case REG_ANY:
3294 if (OP(scan) == SANY)
3295 goto do_default;
3296 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3297 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3298 || (data->start_class->flags & ANYOF_CLASS));
3299 cl_anything(pRExC_state, data->start_class);
653099ff 3300 }
8aa23a47
YO
3301 if (flags & SCF_DO_STCLASS_AND || !value)
3302 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3303 break;
3304 case ANYOF:
3305 if (flags & SCF_DO_STCLASS_AND)
3306 cl_and(data->start_class,
3307 (struct regnode_charclass_class*)scan);
653099ff 3308 else
8aa23a47
YO
3309 cl_or(pRExC_state, data->start_class,
3310 (struct regnode_charclass_class*)scan);
3311 break;
3312 case ALNUM:
3313 if (flags & SCF_DO_STCLASS_AND) {
3314 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3315 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3316 for (value = 0; value < 256; value++)
3317 if (!isALNUM(value))
3318 ANYOF_BITMAP_CLEAR(data->start_class, value);
3319 }
653099ff 3320 }
8aa23a47
YO
3321 else {
3322 if (data->start_class->flags & ANYOF_LOCALE)
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3324 else {
3325 for (value = 0; value < 256; value++)
3326 if (isALNUM(value))
3327 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3328 }
8aa23a47
YO
3329 }
3330 break;
3331 case ALNUML:
3332 if (flags & SCF_DO_STCLASS_AND) {
3333 if (data->start_class->flags & ANYOF_LOCALE)
3334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3335 }
3336 else {
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3338 data->start_class->flags |= ANYOF_LOCALE;
3339 }
3340 break;
3341 case NALNUM:
3342 if (flags & SCF_DO_STCLASS_AND) {
3343 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3344 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3345 for (value = 0; value < 256; value++)
3346 if (isALNUM(value))
3347 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3348 }
3349 }
8aa23a47
YO
3350 else {
3351 if (data->start_class->flags & ANYOF_LOCALE)
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3353 else {
3354 for (value = 0; value < 256; value++)
3355 if (!isALNUM(value))
3356 ANYOF_BITMAP_SET(data->start_class, value);
3357 }
653099ff 3358 }
8aa23a47
YO
3359 break;
3360 case NALNUML:
3361 if (flags & SCF_DO_STCLASS_AND) {
3362 if (data->start_class->flags & ANYOF_LOCALE)
3363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3364 }
8aa23a47
YO
3365 else {
3366 data->start_class->flags |= ANYOF_LOCALE;
3367 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3368 }
3369 break;
3370 case SPACE:
3371 if (flags & SCF_DO_STCLASS_AND) {
3372 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3373 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3374 for (value = 0; value < 256; value++)
3375 if (!isSPACE(value))
3376 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3377 }
3378 }
8aa23a47
YO
3379 else {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3382 else {
3383 for (value = 0; value < 256; value++)
3384 if (isSPACE(value))
3385 ANYOF_BITMAP_SET(data->start_class, value);
3386 }
653099ff 3387 }
8aa23a47
YO
3388 break;
3389 case SPACEL:
3390 if (flags & SCF_DO_STCLASS_AND) {
3391 if (data->start_class->flags & ANYOF_LOCALE)
3392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3393 }
3394 else {
3395 data->start_class->flags |= ANYOF_LOCALE;
3396 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3397 }
3398 break;
3399 case NSPACE:
3400 if (flags & SCF_DO_STCLASS_AND) {
3401 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3402 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3403 for (value = 0; value < 256; value++)
3404 if (isSPACE(value))
3405 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3406 }
8aa23a47
YO
3407 }
3408 else {
3409 if (data->start_class->flags & ANYOF_LOCALE)
3410 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3411 else {
3412 for (value = 0; value < 256; value++)
3413 if (!isSPACE(value))
3414 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3415 }
3416 }
8aa23a47
YO
3417 break;
3418 case NSPACEL:
3419 if (flags & SCF_DO_STCLASS_AND) {
3420 if (data->start_class->flags & ANYOF_LOCALE) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3422 for (value = 0; value < 256; value++)
3423 if (!isSPACE(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425 }
653099ff 3426 }
8aa23a47
YO
3427 else {
3428 data->start_class->flags |= ANYOF_LOCALE;
3429 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3430 }
3431 break;
3432 case DIGIT:
3433 if (flags & SCF_DO_STCLASS_AND) {
3434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3435 for (value = 0; value < 256; value++)
3436 if (!isDIGIT(value))
3437 ANYOF_BITMAP_CLEAR(data->start_class, value);
3438 }
3439 else {
3440 if (data->start_class->flags & ANYOF_LOCALE)
3441 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3442 else {
3443 for (value = 0; value < 256; value++)
3444 if (isDIGIT(value))
3445 ANYOF_BITMAP_SET(data->start_class, value);
3446 }
3447 }
3448 break;
3449 case NDIGIT:
3450 if (flags & SCF_DO_STCLASS_AND) {
3451 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3452 for (value = 0; value < 256; value++)
3453 if (isDIGIT(value))
3454 ANYOF_BITMAP_CLEAR(data->start_class, value);
3455 }
3456 else {
3457 if (data->start_class->flags & ANYOF_LOCALE)
3458 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3459 else {
3460 for (value = 0; value < 256; value++)
3461 if (!isDIGIT(value))
3462 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3463 }
3464 }
8aa23a47
YO
3465 break;
3466 }
3467 if (flags & SCF_DO_STCLASS_OR)
3468 cl_and(data->start_class, and_withp);
3469 flags &= ~SCF_DO_STCLASS;
3470 }
3471 }
3472 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3473 data->flags |= (OP(scan) == MEOL
3474 ? SF_BEFORE_MEOL
3475 : SF_BEFORE_SEOL);
3476 }
3477 else if ( PL_regkind[OP(scan)] == BRANCHJ
3478 /* Lookbehind, or need to calculate parens/evals/stclass: */
3479 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3480 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3481 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3482 || OP(scan) == UNLESSM )
3483 {
3484 /* Negative Lookahead/lookbehind
3485 In this case we can't do fixed string optimisation.
3486 */
1de06328 3487
8aa23a47
YO
3488 I32 deltanext, minnext, fake = 0;
3489 regnode *nscan;
3490 struct regnode_charclass_class intrnl;
3491 int f = 0;
1de06328 3492
8aa23a47
YO
3493 data_fake.flags = 0;
3494 if (data) {
3495 data_fake.whilem_c = data->whilem_c;
3496 data_fake.last_closep = data->last_closep;
c277df42 3497 }
8aa23a47
YO
3498 else
3499 data_fake.last_closep = &fake;
58e23c8d 3500 data_fake.pos_delta = delta;
8aa23a47
YO
3501 if ( flags & SCF_DO_STCLASS && !scan->flags
3502 && OP(scan) == IFMATCH ) { /* Lookahead */
3503 cl_init(pRExC_state, &intrnl);
3504 data_fake.start_class = &intrnl;
3505 f |= SCF_DO_STCLASS_AND;
3506 }
3507 if (flags & SCF_WHILEM_VISITED_POS)
3508 f |= SCF_WHILEM_VISITED_POS;
3509 next = regnext(scan);
3510 nscan = NEXTOPER(NEXTOPER(scan));
3511 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3512 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3513 if (scan->flags) {
3514 if (deltanext) {
58e23c8d 3515 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3516 }
3517 else if (minnext > (I32)U8_MAX) {
58e23c8d 3518 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3519 }
3520 scan->flags = (U8)minnext;
3521 }
3522 if (data) {
3523 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3524 pars++;
3525 if (data_fake.flags & SF_HAS_EVAL)
3526 data->flags |= SF_HAS_EVAL;
3527 data->whilem_c = data_fake.whilem_c;
3528 }
3529 if (f & SCF_DO_STCLASS_AND) {
3530 const int was = (data->start_class->flags & ANYOF_EOS);
3531
3532 cl_and(data->start_class, &intrnl);
3533 if (was)
3534 data->start_class->flags |= ANYOF_EOS;
3535 }
cb434fcc 3536 }
8aa23a47
YO
3537#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3538 else {
3539 /* Positive Lookahead/lookbehind
3540 In this case we can do fixed string optimisation,
3541 but we must be careful about it. Note in the case of
3542 lookbehind the positions will be offset by the minimum
3543 length of the pattern, something we won't know about
3544 until after the recurse.
3545 */
3546 I32 deltanext, fake = 0;
3547 regnode *nscan;
3548 struct regnode_charclass_class intrnl;
3549 int f = 0;
3550 /* We use SAVEFREEPV so that when the full compile
3551 is finished perl will clean up the allocated
3552 minlens when its all done. This was we don't
3553 have to worry about freeing them when we know
3554 they wont be used, which would be a pain.
3555 */
3556 I32 *minnextp;
3557 Newx( minnextp, 1, I32 );
3558 SAVEFREEPV(minnextp);
3559
3560 if (data) {
3561 StructCopy(data, &data_fake, scan_data_t);
3562 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3563 f |= SCF_DO_SUBSTR;
3564 if (scan->flags)
3565 scan_commit(pRExC_state, &data_fake,minlenp);
3566 data_fake.last_found=newSVsv(data->last_found);
3567 }
3568 }
3569 else
3570 data_fake.last_closep = &fake;
3571 data_fake.flags = 0;
58e23c8d 3572 data_fake.pos_delta = delta;
8aa23a47
YO
3573 if (is_inf)
3574 data_fake.flags |= SF_IS_INF;
3575 if ( flags & SCF_DO_STCLASS && !scan->flags
3576 && OP(scan) == IFMATCH ) { /* Lookahead */
3577 cl_init(pRExC_state, &intrnl);
3578 data_fake.start_class = &intrnl;
3579 f |= SCF_DO_STCLASS_AND;
3580 }
3581 if (flags & SCF_WHILEM_VISITED_POS)
3582 f |= SCF_WHILEM_VISITED_POS;
3583 next = regnext(scan);
3584 nscan = NEXTOPER(NEXTOPER(scan));
3585
3586 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3587 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3588 if (scan->flags) {
3589 if (deltanext) {
58e23c8d 3590 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3591 }
3592 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3593 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3594 }
3595 scan->flags = (U8)*minnextp;
3596 }
3597
3598 *minnextp += min;
3599
3600 if (f & SCF_DO_STCLASS_AND) {
3601 const int was = (data->start_class->flags & ANYOF_EOS);
3602
3603 cl_and(data->start_class, &intrnl);
3604 if (was)
3605 data->start_class->flags |= ANYOF_EOS;
3606 }
3607 if (data) {
3608 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3609 pars++;
3610 if (data_fake.flags & SF_HAS_EVAL)
3611 data->flags |= SF_HAS_EVAL;
3612 data->whilem_c = data_fake.whilem_c;
3613 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3614 if (RExC_rx->minlen<*minnextp)
3615 RExC_rx->minlen=*minnextp;
3616 scan_commit(pRExC_state, &data_fake, minnextp);
3617 SvREFCNT_dec(data_fake.last_found);
3618
3619 if ( data_fake.minlen_fixed != minlenp )
3620 {
3621 data->offset_fixed= data_fake.offset_fixed;
3622 data->minlen_fixed= data_fake.minlen_fixed;
3623 data->lookbehind_fixed+= scan->flags;
3624 }
3625 if ( data_fake.minlen_float != minlenp )
3626 {
3627 data->minlen_float= data_fake.minlen_float;
3628 data->offset_float_min=data_fake.offset_float_min;
3629 data->offset_float_max=data_fake.offset_float_max;
3630 data->lookbehind_float+= scan->flags;
3631 }
3632 }
3633 }
3634
3635
40d049e4 3636 }
8aa23a47
YO
3637#endif
3638 }
3639 else if (OP(scan) == OPEN) {
3640 if (stopparen != (I32)ARG(scan))
3641 pars++;
3642 }
3643 else if (OP(scan) == CLOSE) {
3644 if (stopparen == (I32)ARG(scan)) {
3645 break;
3646 }
3647 if ((I32)ARG(scan) == is_par) {
3648 next = regnext(scan);
b515a41d 3649
8aa23a47
YO
3650 if ( next && (OP(next) != WHILEM) && next < last)
3651 is_par = 0; /* Disable optimization */
40d049e4 3652 }
8aa23a47
YO
3653 if (data)
3654 *(data->last_closep) = ARG(scan);
3655 }
3656 else if (OP(scan) == EVAL) {
c277df42
IZ
3657 if (data)
3658 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3659 }
3660 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3661 if (flags & SCF_DO_SUBSTR) {
3662 scan_commit(pRExC_state,data,minlenp);
3663 flags &= ~SCF_DO_SUBSTR;
40d049e4 3664 }
8aa23a47
YO
3665 if (data && OP(scan)==ACCEPT) {
3666 data->flags |= SCF_SEEN_ACCEPT;
3667 if (stopmin > min)
3668 stopmin = min;
e2e6a0f1 3669 }
8aa23a47
YO
3670 }
3671 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3672 {
0f5d15d6 3673 if (flags & SCF_DO_SUBSTR) {
1de06328 3674 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3675 data->longest = &(data->longest_float);
3676 }
3677 is_inf = is_inf_internal = 1;
653099ff 3678 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3679 cl_anything(pRExC_state, data->start_class);
96776eda 3680 flags &= ~SCF_DO_STCLASS;
8aa23a47 3681 }
58e23c8d 3682 else if (OP(scan) == GPOS) {
bbe252da 3683 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3684 !(delta || is_inf || (data && data->pos_delta)))
3685 {
bbe252da
YO
3686 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3687 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3688 if (RExC_rx->gofs < (U32)min)
3689 RExC_rx->gofs = min;
3690 } else {
bbe252da 3691 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3692 RExC_rx->gofs = 0;
3693 }
3694 }
786e8c11 3695#ifdef TRIE_STUDY_OPT
40d049e4 3696#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3697 else if (PL_regkind[OP(scan)] == TRIE) {
3698 /* NOTE - There is similar code to this block above for handling
3699 BRANCH nodes on the initial study. If you change stuff here
3700 check there too. */
3701 regnode *trie_node= scan;
3702 regnode *tail= regnext(scan);
f8fc2ecf 3703 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3704 I32 max1 = 0, min1 = I32_MAX;
3705 struct regnode_charclass_class accum;
3706
3707 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3708 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3709 if (flags & SCF_DO_STCLASS)
3710 cl_init_zero(pRExC_state, &accum);
3711
3712 if (!trie->jump) {
3713 min1= trie->minlen;
3714 max1= trie->maxlen;
3715 } else {
3716 const regnode *nextbranch= NULL;
3717 U32 word;
3718
3719 for ( word=1 ; word <= trie->wordcount ; word++)
3720 {
3721 I32 deltanext=0, minnext=0, f = 0, fake;
3722 struct regnode_charclass_class this_class;
3723
3724 data_fake.flags = 0;
3725 if (data) {
3726 data_fake.whilem_c = data->whilem_c;
3727 data_fake.last_closep = data->last_closep;
3728 }
3729 else
3730 data_fake.last_closep = &fake;
58e23c8d 3731 data_fake.pos_delta = delta;
8aa23a47
YO
3732 if (flags & SCF_DO_STCLASS) {
3733 cl_init(pRExC_state, &this_class);
3734 data_fake.start_class = &this_class;
3735 f = SCF_DO_STCLASS_AND;
3736 }
3737 if (flags & SCF_WHILEM_VISITED_POS)
3738 f |= SCF_WHILEM_VISITED_POS;
3739
3740 if (trie->jump[word]) {
3741 if (!nextbranch)
3742 nextbranch = trie_node + trie->jump[0];
3743 scan= trie_node + trie->jump[word];
3744 /* We go from the jump point to the branch that follows
3745 it. Note this means we need the vestigal unused branches
3746 even though they arent otherwise used.
3747 */
3748 minnext = study_chunk(pRExC_state, &scan, minlenp,
3749 &deltanext, (regnode *)nextbranch, &data_fake,
3750 stopparen, recursed, NULL, f,depth+1);
3751 }
3752 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3753 nextbranch= regnext((regnode*)nextbranch);
3754
3755 if (min1 > (I32)(minnext + trie->minlen))
3756 min1 = minnext + trie->minlen;
3757 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3758 max1 = minnext + deltanext + trie->maxlen;
3759 if (deltanext == I32_MAX)
3760 is_inf = is_inf_internal = 1;
3761
3762 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3763 pars++;
3764 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3765 if ( stopmin > min + min1)
3766 stopmin = min + min1;
3767 flags &= ~SCF_DO_SUBSTR;
3768 if (data)
3769 data->flags |= SCF_SEEN_ACCEPT;
3770 }
3771 if (data) {
3772 if (data_fake.flags & SF_HAS_EVAL)
3773 data->flags |= SF_HAS_EVAL;
3774 data->whilem_c = data_fake.whilem_c;
3775 }
3776 if (flags & SCF_DO_STCLASS)
3777 cl_or(pRExC_state, &accum, &this_class);
3778 }
3779 }
3780 if (flags & SCF_DO_SUBSTR) {
3781 data->pos_min += min1;
3782 data->pos_delta += max1 - min1;
3783 if (max1 != min1 || is_inf)
3784 data->longest = &(data->longest_float);
3785 }
3786 min += min1;
3787 delta += max1 - min1;
3788 if (flags & SCF_DO_STCLASS_OR) {
3789 cl_or(pRExC_state, data->start_class, &accum);
3790 if (min1) {
3791 cl_and(data->start_class, and_withp);
3792 flags &= ~SCF_DO_STCLASS;
3793 }
3794 }
3795 else if (flags & SCF_DO_STCLASS_AND) {
3796 if (min1) {
3797 cl_and(data->start_class, &accum);
3798 flags &= ~SCF_DO_STCLASS;
3799 }
3800 else {
3801 /* Switch to OR mode: cache the old value of
3802 * data->start_class */
3803 INIT_AND_WITHP;
3804 StructCopy(data->start_class, and_withp,
3805 struct regnode_charclass_class);
3806 flags &= ~SCF_DO_STCLASS_AND;
3807 StructCopy(&accum, data->start_class,
3808 struct regnode_charclass_class);
3809 flags |= SCF_DO_STCLASS_OR;
3810 data->start_class->flags |= ANYOF_EOS;
3811 }
3812 }
3813 scan= tail;
3814 continue;
3815 }
786e8c11 3816#else
8aa23a47 3817 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3818 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3819 U8*bang=NULL;
3820
3821 min += trie->minlen;
3822 delta += (trie->maxlen - trie->minlen);
3823 flags &= ~SCF_DO_STCLASS; /* xxx */
3824 if (flags & SCF_DO_SUBSTR) {
3825 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3826 data->pos_min += trie->minlen;
3827 data->pos_delta += (trie->maxlen - trie->minlen);
3828 if (trie->maxlen != trie->minlen)
3829 data->longest = &(data->longest_float);
3830 }
3831 if (trie->jump) /* no more substrings -- for now /grr*/
3832 flags &= ~SCF_DO_SUBSTR;
b515a41d 3833 }
8aa23a47
YO
3834#endif /* old or new */
3835#endif /* TRIE_STUDY_OPT */
3836 /* Else: zero-length, ignore. */
3837 scan = regnext(scan);
3838 }
3839 if (frame) {
3840 last = frame->last;
3841 scan = frame->next;
3842 stopparen = frame->stop;
3843 frame = frame->prev;
3844 goto fake_study_recurse;
c277df42
IZ
3845 }
3846
3847 finish:
8aa23a47
YO
3848 assert(!frame);
3849
c277df42 3850 *scanp = scan;
aca2d497 3851 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3852 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3853 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3854 if (is_par > (I32)U8_MAX)
c277df42
IZ
3855 is_par = 0;
3856 if (is_par && pars==1 && data) {
3857 data->flags |= SF_IN_PAR;
3858 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3859 }
3860 else if (pars && data) {
c277df42
IZ
3861 data->flags |= SF_HAS_PAR;
3862 data->flags &= ~SF_IN_PAR;
3863 }
653099ff 3864 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3865 cl_and(data->start_class, and_withp);
786e8c11
YO
3866 if (flags & SCF_TRIE_RESTUDY)
3867 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3868
3869 DEBUG_STUDYDATA(data,depth);
3870
e2e6a0f1 3871 return min < stopmin ? min : stopmin;
c277df42
IZ
3872}
3873
2eccd3b2
NC
3874STATIC U32
3875S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3876{
4a4e7719
NC
3877 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3878
3879 Renewc(RExC_rxi->data,
3880 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3881 char, struct reg_data);
3882 if(count)
f8fc2ecf 3883 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3884 else
f8fc2ecf 3885 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3886 RExC_rxi->data->count = count + n;
3887 Copy(s, RExC_rxi->data->what + count, n, U8);
3888 return count;
c277df42
IZ
3889}
3890
76234dfb 3891#ifndef PERL_IN_XSUB_RE
d88dccdf 3892void
864dbfa3 3893Perl_reginitcolors(pTHX)
d88dccdf 3894{
97aff369 3895 dVAR;
1df70142 3896 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3897 if (s) {
1df70142
AL
3898 char *t = savepv(s);
3899 int i = 0;
3900 PL_colors[0] = t;
d88dccdf 3901 while (++i < 6) {
1df70142
AL
3902 t = strchr(t, '\t');
3903 if (t) {
3904 *t = '\0';
3905 PL_colors[i] = ++t;
d88dccdf
IZ
3906 }
3907 else
1df70142 3908 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3909 }
3910 } else {
1df70142 3911 int i = 0;
b81d288d 3912 while (i < 6)
06b5626a 3913 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3914 }
3915 PL_colorset = 1;
3916}
76234dfb 3917#endif
8615cb43 3918
07be1b83 3919
786e8c11
YO
3920#ifdef TRIE_STUDY_OPT
3921#define CHECK_RESTUDY_GOTO \
3922 if ( \
3923 (data.flags & SCF_TRIE_RESTUDY) \
3924 && ! restudied++ \
3925 ) goto reStudy
3926#else
3927#define CHECK_RESTUDY_GOTO
3928#endif
f9f4320a 3929
a687059c 3930/*
e50aee73 3931 - pregcomp - compile a regular expression into internal code
a687059c
LW
3932 *
3933 * We can't allocate space until we know how big the compiled form will be,
3934 * but we can't compile it (and thus know how big it is) until we've got a
3935 * place to put the code. So we cheat: we compile it twice, once with code
3936 * generation turned off and size counting turned on, and once "for real".
3937 * This also means that we don't allocate space until we are sure that the
3938 * thing really will compile successfully, and we never have to move the
3939 * code and thus invalidate pointers into it. (Note that it has to be in
3940 * one piece because free() must be able to free it all.) [NB: not true in perl]
3941 *
3942 * Beware that the optimization-preparation code in here knows about some
3943 * of the structure of the compiled regexp. [I'll say.]
3944 */
b9b4dddf
YO
3945
3946
3947
f9f4320a 3948#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3949#define RE_ENGINE_PTR &PL_core_reg_engine
3950#else
f9f4320a
YO
3951extern const struct regexp_engine my_reg_engine;
3952#define RE_ENGINE_PTR &my_reg_engine
3953#endif
b9b4dddf
YO
3954/* these make a few things look better, to avoid indentation */
3955#define BEGIN_BLOCK {
f9f4320a
YO
3956#define END_BLOCK }
3957
a687059c 3958regexp *
864dbfa3 3959Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3960{
97aff369 3961 dVAR;
f9f4320a
YO
3962 GET_RE_DEBUG_FLAGS_DECL;
3963 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3964#ifndef PERL_IN_XSUB_RE
3965 BEGIN_BLOCK
f9f4320a
YO
3966 /* Dispatch a request to compile a regexp to correct
3967 regexp engine. */
3968 HV * const table = GvHV(PL_hintgv);
3969 if (table) {
3970 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3971 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3972 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3973 DEBUG_COMPILE_r({
8d8756e7 3974 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3975 SvIV(*ptr));
3976 });
f2f78491 3977 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3978 }
b9b4dddf
YO
3979 }
3980 END_BLOCK
3981#endif
3982 BEGIN_BLOCK
a0d0e21e 3983 register regexp *r;
f8fc2ecf 3984 register regexp_internal *ri;
c277df42 3985 regnode *scan;
c277df42 3986 regnode *first;
a0d0e21e 3987 I32 flags;
a0d0e21e
LW
3988 I32 minlen = 0;
3989 I32 sawplus = 0;
3990 I32 sawopen = 0;
2c2d71f5 3991 scan_data_t data;
830247a4 3992 RExC_state_t RExC_state;
be8e71aa 3993 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3994#ifdef TRIE_STUDY_OPT
3995 int restudied= 0;
3996 RExC_state_t copyRExC_state;
3997#endif
a0d0e21e 3998 if (exp == NULL)
c277df42 3999 FAIL("NULL regexp argument");
a0d0e21e 4000
a5961de5 4001 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4002
5cfc7842 4003 RExC_precomp = exp;
a3621e74 4004 DEBUG_COMPILE_r({
ab3bbdeb
YO
4005 SV *dsv= sv_newmortal();
4006 RE_PV_QUOTED_DECL(s, RExC_utf8,
4007 dsv, RExC_precomp, (xend - exp), 60);
4008 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4009 PL_colors[4],PL_colors[5],s);
a5961de5 4010 });
e2509266 4011 RExC_flags = pm->op_pmflags;
830247a4 4012 RExC_sawback = 0;
bbce6d69 4013
830247a4
IZ
4014 RExC_seen = 0;
4015 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4016 RExC_seen_evals = 0;
4017 RExC_extralen = 0;
c277df42 4018
bbce6d69 4019 /* First pass: determine size, legality. */
830247a4 4020 RExC_parse = exp;
fac92740 4021 RExC_start = exp;
830247a4
IZ
4022 RExC_end = xend;
4023 RExC_naughty = 0;
4024 RExC_npar = 1;
c74340f9 4025 RExC_cpar = 1;
e2e6a0f1 4026 RExC_nestroot = 0;
830247a4
IZ
4027 RExC_size = 0L;
4028 RExC_emit = &PL_regdummy;
4029 RExC_whilem_seen = 0;
fc8cd66c 4030 RExC_charnames = NULL;
40d049e4
YO
4031 RExC_open_parens = NULL;
4032 RExC_close_parens = NULL;
4033 RExC_opend = NULL;
81714fb9 4034 RExC_paren_names = NULL;
40d049e4
YO
4035 RExC_recurse = NULL;
4036 RExC_recurse_count = 0;
81714fb9 4037
85ddcde9
JH
4038#if 0 /* REGC() is (currently) a NOP at the first pass.
4039 * Clever compilers notice this and complain. --jhi */
830247a4 4040 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4041#endif
3dab1dad
YO
4042 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4043 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4044 RExC_precomp = NULL;
a0d0e21e
LW
4045 return(NULL);
4046 }
07be1b83 4047 DEBUG_PARSE_r({
81714fb9
YO
4048 PerlIO_printf(Perl_debug_log,
4049 "Required size %"IVdf" nodes\n"
4050 "Starting second pass (creation)\n",
4051 (IV)RExC_size);
07be1b83
YO
4052 RExC_lastnum=0;
4053 RExC_lastparse=NULL;
4054 });
c277df42
IZ
4055 /* Small enough for pointer-storage convention?
4056 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4057 if (RExC_size >= 0x10000L && RExC_extralen)
4058 RExC_size += RExC_extralen;
c277df42 4059 else
830247a4
IZ
4060 RExC_extralen = 0;
4061 if (RExC_whilem_seen > 15)
4062 RExC_whilem_seen = 15;
a0d0e21e 4063
e2e6a0f1
YO
4064#ifdef DEBUGGING
4065 /* Make room for a sentinel value at the end of the program */
4066 RExC_size++;
4067#endif
4068
f9f4320a
YO
4069 /* Allocate space and zero-initialize. Note, the two step process
4070 of zeroing when in debug mode, thus anything assigned has to
4071 happen after that */
f8fc2ecf
YO
4072 Newxz(r, 1, regexp);
4073 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4074 char, regexp_internal);
4075 if ( r == NULL || ri == NULL )
b45f050a 4076 FAIL("Regexp out of space");
0f79a09d
GS
4077#ifdef DEBUGGING
4078 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4079 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4080#else
f8fc2ecf
YO
4081 /* bulk initialize base fields with 0. */
4082 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4083#endif
58e23c8d
YO
4084
4085 /* non-zero initialization begins here */
f8fc2ecf 4086 RXi_SET( r, ri );
f9f4320a 4087 r->engine= RE_ENGINE_PTR;
c277df42 4088 r->refcnt = 1;
bbce6d69 4089 r->prelen = xend - exp;
5cfc7842 4090 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4091 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4092 r->intflags = 0;
830247a4 4093 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4094
6bda09f9 4095 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4096 Newxz(RExC_open_parens, RExC_npar,regnode *);
4097 SAVEFREEPV(RExC_open_parens);
4098 Newxz(RExC_close_parens,RExC_npar,regnode *);
4099 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4100 }
4101
4102 /* Useful during FAIL. */
f8fc2ecf
YO
4103 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4104 if (ri->offsets) {
4105 ri->offsets[0] = RExC_size;
fac92740 4106 }
a3621e74 4107 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4108 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4109 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4110 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4111
830247a4 4112 RExC_rx = r;
f8fc2ecf 4113 RExC_rxi = ri;
bbce6d69 4114
4115 /* Second pass: emit code. */
e2509266 4116 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4117 RExC_parse = exp;
4118 RExC_end = xend;
4119 RExC_naughty = 0;
4120 RExC_npar = 1;
c74340f9 4121 RExC_cpar = 1;
f8fc2ecf
YO
4122 RExC_emit_start = ri->program;
4123 RExC_emit = ri->program;
e2e6a0f1
YO
4124#ifdef DEBUGGING
4125 /* put a sentinal on the end of the program so we can check for
4126 overwrites */
f8fc2ecf 4127 ri->program[RExC_size].type = 255;
e2e6a0f1 4128#endif
2cd61cdb 4129 /* Store the count of eval-groups for security checks: */
786e8c11 4130 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4131 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4132 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4133 return(NULL);
6bda09f9 4134
07be1b83
YO
4135 /* XXXX To minimize changes to RE engine we always allocate
4136 3-units-long substrs field. */
4137 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4138 if (RExC_recurse_count) {
4139 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4140 SAVEFREEPV(RExC_recurse);
4141 }
a0d0e21e 4142
07be1b83 4143reStudy:
1de06328 4144 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4145 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4146
07be1b83
YO
4147#ifdef TRIE_STUDY_OPT
4148 if ( restudied ) {
5d458dd8 4149 U32 seen=RExC_seen;
07be1b83 4150 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4151
4152 RExC_state = copyRExC_state;
4153 if (seen & REG_TOP_LEVEL_BRANCHES)
4154 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4155 else
4156 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4157 if (data.last_found) {
07be1b83 4158 SvREFCNT_dec(data.longest_fixed);
07be1b83 4159 SvREFCNT_dec(data.longest_float);
07be1b83 4160 SvREFCNT_dec(data.last_found);
1de06328 4161 }
40d049e4 4162 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4163 } else {
40d049e4 4164 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4165 copyRExC_state = RExC_state;
07be1b83 4166 }
40d049e4
YO
4167#else
4168 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4169#endif
fc8cd66c 4170
a0d0e21e 4171 /* Dig out information for optimizations. */
bbe252da 4172 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4173 pm->op_pmflags = RExC_flags;
a0ed51b3 4174 if (UTF)
bbe252da 4175 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4176 ri->regstclass = NULL;
830247a4 4177 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4178 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4179 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4180
1de06328
YO
4181 /* testing for BRANCH here tells us whether there is "must appear"
4182 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4183 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4184 I32 fake;
c5254dd6 4185 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4186 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4187 int stclass_flag;
07be1b83 4188 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4189
4190 first = scan;
c277df42 4191 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4192 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4193 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4194 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4195 /* for now we can't handle lookbehind IFMATCH*/
4196 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4197 (OP(first) == PLUS) ||
4198 (OP(first) == MINMOD) ||
653099ff 4199 /* An {n,m} with n>0 */
07be1b83
YO
4200 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4201 {
786e8c11 4202
a0d0e21e
LW
4203 if (OP(first) == PLUS)
4204 sawplus = 1;
4205 else
3dab1dad 4206 first += regarglen[OP(first)];
07be1b83
YO
4207 if (OP(first) == IFMATCH) {
4208 first = NEXTOPER(first);
4209 first += EXTRA_STEP_2ARGS;
7c167cea 4210 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4211 first = NEXTOPER(first);
a687059c
LW
4212 }
4213
a0d0e21e
LW
4214 /* Starting-point info. */
4215 again:
786e8c11 4216 DEBUG_PEEP("first:",first,0);
07be1b83 4217 /* Ignore EXACT as we deal with it later. */
3dab1dad 4218 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4219 if (OP(first) == EXACT)
6f207bd3 4220 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4221 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4222 ri->regstclass = first;
b3c9acc1 4223 }
07be1b83 4224#ifdef TRIE_STCLASS
786e8c11 4225 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4226 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4227 {
786e8c11 4228 regnode *trie_op;
07be1b83 4229 /* this can happen only on restudy */
786e8c11 4230 if ( OP(first) == TRIE ) {
446bd890
NC
4231 struct regnode_1 *trieop =
4232 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4233 StructCopy(first,trieop,struct regnode_1);
4234 trie_op=(regnode *)trieop;
4235 } else {
446bd890
NC
4236 struct regnode_charclass *trieop =
4237 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4238 StructCopy(first,trieop,struct regnode_charclass);
4239 trie_op=(regnode *)trieop;
4240 }
1de06328 4241 OP(trie_op)+=2;
786e8c11 4242 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4243 ri->regstclass = trie_op;
07be1b83
YO
4244 }
4245#endif
bfed75c6 4246 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4247 ri->regstclass = first;
3dab1dad
YO
4248 else if (PL_regkind[OP(first)] == BOUND ||
4249 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4250 ri->regstclass = first;
3dab1dad 4251 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4252 r->extflags |= (OP(first) == MBOL
4253 ? RXf_ANCH_MBOL
cad2e5aa 4254 : (OP(first) == SBOL
bbe252da
YO
4255 ? RXf_ANCH_SBOL
4256 : RXf_ANCH_BOL));
a0d0e21e 4257 first = NEXTOPER(first);
774d564b 4258 goto again;
4259 }
4260 else if (OP(first) == GPOS) {
bbe252da 4261 r->extflags |= RXf_ANCH_GPOS;
774d564b 4262 first = NEXTOPER(first);
4263 goto again;
a0d0e21e 4264 }
cf2a2b69
YO
4265 else if ((!sawopen || !RExC_sawback) &&
4266 (OP(first) == STAR &&
3dab1dad 4267 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4268 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4269 {
4270 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4271 const int type =
4272 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4273 ? RXf_ANCH_MBOL
4274 : RXf_ANCH_SBOL;
4275 r->extflags |= type;
4276 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4277 first = NEXTOPER(first);
774d564b 4278 goto again;
a0d0e21e 4279 }
b81d288d 4280 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4281 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4282 /* x+ must match at the 1st pos of run of x's */
bbe252da 4283 r->intflags |= PREGf_SKIP;
a0d0e21e 4284
c277df42 4285 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4286#ifdef TRIE_STUDY_OPT
81714fb9 4287 DEBUG_PARSE_r(
be8e71aa
YO
4288 if (!restudied)
4289 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4290 (IV)(first - scan + 1))
4291 );
4292#else
81714fb9 4293 DEBUG_PARSE_r(
be8e71aa
YO
4294 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4295 (IV)(first - scan + 1))
4296 );
4297#endif
4298
4299
a0d0e21e
LW
4300 /*
4301 * If there's something expensive in the r.e., find the
4302 * longest literal string that must appear and make it the
4303 * regmust. Resolve ties in favor of later strings, since
4304 * the regstart check works with the beginning of the r.e.
4305 * and avoiding duplication strengthens checking. Not a
4306 * strong reason, but sufficient in the absence of others.
4307 * [Now we resolve ties in favor of the earlier string if
c277df42 4308 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4309 * earlier string may buy us something the later one won't.]
4310 */
de8c5301 4311
396482e1
GA
4312 data.longest_fixed = newSVpvs("");
4313 data.longest_float = newSVpvs("");
4314 data.last_found = newSVpvs("");
c277df42
IZ
4315 data.longest = &(data.longest_fixed);
4316 first = scan;
f8fc2ecf 4317 if (!ri->regstclass) {
830247a4 4318 cl_init(pRExC_state, &ch_class);
653099ff
GS
4319 data.start_class = &ch_class;
4320 stclass_flag = SCF_DO_STCLASS_AND;
4321 } else /* XXXX Check for BOUND? */
4322 stclass_flag = 0;
cb434fcc 4323 data.last_closep = &last_close;
de8c5301 4324
1de06328 4325 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4326 &data, -1, NULL, NULL,
4327 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4328
07be1b83 4329
786e8c11
YO
4330 CHECK_RESTUDY_GOTO;
4331
4332
830247a4 4333 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4334 && data.last_start_min == 0 && data.last_end > 0
830247a4 4335 && !RExC_seen_zerolen
bbe252da
YO
4336 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4337 r->extflags |= RXf_CHECK_ALL;
1de06328 4338 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4339 SvREFCNT_dec(data.last_found);
4340
1de06328
YO
4341 /* Note that code very similar to this but for anchored string
4342 follows immediately below, changes may need to be made to both.
4343 Be careful.
4344 */
a0ed51b3 4345 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4346 if (longest_float_length
c277df42
IZ
4347 || (data.flags & SF_FL_BEFORE_EOL
4348 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4349 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4350 {
1182767e 4351 I32 t,ml;
cf93c79d 4352
1de06328 4353 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4354 && data.offset_fixed == data.offset_float_min
4355 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4356 goto remove_float; /* As in (a)+. */
4357
1de06328
YO
4358 /* copy the information about the longest float from the reg_scan_data
4359 over to the program. */
33b8afdf
JH
4360 if (SvUTF8(data.longest_float)) {
4361 r->float_utf8 = data.longest_float;
c445ea15 4362 r->float_substr = NULL;
33b8afdf
JH
4363 } else {
4364 r->float_substr = data.longest_float;
c445ea15 4365 r->float_utf8 = NULL;
33b8afdf 4366 }
1de06328
YO
4367 /* float_end_shift is how many chars that must be matched that
4368 follow this item. We calculate it ahead of time as once the
4369 lookbehind offset is added in we lose the ability to correctly
4370 calculate it.*/
4371 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4372 : (I32)longest_float_length;
1de06328
YO
4373 r->float_end_shift = ml - data.offset_float_min
4374 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4375 + data.lookbehind_float;
4376 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4377 r->float_max_offset = data.offset_float_max;
1182767e 4378 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4379 r->float_max_offset -= data.lookbehind_float;
4380
cf93c79d
IZ
4381 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4382 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4383 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4384 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4385 }
4386 else {
aca2d497 4387 remove_float:
c445ea15 4388 r->float_substr = r->float_utf8 = NULL;
c277df42 4389 SvREFCNT_dec(data.longest_float);
c5254dd6 4390 longest_float_length = 0;
a0d0e21e 4391 }
c277df42 4392
1de06328
YO
4393 /* Note that code very similar to this but for floating string
4394 is immediately above, changes may need to be made to both.
4395 Be careful.
4396 */
a0ed51b3 4397 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4398 if (longest_fixed_length
c277df42
IZ
4399 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4400 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4401 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4402 {
1182767e 4403 I32 t,ml;
cf93c79d 4404
1de06328
YO
4405 /* copy the information about the longest fixed
4406 from the reg_scan_data over to the program. */
33b8afdf
JH
4407 if (SvUTF8(data.longest_fixed)) {
4408 r->anchored_utf8 = data.longest_fixed;
c445ea15 4409 r->anchored_substr = NULL;
33b8afdf
JH
4410 } else {
4411 r->anchored_substr = data.longest_fixed;
c445ea15 4412 r->anchored_utf8 = NULL;
33b8afdf 4413 }
1de06328
YO
4414 /* fixed_end_shift is how many chars that must be matched that
4415 follow this item. We calculate it ahead of time as once the
4416 lookbehind offset is added in we lose the ability to correctly
4417 calculate it.*/
4418 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4419 : (I32)longest_fixed_length;
1de06328
YO
4420 r->anchored_end_shift = ml - data.offset_fixed
4421 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4422 + data.lookbehind_fixed;
4423 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4424
cf93c79d
IZ
4425 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4426 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4427 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4428 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4429 }
4430 else {
c445ea15 4431 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4432 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4433 longest_fixed_length = 0;
a0d0e21e 4434 }
f8fc2ecf
YO
4435 if (ri->regstclass
4436 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4437 ri->regstclass = NULL;
33b8afdf
JH
4438 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4439 && stclass_flag
653099ff 4440 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4441 && !cl_is_anything(data.start_class))
4442 {
2eccd3b2 4443 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4444
f8fc2ecf 4445 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4446 struct regnode_charclass_class);
4447 StructCopy(data.start_class,
f8fc2ecf 4448 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4449 struct regnode_charclass_class);
f8fc2ecf 4450 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4451 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4452 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4453 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4454 PerlIO_printf(Perl_debug_log,
a0288114 4455 "synthetic stclass \"%s\".\n",
3f7c398e 4456 SvPVX_const(sv));});
653099ff 4457 }
c277df42
IZ
4458
4459 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4460 if (longest_fixed_length > longest_float_length) {
1de06328 4461 r->check_end_shift = r->anchored_end_shift;
c277df42 4462 r->check_substr = r->anchored_substr;
33b8afdf 4463 r->check_utf8 = r->anchored_utf8;
c277df42 4464 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4465 if (r->extflags & RXf_ANCH_SINGLE)
4466 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4467 }
4468 else {
1de06328 4469 r->check_end_shift = r->float_end_shift;
c277df42 4470 r->check_substr = r->float_substr;
33b8afdf 4471 r->check_utf8 = r->float_utf8;
1de06328
YO
4472 r->check_offset_min = r->float_min_offset;
4473 r->check_offset_max = r->float_max_offset;
a0d0e21e 4474 }
30382c73
IZ
4475 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4476 This should be changed ASAP! */
bbe252da
YO
4477 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4478 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4479 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4480 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4481 }
1de06328
YO
4482 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4483 if ( (STRLEN)minlen < longest_float_length )
4484 minlen= longest_float_length;
4485 if ( (STRLEN)minlen < longest_fixed_length )
4486 minlen= longest_fixed_length;
4487 */
a0ed51b3
LW
4488 }
4489 else {
c277df42
IZ
4490 /* Several toplevels. Best we can is to set minlen. */
4491 I32 fake;
653099ff 4492 struct regnode_charclass_class ch_class;
cb434fcc 4493 I32 last_close = 0;
c277df42 4494
5d458dd8 4495 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4496
f8fc2ecf 4497 scan = ri->program + 1;
830247a4 4498 cl_init(pRExC_state, &ch_class);
653099ff 4499 data.start_class = &ch_class;
cb434fcc 4500 data.last_closep = &last_close;
07be1b83 4501
de8c5301 4502
1de06328 4503 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4504 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4505
786e8c11 4506 CHECK_RESTUDY_GOTO;
07be1b83 4507
33b8afdf 4508 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4509 = r->float_substr = r->float_utf8 = NULL;
653099ff 4510 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4511 && !cl_is_anything(data.start_class))
4512 {
2eccd3b2 4513 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4514
f8fc2ecf 4515 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4516 struct regnode_charclass_class);
4517 StructCopy(data.start_class,
f8fc2ecf 4518 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4519 struct regnode_charclass_class);
f8fc2ecf 4520 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4521 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4522 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4523 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4524 PerlIO_printf(Perl_debug_log,
a0288114 4525 "synthetic stclass \"%s\".\n",
3f7c398e 4526 SvPVX_const(sv));});
653099ff 4527 }
a0d0e21e
LW
4528 }
4529
1de06328
YO
4530 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4531 the "real" pattern. */
cf9788e3
RGS
4532 DEBUG_OPTIMISE_r({
4533 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4534 (IV)minlen, (IV)r->minlen);
cf9788e3 4535 });
de8c5301 4536 r->minlenret = minlen;
1de06328
YO
4537 if (r->minlen < minlen)
4538 r->minlen = minlen;
4539
b81d288d 4540 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4541 r->extflags |= RXf_GPOS_SEEN;
830247a4 4542 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4543 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4544 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4545 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4546 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4547 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4548 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4549 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4550 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4551 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4552 if (RExC_paren_names)
4553 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4554 else
4555 r->paren_names = NULL;
4556
40d049e4
YO
4557 if (RExC_recurse_count) {
4558 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4559 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4560 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4561 }
4562 }
a02a5408
JC
4563 Newxz(r->startp, RExC_npar, I32);
4564 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4565 /* assume we don't need to swap parens around before we match */
4566
be8e71aa
YO
4567 DEBUG_DUMP_r({
4568 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4569 regdump(r);
4570 });
f8fc2ecf
YO
4571 DEBUG_OFFSETS_r(if (ri->offsets) {
4572 const U32 len = ri->offsets[0];
8e9a8a48
YO
4573 U32 i;
4574 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4575 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4576 for (i = 1; i <= len; i++) {
f8fc2ecf 4577 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4578 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4579 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4580 }
4581 PerlIO_printf(Perl_debug_log, "\n");
4582 });
a0d0e21e 4583 return(r);
f9f4320a 4584 END_BLOCK
a687059c
LW
4585}
4586
f9f4320a
YO
4587#undef CORE_ONLY_BLOCK
4588#undef END_BLOCK
4589#undef RE_ENGINE_PTR
3dab1dad 4590
9af228c6 4591#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4592SV*
4593Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4594{
4595 I32 parno = 0; /* no match */
4596 if (PL_curpm) {
4597 const REGEXP * const rx = PM_GETRE(PL_curpm);
4598 if (rx && rx->paren_names) {
4599 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4600 if (he_str) {
4601 IV i;
4602 SV* sv_dat=HeVAL(he_str);
4603 I32 *nums=(I32*)SvPVX(sv_dat);
4604 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4605 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4606 rx->endp[nums[i]] != -1)
4607 {
4608 parno = nums[i];
4609 break;
4610 }
4611 }
4612 }
4613 }
4614 }
4615 if ( !parno ) {
4616 return 0;
4617 } else {
4618 GV *gv_paren;
4619 SV *sv= sv_newmortal();
4620 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4621 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4622 return GvSVn(gv_paren);
4623 }
4624}
9af228c6 4625#endif
0a4db386 4626
894be9b7 4627/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4628 * If flags is REG_RSN_RETURN_NULL returns null.
4629 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4630 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4631 * to the parsed name as looked up in the RExC_paren_names hash.
4632 * If there is an error throws a vFAIL().. type exception.
894be9b7 4633 */
0a4db386
YO
4634
4635#define REG_RSN_RETURN_NULL 0
4636#define REG_RSN_RETURN_NAME 1
4637#define REG_RSN_RETURN_DATA 2
4638
894be9b7
YO
4639STATIC SV*
4640S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4641 char *name_start = RExC_parse;
0a4db386 4642 if ( UTF ) {
894be9b7 4643 STRLEN numlen;
0a4db386
YO
4644 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4645 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4646 {
4647 RExC_parse += numlen;
4648 }
4649 } else {
4650 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4651 RExC_parse++;
4652 }
0a4db386
YO
4653 if ( flags ) {
4654 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4655 (int)(RExC_parse - name_start)));
894be9b7 4656 if (UTF)
0a4db386
YO
4657 SvUTF8_on(sv_name);
4658 if ( flags == REG_RSN_RETURN_NAME)
4659 return sv_name;
4660 else if (flags==REG_RSN_RETURN_DATA) {
4661 HE *he_str = NULL;
4662 SV *sv_dat = NULL;
4663 if ( ! sv_name ) /* should not happen*/
4664 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4665 if (RExC_paren_names)
4666 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4667 if ( he_str )
4668 sv_dat = HeVAL(he_str);
4669 if ( ! sv_dat )
4670 vFAIL("Reference to nonexistent named group");
4671 return sv_dat;
4672 }
4673 else {
4674 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4675 }
4676 /* NOT REACHED */
894be9b7 4677 }
0a4db386 4678 return NULL;
894be9b7
YO
4679}
4680
3dab1dad
YO
4681#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4682 int rem=(int)(RExC_end - RExC_parse); \
4683 int cut; \
4684 int num; \
4685 int iscut=0; \
4686 if (rem>10) { \
4687 rem=10; \
4688 iscut=1; \
4689 } \
4690 cut=10-rem; \
4691 if (RExC_lastparse!=RExC_parse) \
4692 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4693 rem, RExC_parse, \
4694 cut + 4, \
4695 iscut ? "..." : "<" \
4696 ); \
4697 else \
4698 PerlIO_printf(Perl_debug_log,"%16s",""); \
4699 \
4700 if (SIZE_ONLY) \
4701 num=RExC_size; \
4702 else \
4703 num=REG_NODE_NUM(RExC_emit); \
4704 if (RExC_lastnum!=num) \
0a4db386 4705 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4706 else \
0a4db386 4707 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4708 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4709 (int)((depth*2)), "", \
3dab1dad
YO
4710 (funcname) \
4711 ); \
4712 RExC_lastnum=num; \
4713 RExC_lastparse=RExC_parse; \
4714})
4715
07be1b83
YO
4716
4717
3dab1dad
YO
4718#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4719 DEBUG_PARSE_MSG((funcname)); \
4720 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4721})
6bda09f9
YO
4722#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4723 DEBUG_PARSE_MSG((funcname)); \
4724 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4725})
a687059c
LW
4726/*
4727 - reg - regular expression, i.e. main body or parenthesized thing
4728 *
4729 * Caller must absorb opening parenthesis.
4730 *
4731 * Combining parenthesis handling with the base level of regular expression
4732 * is a trifle forced, but the need to tie the tails of the branches to what
4733 * follows makes it hard to avoid.
4734 */
07be1b83
YO
4735#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4736#ifdef DEBUGGING
4737#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4738#else
4739#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4740#endif
3dab1dad 4741
e2e6a0f1
YO
4742/* this idea is borrowed from STR_WITH_LEN in handy.h */
4743#define CHECK_WORD(s,v,l) \
4744 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4745
76e3520e 4746STATIC regnode *
3dab1dad 4747S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4748 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4749{
27da23d5 4750 dVAR;
c277df42
IZ
4751 register regnode *ret; /* Will be the head of the group. */
4752 register regnode *br;
4753 register regnode *lastbr;
cbbf8932 4754 register regnode *ender = NULL;
a0d0e21e 4755 register I32 parno = 0;
cbbf8932
AL
4756 I32 flags;
4757 const I32 oregflags = RExC_flags;
6136c704
AL
4758 bool have_branch = 0;
4759 bool is_open = 0;
9d1d55b5
JP
4760
4761 /* for (?g), (?gc), and (?o) warnings; warning
4762 about (?c) will warn about (?g) -- japhy */
4763
6136c704
AL
4764#define WASTED_O 0x01
4765#define WASTED_G 0x02
4766#define WASTED_C 0x04
4767#define WASTED_GC (0x02|0x04)
cbbf8932 4768 I32 wastedflags = 0x00;
9d1d55b5 4769
fac92740 4770 char * parse_start = RExC_parse; /* MJD */
a28509cc 4771 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4772
3dab1dad
YO
4773 GET_RE_DEBUG_FLAGS_DECL;
4774 DEBUG_PARSE("reg ");
4775
4776
821b33a5 4777 *flagp = 0; /* Tentatively. */
a0d0e21e 4778
9d1d55b5 4779
a0d0e21e
LW
4780 /* Make an OPEN node, if parenthesized. */
4781 if (paren) {
e2e6a0f1
YO
4782 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4783 char *start_verb = RExC_parse;
4784 STRLEN verb_len = 0;
4785 char *start_arg = NULL;
4786 unsigned char op = 0;
4787 int argok = 1;
4788 int internal_argval = 0; /* internal_argval is only useful if !argok */
4789 while ( *RExC_parse && *RExC_parse != ')' ) {
4790 if ( *RExC_parse == ':' ) {
4791 start_arg = RExC_parse + 1;
4792 break;
4793 }
4794 RExC_parse++;
4795 }
4796 ++start_verb;
4797 verb_len = RExC_parse - start_verb;
4798 if ( start_arg ) {
4799 RExC_parse++;
4800 while ( *RExC_parse && *RExC_parse != ')' )
4801 RExC_parse++;
4802 if ( *RExC_parse != ')' )
4803 vFAIL("Unterminated verb pattern argument");
4804 if ( RExC_parse == start_arg )
4805 start_arg = NULL;
4806 } else {
4807 if ( *RExC_parse != ')' )
4808 vFAIL("Unterminated verb pattern");
4809 }
5d458dd8 4810
e2e6a0f1
YO
4811 switch ( *start_verb ) {
4812 case 'A': /* (*ACCEPT) */
4813 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4814 op = ACCEPT;
4815 internal_argval = RExC_nestroot;
4816 }
4817 break;
4818 case 'C': /* (*COMMIT) */
4819 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4820 op = COMMIT;
e2e6a0f1
YO
4821 break;
4822 case 'F': /* (*FAIL) */
4823 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4824 op = OPFAIL;
4825 argok = 0;
4826 }
4827 break;
5d458dd8
YO
4828 case ':': /* (*:NAME) */
4829 case 'M': /* (*MARK:NAME) */
4830 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4831 op = MARKPOINT;
5d458dd8
YO
4832 argok = -1;
4833 }
4834 break;
4835 case 'P': /* (*PRUNE) */
4836 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4837 op = PRUNE;
e2e6a0f1 4838 break;
5d458dd8
YO
4839 case 'S': /* (*SKIP) */
4840 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4841 op = SKIP;
4842 break;
4843 case 'T': /* (*THEN) */
4844 /* [19:06] <TimToady> :: is then */
4845 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4846 op = CUTGROUP;
4847 RExC_seen |= REG_SEEN_CUTGROUP;
4848 }
e2e6a0f1
YO
4849 break;
4850 }
4851 if ( ! op ) {
4852 RExC_parse++;
4853 vFAIL3("Unknown verb pattern '%.*s'",
4854 verb_len, start_verb);
4855 }
4856 if ( argok ) {
4857 if ( start_arg && internal_argval ) {
4858 vFAIL3("Verb pattern '%.*s' may not have an argument",
4859 verb_len, start_verb);
4860 } else if ( argok < 0 && !start_arg ) {
4861 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4862 verb_len, start_verb);
4863 } else {
4864 ret = reganode(pRExC_state, op, internal_argval);
4865 if ( ! internal_argval && ! SIZE_ONLY ) {
4866 if (start_arg) {
4867 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4868 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4869 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4870 ret->flags = 0;
4871 } else {
4872 ret->flags = 1;
4873 }
4874 }
4875 }
4876 if (!internal_argval)
4877 RExC_seen |= REG_SEEN_VERBARG;
4878 } else if ( start_arg ) {
4879 vFAIL3("Verb pattern '%.*s' may not have an argument",
4880 verb_len, start_verb);
4881 } else {
4882 ret = reg_node(pRExC_state, op);
4883 }
4884 nextchar(pRExC_state);
4885 return ret;
4886 } else
fac92740 4887 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4888 U32 posflags = 0, negflags = 0;
4889 U32 *flagsp = &posflags;
6136c704 4890 bool is_logical = 0;
a28509cc 4891 const char * const seqstart = RExC_parse;
ca9dfc88 4892
830247a4
IZ
4893 RExC_parse++;
4894 paren = *RExC_parse++;
c277df42 4895 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4896 switch (paren) {
894be9b7 4897
fac92740 4898 case '<': /* (?<...) */
b81d288d 4899 if (*RExC_parse == '!')
c277df42 4900 paren = ',';
0a4db386
YO
4901 else if (*RExC_parse != '=')
4902 { /* (?<...>) */
81714fb9 4903 char *name_start;
894be9b7 4904 SV *svname;
81714fb9
YO
4905 paren= '>';
4906 case '\'': /* (?'...') */
4907 name_start= RExC_parse;
0a4db386
YO
4908 svname = reg_scan_name(pRExC_state,
4909 SIZE_ONLY ? /* reverse test from the others */
4910 REG_RSN_RETURN_NAME :
4911 REG_RSN_RETURN_NULL);
81714fb9
YO
4912 if (RExC_parse == name_start)
4913 goto unknown;
4914 if (*RExC_parse != paren)
4915 vFAIL2("Sequence (?%c... not terminated",
4916 paren=='>' ? '<' : paren);
4917 if (SIZE_ONLY) {
e62cc96a
YO
4918 HE *he_str;
4919 SV *sv_dat = NULL;
894be9b7
YO
4920 if (!svname) /* shouldnt happen */
4921 Perl_croak(aTHX_
4922 "panic: reg_scan_name returned NULL");
81714fb9
YO
4923 if (!RExC_paren_names) {
4924 RExC_paren_names= newHV();
4925 sv_2mortal((SV*)RExC_paren_names);
4926 }
4927 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4928 if ( he_str )
81714fb9 4929 sv_dat = HeVAL(he_str);
e62cc96a 4930 if ( ! sv_dat ) {
81714fb9 4931 /* croak baby croak */
e62cc96a
YO
4932 Perl_croak(aTHX_
4933 "panic: paren_name hash element allocation failed");
4934 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4935 IV count=SvIV(sv_dat);
4936 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4937 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4938 pv[count]=RExC_npar;
4939 SvIVX(sv_dat)++;
4940 } else {
4941 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4942 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4943 SvIOK_on(sv_dat);
4944 SvIVX(sv_dat)= 1;
e62cc96a
YO
4945 }
4946
81714fb9
YO
4947 /*sv_dump(sv_dat);*/
4948 }
4949 nextchar(pRExC_state);
4950 paren = 1;
4951 goto capturing_parens;
4952 }
4953 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4954 RExC_parse++;
fac92740
MJD
4955 case '=': /* (?=...) */
4956 case '!': /* (?!...) */
830247a4 4957 RExC_seen_zerolen++;
e2e6a0f1
YO
4958 if (*RExC_parse == ')') {
4959 ret=reg_node(pRExC_state, OPFAIL);
4960 nextchar(pRExC_state);
4961 return ret;
4962 }
fac92740
MJD
4963 case ':': /* (?:...) */
4964 case '>': /* (?>...) */
a0d0e21e 4965 break;
fac92740
MJD
4966 case '$': /* (?$...) */
4967 case '@': /* (?@...) */
8615cb43 4968 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4969 break;
fac92740 4970 case '#': /* (?#...) */
830247a4
IZ
4971 while (*RExC_parse && *RExC_parse != ')')
4972 RExC_parse++;
4973 if (*RExC_parse != ')')
c277df42 4974 FAIL("Sequence (?#... not terminated");
830247a4 4975 nextchar(pRExC_state);
a0d0e21e
LW
4976 *flagp = TRYAGAIN;
4977 return NULL;
894be9b7
YO
4978 case '0' : /* (?0) */
4979 case 'R' : /* (?R) */
4980 if (*RExC_parse != ')')
6bda09f9 4981 FAIL("Sequence (?R) not terminated");
1a147d38 4982 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4983 nextchar(pRExC_state);
4984 return ret;
4985 /*notreached*/
894be9b7
YO
4986 { /* named and numeric backreferences */
4987 I32 num;
4988 char * parse_start;
4989 case '&': /* (?&NAME) */
4990 parse_start = RExC_parse - 1;
4991 {
0a4db386
YO
4992 SV *sv_dat = reg_scan_name(pRExC_state,
4993 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4994 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4995 }
4996 goto gen_recurse_regop;
4997 /* NOT REACHED */
542fa716
YO
4998 case '+':
4999 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5000 RExC_parse++;
5001 vFAIL("Illegal pattern");
5002 }
5003 goto parse_recursion;
5004 /* NOT REACHED*/
5005 case '-': /* (?-1) */
5006 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5007 RExC_parse--; /* rewind to let it be handled later */
5008 goto parse_flags;
5009 }
5010 /*FALLTHROUGH */
6bda09f9
YO
5011 case '1': case '2': case '3': case '4': /* (?1) */
5012 case '5': case '6': case '7': case '8': case '9':
5013 RExC_parse--;
542fa716 5014 parse_recursion:
894be9b7
YO
5015 num = atoi(RExC_parse);
5016 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5017 if (*RExC_parse == '-')
5018 RExC_parse++;
6bda09f9
YO
5019 while (isDIGIT(*RExC_parse))
5020 RExC_parse++;
5021 if (*RExC_parse!=')')
5022 vFAIL("Expecting close bracket");
894be9b7
YO
5023
5024 gen_recurse_regop:
542fa716
YO
5025 if ( paren == '-' ) {
5026 /*
5027 Diagram of capture buffer numbering.
5028 Top line is the normal capture buffer numbers
5029 Botton line is the negative indexing as from
5030 the X (the (?-2))
5031
5032 + 1 2 3 4 5 X 6 7
5033 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5034 - 5 4 3 2 1 X x x
5035
5036 */
5037 num = RExC_npar + num;
5038 if (num < 1) {
5039 RExC_parse++;
5040 vFAIL("Reference to nonexistent group");
5041 }
5042 } else if ( paren == '+' ) {
5043 num = RExC_npar + num - 1;
5044 }
5045
1a147d38 5046 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5047 if (!SIZE_ONLY) {
5048 if (num > (I32)RExC_rx->nparens) {
5049 RExC_parse++;
5050 vFAIL("Reference to nonexistent group");
5051 }
40d049e4 5052 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5053 RExC_emit++;
226de585 5054 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5055 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5056 } else {
6bda09f9 5057 RExC_size++;
6bda09f9 5058 }
0a4db386 5059 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5060 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5061 Set_Node_Offset(ret, parse_start); /* MJD */
5062
6bda09f9
YO
5063 nextchar(pRExC_state);
5064 return ret;
894be9b7
YO
5065 } /* named and numeric backreferences */
5066 /* NOT REACHED */
5067
fac92740 5068 case 'p': /* (?p...) */
9014280d 5069 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5070 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5071 /* FALL THROUGH*/
fac92740 5072 case '?': /* (??...) */
6136c704 5073 is_logical = 1;
438a3801
YST
5074 if (*RExC_parse != '{')
5075 goto unknown;
830247a4 5076 paren = *RExC_parse++;
0f5d15d6 5077 /* FALL THROUGH */
fac92740 5078 case '{': /* (?{...}) */
c277df42 5079 {
2eccd3b2
NC
5080 I32 count = 1;
5081 U32 n = 0;
c277df42 5082 char c;
830247a4 5083 char *s = RExC_parse;
c277df42 5084
830247a4
IZ
5085 RExC_seen_zerolen++;
5086 RExC_seen |= REG_SEEN_EVAL;
5087 while (count && (c = *RExC_parse)) {
6136c704
AL
5088 if (c == '\\') {
5089 if (RExC_parse[1])
5090 RExC_parse++;
5091 }
b81d288d 5092 else if (c == '{')
c277df42 5093 count++;
b81d288d 5094 else if (c == '}')
c277df42 5095 count--;
830247a4 5096 RExC_parse++;
c277df42 5097 }
6136c704 5098 if (*RExC_parse != ')') {
b81d288d 5099 RExC_parse = s;
b45f050a
JF
5100 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5101 }
c277df42 5102 if (!SIZE_ONLY) {
f3548bdc 5103 PAD *pad;
6136c704
AL
5104 OP_4tree *sop, *rop;
5105 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5106
569233ed
SB
5107 ENTER;
5108 Perl_save_re_context(aTHX);
f3548bdc 5109 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5110 sop->op_private |= OPpREFCOUNTED;
5111 /* re_dup will OpREFCNT_inc */
5112 OpREFCNT_set(sop, 1);
569233ed 5113 LEAVE;
c277df42 5114
830247a4 5115 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5116 RExC_rxi->data->data[n] = (void*)rop;
5117 RExC_rxi->data->data[n+1] = (void*)sop;
5118 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5119 SvREFCNT_dec(sv);
a0ed51b3 5120 }
e24b16f9 5121 else { /* First pass */
830247a4 5122 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5123 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5124 /* No compiled RE interpolated, has runtime
5125 components ===> unsafe. */
5126 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5127 if (PL_tainting && PL_tainted)
cc6b7395 5128 FAIL("Eval-group in insecure regular expression");
54df2634 5129#if PERL_VERSION > 8
923e4eb5 5130 if (IN_PERL_COMPILETIME)
b5c19bd7 5131 PL_cv_has_eval = 1;
54df2634 5132#endif
c277df42 5133 }
b5c19bd7 5134
830247a4 5135 nextchar(pRExC_state);
6136c704 5136 if (is_logical) {
830247a4 5137 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5138 if (!SIZE_ONLY)
5139 ret->flags = 2;
3dab1dad 5140 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5141 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5142 return ret;
5143 }
ccb2c380
MP
5144 ret = reganode(pRExC_state, EVAL, n);
5145 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5146 Set_Node_Offset(ret, parse_start);
5147 return ret;
c277df42 5148 }
fac92740 5149 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5150 {
0a4db386 5151 int is_define= 0;
fac92740 5152 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5153 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5154 || RExC_parse[1] == '<'
830247a4 5155 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5156 I32 flag;
5157
830247a4 5158 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5159 if (!SIZE_ONLY)
5160 ret->flags = 1;
3dab1dad 5161 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5162 goto insert_if;
b81d288d 5163 }
a0ed51b3 5164 }
0a4db386
YO
5165 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5166 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5167 {
5168 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5169 char *name_start= RExC_parse++;
2eccd3b2 5170 U32 num = 0;
0a4db386
YO
5171 SV *sv_dat=reg_scan_name(pRExC_state,
5172 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5173 if (RExC_parse == name_start || *RExC_parse != ch)
5174 vFAIL2("Sequence (?(%c... not terminated",
5175 (ch == '>' ? '<' : ch));
5176 RExC_parse++;
5177 if (!SIZE_ONLY) {
5178 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5179 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5180 SvREFCNT_inc(sv_dat);
5181 }
5182 ret = reganode(pRExC_state,NGROUPP,num);
5183 goto insert_if_check_paren;
5184 }
5185 else if (RExC_parse[0] == 'D' &&
5186 RExC_parse[1] == 'E' &&
5187 RExC_parse[2] == 'F' &&
5188 RExC_parse[3] == 'I' &&
5189 RExC_parse[4] == 'N' &&
5190 RExC_parse[5] == 'E')
5191 {
5192 ret = reganode(pRExC_state,DEFINEP,0);
5193 RExC_parse +=6 ;
5194 is_define = 1;
5195 goto insert_if_check_paren;
5196 }
5197 else if (RExC_parse[0] == 'R') {
5198 RExC_parse++;
5199 parno = 0;
5200 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5201 parno = atoi(RExC_parse++);
5202 while (isDIGIT(*RExC_parse))
5203 RExC_parse++;
5204 } else if (RExC_parse[0] == '&') {
5205 SV *sv_dat;
5206 RExC_parse++;
5207 sv_dat = reg_scan_name(pRExC_state,
5208 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5209 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5210 }
1a147d38 5211 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5212 goto insert_if_check_paren;
5213 }
830247a4 5214 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5215 /* (?(1)...) */
6136c704 5216 char c;
830247a4 5217 parno = atoi(RExC_parse++);
c277df42 5218
830247a4
IZ
5219 while (isDIGIT(*RExC_parse))
5220 RExC_parse++;
fac92740 5221 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5222
0a4db386 5223 insert_if_check_paren:
830247a4 5224 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5225 vFAIL("Switch condition not recognized");
c277df42 5226 insert_if:
3dab1dad
YO
5227 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5228 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5229 if (br == NULL)
830247a4 5230 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5231 else
3dab1dad 5232 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5233 c = *nextchar(pRExC_state);
d1b80229
IZ
5234 if (flags&HASWIDTH)
5235 *flagp |= HASWIDTH;
c277df42 5236 if (c == '|') {
0a4db386
YO
5237 if (is_define)
5238 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5239 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5240 regbranch(pRExC_state, &flags, 1,depth+1);
5241 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5242 if (flags&HASWIDTH)
5243 *flagp |= HASWIDTH;
830247a4 5244 c = *nextchar(pRExC_state);
a0ed51b3
LW
5245 }
5246 else
c277df42
IZ
5247 lastbr = NULL;
5248 if (c != ')')
8615cb43 5249 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5250 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5251 REGTAIL(pRExC_state, br, ender);
c277df42 5252 if (lastbr) {
3dab1dad
YO
5253 REGTAIL(pRExC_state, lastbr, ender);
5254 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5255 }
5256 else
3dab1dad 5257 REGTAIL(pRExC_state, ret, ender);
c277df42 5258 return ret;
a0ed51b3
LW
5259 }
5260 else {
830247a4 5261 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5262 }
5263 }
1b1626e4 5264 case 0:
830247a4 5265 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5266 vFAIL("Sequence (? incomplete");
1b1626e4 5267 break;
a0d0e21e 5268 default:
830247a4 5269 --RExC_parse;
fac92740 5270 parse_flags: /* (?i) */
830247a4 5271 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5272 /* (?g), (?gc) and (?o) are useless here
5273 and must be globally applied -- japhy */
5274
5275 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5276 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5277 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5278 if (! (wastedflags & wflagbit) ) {
5279 wastedflags |= wflagbit;
5280 vWARN5(
5281 RExC_parse + 1,
5282 "Useless (%s%c) - %suse /%c modifier",
5283 flagsp == &negflags ? "?-" : "?",
5284 *RExC_parse,
5285 flagsp == &negflags ? "don't " : "",
5286 *RExC_parse
5287 );
5288 }
5289 }
5290 }
5291 else if (*RExC_parse == 'c') {
5292 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5293 if (! (wastedflags & WASTED_C) ) {
5294 wastedflags |= WASTED_GC;
9d1d55b5
JP
5295 vWARN3(
5296 RExC_parse + 1,
5297 "Useless (%sc) - %suse /gc modifier",
5298 flagsp == &negflags ? "?-" : "?",
5299 flagsp == &negflags ? "don't " : ""
5300 );
5301 }
5302 }
5303 }
5304 else { pmflag(flagsp, *RExC_parse); }
5305
830247a4 5306 ++RExC_parse;
ca9dfc88 5307 }
830247a4 5308 if (*RExC_parse == '-') {
ca9dfc88 5309 flagsp = &negflags;
9d1d55b5 5310 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5311 ++RExC_parse;
ca9dfc88 5312 goto parse_flags;
48c036b1 5313 }
e2509266
JH
5314 RExC_flags |= posflags;
5315 RExC_flags &= ~negflags;
830247a4
IZ
5316 if (*RExC_parse == ':') {
5317 RExC_parse++;
ca9dfc88
IZ
5318 paren = ':';
5319 break;
5320 }
c277df42 5321 unknown:
830247a4
IZ
5322 if (*RExC_parse != ')') {
5323 RExC_parse++;
5324 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5325 }
830247a4 5326 nextchar(pRExC_state);
a0d0e21e
LW
5327 *flagp = TRYAGAIN;
5328 return NULL;
5329 }
5330 }
fac92740 5331 else { /* (...) */
81714fb9 5332 capturing_parens:
830247a4
IZ
5333 parno = RExC_npar;
5334 RExC_npar++;
e2e6a0f1 5335
830247a4 5336 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5337 if (!SIZE_ONLY ){
5338 if (!RExC_nestroot)
5339 RExC_nestroot = parno;
5340 if (RExC_seen & REG_SEEN_RECURSE) {
5341 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5342 "Setting open paren #%"IVdf" to %d\n",
5343 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5344 RExC_open_parens[parno-1]= ret;
5345 }
6bda09f9 5346 }
fac92740
MJD
5347 Set_Node_Length(ret, 1); /* MJD */
5348 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5349 is_open = 1;
a0d0e21e 5350 }
a0ed51b3 5351 }
fac92740 5352 else /* ! paren */
a0d0e21e
LW
5353 ret = NULL;
5354
5355 /* Pick up the branches, linking them together. */
fac92740 5356 parse_start = RExC_parse; /* MJD */
3dab1dad 5357 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5358 /* branch_len = (paren != 0); */
2af232bd 5359
a0d0e21e
LW
5360 if (br == NULL)
5361 return(NULL);
830247a4
IZ
5362 if (*RExC_parse == '|') {
5363 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5364 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5365 }
fac92740 5366 else { /* MJD */
6bda09f9 5367 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5368 Set_Node_Length(br, paren != 0);
5369 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5370 }
c277df42
IZ
5371 have_branch = 1;
5372 if (SIZE_ONLY)
830247a4 5373 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5374 }
5375 else if (paren == ':') {
c277df42
IZ
5376 *flagp |= flags&SIMPLE;
5377 }
6136c704 5378 if (is_open) { /* Starts with OPEN. */
3dab1dad 5379 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5380 }
5381 else if (paren != '?') /* Not Conditional */
a0d0e21e 5382 ret = br;
32a0ca98 5383 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5384 lastbr = br;
830247a4
IZ
5385 while (*RExC_parse == '|') {
5386 if (!SIZE_ONLY && RExC_extralen) {
5387 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5388 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5389 }
5390 if (SIZE_ONLY)
830247a4
IZ
5391 RExC_extralen += 2; /* Account for LONGJMP. */
5392 nextchar(pRExC_state);
3dab1dad 5393 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5394
a687059c 5395 if (br == NULL)
a0d0e21e 5396 return(NULL);
3dab1dad 5397 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5398 lastbr = br;
821b33a5
IZ
5399 if (flags&HASWIDTH)
5400 *flagp |= HASWIDTH;
a687059c 5401 *flagp |= flags&SPSTART;
a0d0e21e
LW
5402 }
5403
c277df42
IZ
5404 if (have_branch || paren != ':') {
5405 /* Make a closing node, and hook it on the end. */
5406 switch (paren) {
5407 case ':':
830247a4 5408 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5409 break;
5410 case 1:
c74340f9 5411 RExC_cpar++;
830247a4 5412 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5413 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5414 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5415 "Setting close paren #%"IVdf" to %d\n",
5416 (IV)parno, REG_NODE_NUM(ender)));
5417 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5418 if (RExC_nestroot == parno)
5419 RExC_nestroot = 0;
40d049e4 5420 }
fac92740
MJD
5421 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5422 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5423 break;
5424 case '<':
c277df42
IZ
5425 case ',':
5426 case '=':
5427 case '!':
c277df42 5428 *flagp &= ~HASWIDTH;
821b33a5
IZ
5429 /* FALL THROUGH */
5430 case '>':
830247a4 5431 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5432 break;
5433 case 0:
830247a4 5434 ender = reg_node(pRExC_state, END);
40d049e4
YO
5435 if (!SIZE_ONLY) {
5436 assert(!RExC_opend); /* there can only be one! */
5437 RExC_opend = ender;
5438 }
c277df42
IZ
5439 break;
5440 }
eaf3ca90 5441 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5442
9674d46a 5443 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5444 if (depth==1)
5445 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5446
c277df42 5447 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5448 for (br = ret; br; br = regnext(br)) {
5449 const U8 op = PL_regkind[OP(br)];
5450 if (op == BRANCH) {
07be1b83 5451 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5452 }
5453 else if (op == BRANCHJ) {
07be1b83 5454 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5455 }
c277df42
IZ
5456 }
5457 }
a0d0e21e 5458 }
c277df42
IZ
5459
5460 {
e1ec3a88
AL
5461 const char *p;
5462 static const char parens[] = "=!<,>";
c277df42
IZ
5463
5464 if (paren && (p = strchr(parens, paren))) {
eb160463 5465 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5466 int flag = (p - parens) > 1;
5467
5468 if (paren == '>')
5469 node = SUSPEND, flag = 0;
6bda09f9 5470 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5471 Set_Node_Cur_Length(ret);
5472 Set_Node_Offset(ret, parse_start + 1);
c277df42 5473 ret->flags = flag;
07be1b83 5474 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5475 }
a0d0e21e
LW
5476 }
5477
5478 /* Check for proper termination. */
ce3e6498 5479 if (paren) {
e2509266 5480 RExC_flags = oregflags;
830247a4
IZ
5481 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5482 RExC_parse = oregcomp_parse;
380a0633 5483 vFAIL("Unmatched (");
ce3e6498 5484 }
a0ed51b3 5485 }
830247a4
IZ
5486 else if (!paren && RExC_parse < RExC_end) {
5487 if (*RExC_parse == ')') {
5488 RExC_parse++;
380a0633 5489 vFAIL("Unmatched )");
a0ed51b3
LW
5490 }
5491 else
b45f050a 5492 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5493 /* NOTREACHED */
5494 }
a687059c 5495
a0d0e21e 5496 return(ret);
a687059c
LW
5497}
5498
5499/*
5500 - regbranch - one alternative of an | operator
5501 *
5502 * Implements the concatenation operator.
5503 */
76e3520e 5504STATIC regnode *
3dab1dad 5505S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5506{
97aff369 5507 dVAR;
c277df42
IZ
5508 register regnode *ret;
5509 register regnode *chain = NULL;
5510 register regnode *latest;
5511 I32 flags = 0, c = 0;
3dab1dad
YO
5512 GET_RE_DEBUG_FLAGS_DECL;
5513 DEBUG_PARSE("brnc");
b81d288d 5514 if (first)
c277df42
IZ
5515 ret = NULL;
5516 else {
b81d288d 5517 if (!SIZE_ONLY && RExC_extralen)
830247a4 5518 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5519 else {
830247a4 5520 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5521 Set_Node_Length(ret, 1);
5522 }
c277df42
IZ
5523 }
5524
b81d288d 5525 if (!first && SIZE_ONLY)
830247a4 5526 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5527
c277df42 5528 *flagp = WORST; /* Tentatively. */
a0d0e21e 5529
830247a4
IZ
5530 RExC_parse--;
5531 nextchar(pRExC_state);
5532 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5533 flags &= ~TRYAGAIN;
3dab1dad 5534 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5535 if (latest == NULL) {
5536 if (flags & TRYAGAIN)
5537 continue;
5538 return(NULL);
a0ed51b3
LW
5539 }
5540 else if (ret == NULL)
c277df42 5541 ret = latest;
a0d0e21e 5542 *flagp |= flags&HASWIDTH;
c277df42 5543 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5544 *flagp |= flags&SPSTART;
5545 else {
830247a4 5546 RExC_naughty++;
3dab1dad 5547 REGTAIL(pRExC_state, chain, latest);
a687059c 5548 }
a0d0e21e 5549 chain = latest;
c277df42
IZ
5550 c++;
5551 }
5552 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5553 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5554 if (ret == NULL)
5555 ret = chain;
5556 }
5557 if (c == 1) {
5558 *flagp |= flags&SIMPLE;
a0d0e21e 5559 }
a687059c 5560
d4c19fe8 5561 return ret;
a687059c
LW
5562}
5563
5564/*
5565 - regpiece - something followed by possible [*+?]
5566 *
5567 * Note that the branching code sequences used for ? and the general cases
5568 * of * and + are somewhat optimized: they use the same NOTHING node as
5569 * both the endmarker for their branch list and the body of the last branch.
5570 * It might seem that this node could be dispensed with entirely, but the
5571 * endmarker role is not redundant.
5572 */
76e3520e 5573STATIC regnode *
3dab1dad 5574S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5575{
97aff369 5576 dVAR;
c277df42 5577 register regnode *ret;
a0d0e21e
LW
5578 register char op;
5579 register char *next;
5580 I32 flags;
1df70142 5581 const char * const origparse = RExC_parse;
a0d0e21e 5582 I32 min;
c277df42 5583 I32 max = REG_INFTY;
fac92740 5584 char *parse_start;
10edeb5d 5585 const char *maxpos = NULL;
3dab1dad
YO
5586 GET_RE_DEBUG_FLAGS_DECL;
5587 DEBUG_PARSE("piec");
a0d0e21e 5588
3dab1dad 5589 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5590 if (ret == NULL) {
5591 if (flags & TRYAGAIN)
5592 *flagp |= TRYAGAIN;
5593 return(NULL);
5594 }
5595
830247a4 5596 op = *RExC_parse;
a0d0e21e 5597
830247a4 5598 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5599 maxpos = NULL;
fac92740 5600 parse_start = RExC_parse; /* MJD */
830247a4 5601 next = RExC_parse + 1;
a0d0e21e
LW
5602 while (isDIGIT(*next) || *next == ',') {
5603 if (*next == ',') {
5604 if (maxpos)
5605 break;
5606 else
5607 maxpos = next;
a687059c 5608 }
a0d0e21e
LW
5609 next++;
5610 }
5611 if (*next == '}') { /* got one */
5612 if (!maxpos)
5613 maxpos = next;
830247a4
IZ
5614 RExC_parse++;
5615 min = atoi(RExC_parse);
a0d0e21e
LW
5616 if (*maxpos == ',')
5617 maxpos++;
5618 else
830247a4 5619 maxpos = RExC_parse;
a0d0e21e
LW
5620 max = atoi(maxpos);
5621 if (!max && *maxpos != '0')
c277df42
IZ
5622 max = REG_INFTY; /* meaning "infinity" */
5623 else if (max >= REG_INFTY)
8615cb43 5624 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5625 RExC_parse = next;
5626 nextchar(pRExC_state);
a0d0e21e
LW
5627
5628 do_curly:
5629 if ((flags&SIMPLE)) {
830247a4 5630 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5631 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5632 Set_Node_Offset(ret, parse_start+1); /* MJD */
5633 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5634 }
5635 else {
3dab1dad 5636 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5637
5638 w->flags = 0;
3dab1dad 5639 REGTAIL(pRExC_state, ret, w);
830247a4 5640 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5641 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5642 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5643 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5644 }
6bda09f9 5645 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5646 /* MJD hk */
5647 Set_Node_Offset(ret, parse_start+1);
2af232bd 5648 Set_Node_Length(ret,
fac92740 5649 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5650
830247a4 5651 if (!SIZE_ONLY && RExC_extralen)
c277df42 5652 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5653 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5654 if (SIZE_ONLY)
830247a4
IZ
5655 RExC_whilem_seen++, RExC_extralen += 3;
5656 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5657 }
c277df42 5658 ret->flags = 0;
a0d0e21e
LW
5659
5660 if (min > 0)
821b33a5
IZ
5661 *flagp = WORST;
5662 if (max > 0)
5663 *flagp |= HASWIDTH;
a0d0e21e 5664 if (max && max < min)
8615cb43 5665 vFAIL("Can't do {n,m} with n > m");
c277df42 5666 if (!SIZE_ONLY) {
eb160463
GS
5667 ARG1_SET(ret, (U16)min);
5668 ARG2_SET(ret, (U16)max);
a687059c 5669 }
a687059c 5670
a0d0e21e 5671 goto nest_check;
a687059c 5672 }
a0d0e21e 5673 }
a687059c 5674
a0d0e21e
LW
5675 if (!ISMULT1(op)) {
5676 *flagp = flags;
a687059c 5677 return(ret);
a0d0e21e 5678 }
bb20fd44 5679
c277df42 5680#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5681
5682 /* if this is reinstated, don't forget to put this back into perldiag:
5683
5684 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5685
5686 (F) The part of the regexp subject to either the * or + quantifier
5687 could match an empty string. The {#} shows in the regular
5688 expression about where the problem was discovered.
5689
5690 */
5691
bb20fd44 5692 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5693 vFAIL("Regexp *+ operand could be empty");
b81d288d 5694#endif
bb20fd44 5695
fac92740 5696 parse_start = RExC_parse;
830247a4 5697 nextchar(pRExC_state);
a0d0e21e 5698
821b33a5 5699 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5700
5701 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5702 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5703 ret->flags = 0;
830247a4 5704 RExC_naughty += 4;
a0d0e21e
LW
5705 }
5706 else if (op == '*') {
5707 min = 0;
5708 goto do_curly;
a0ed51b3
LW
5709 }
5710 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5711 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5712 ret->flags = 0;
830247a4 5713 RExC_naughty += 3;
a0d0e21e
LW
5714 }
5715 else if (op == '+') {
5716 min = 1;
5717 goto do_curly;
a0ed51b3
LW
5718 }
5719 else if (op == '?') {
a0d0e21e
LW
5720 min = 0; max = 1;
5721 goto do_curly;
5722 }
5723 nest_check:
041457d9 5724 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5725 vWARN3(RExC_parse,
b45f050a 5726 "%.*s matches null string many times",
afd78fd5 5727 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5728 origparse);
a0d0e21e
LW
5729 }
5730
b9b4dddf 5731 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5732 nextchar(pRExC_state);
6bda09f9 5733 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5734 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5735 }
b9b4dddf
YO
5736#ifndef REG_ALLOW_MINMOD_SUSPEND
5737 else
5738#endif
5739 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5740 regnode *ender;
5741 nextchar(pRExC_state);
5742 ender = reg_node(pRExC_state, SUCCEED);
5743 REGTAIL(pRExC_state, ret, ender);
5744 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5745 ret->flags = 0;
5746 ender = reg_node(pRExC_state, TAIL);
5747 REGTAIL(pRExC_state, ret, ender);
5748 /*ret= ender;*/
5749 }
5750
5751 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5752 RExC_parse++;
b45f050a
JF
5753 vFAIL("Nested quantifiers");
5754 }
a0d0e21e
LW
5755
5756 return(ret);
a687059c
LW
5757}
5758
fc8cd66c
YO
5759
5760/* reg_namedseq(pRExC_state,UVp)
5761
5762 This is expected to be called by a parser routine that has
5763 recognized'\N' and needs to handle the rest. RExC_parse is
5764 expected to point at the first char following the N at the time
5765 of the call.
5766
5767 If valuep is non-null then it is assumed that we are parsing inside
5768 of a charclass definition and the first codepoint in the resolved
5769 string is returned via *valuep and the routine will return NULL.
5770 In this mode if a multichar string is returned from the charnames
5771 handler a warning will be issued, and only the first char in the
5772 sequence will be examined. If the string returned is zero length
5773 then the value of *valuep is undefined and NON-NULL will
5774 be returned to indicate failure. (This will NOT be a valid pointer
5775 to a regnode.)
5776
5777 If value is null then it is assumed that we are parsing normal text
5778 and inserts a new EXACT node into the program containing the resolved
5779 string and returns a pointer to the new node. If the string is
5780 zerolength a NOTHING node is emitted.
5781
5782 On success RExC_parse is set to the char following the endbrace.
5783 Parsing failures will generate a fatal errorvia vFAIL(...)
5784
5785 NOTE: We cache all results from the charnames handler locally in
5786 the RExC_charnames hash (created on first use) to prevent a charnames
5787 handler from playing silly-buggers and returning a short string and
5788 then a long string for a given pattern. Since the regexp program
5789 size is calculated during an initial parse this would result
5790 in a buffer overrun so we cache to prevent the charname result from
5791 changing during the course of the parse.
5792
5793 */
5794STATIC regnode *
5795S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5796{
5797 char * name; /* start of the content of the name */
5798 char * endbrace; /* endbrace following the name */
5799 SV *sv_str = NULL;
5800 SV *sv_name = NULL;
5801 STRLEN len; /* this has various purposes throughout the code */
5802 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5803 regnode *ret = NULL;
5804
5805 if (*RExC_parse != '{') {
5806 vFAIL("Missing braces on \\N{}");
5807 }
5808 name = RExC_parse+1;
5809 endbrace = strchr(RExC_parse, '}');
5810 if ( ! endbrace ) {
5811 RExC_parse++;
5812 vFAIL("Missing right brace on \\N{}");
5813 }
5814 RExC_parse = endbrace + 1;
5815
5816
5817 /* RExC_parse points at the beginning brace,
5818 endbrace points at the last */
5819 if ( name[0]=='U' && name[1]=='+' ) {
5820 /* its a "unicode hex" notation {U+89AB} */
5821 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5822 | PERL_SCAN_DISALLOW_PREFIX
5823 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5824 UV cp;
196f1508 5825 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5826 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5827 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5828 cp = 0xFFFD;
5829 }
5830 if (cp > 0xff)
5831 RExC_utf8 = 1;
5832 if ( valuep ) {
5833 *valuep = cp;
5834 return NULL;
5835 }
5836 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5837 } else {
5838 /* fetch the charnames handler for this scope */
5839 HV * const table = GvHV(PL_hintgv);
5840 SV **cvp= table ?
5841 hv_fetchs(table, "charnames", FALSE) :
5842 NULL;
5843 SV *cv= cvp ? *cvp : NULL;
5844 HE *he_str;
5845 int count;
5846 /* create an SV with the name as argument */
5847 sv_name = newSVpvn(name, endbrace - name);
5848
5849 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5850 vFAIL2("Constant(\\N{%s}) unknown: "
5851 "(possibly a missing \"use charnames ...\")",
5852 SvPVX(sv_name));
5853 }
5854 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5855 vFAIL2("Constant(\\N{%s}): "
5856 "$^H{charnames} is not defined",SvPVX(sv_name));
5857 }
5858
5859
5860
5861 if (!RExC_charnames) {
5862 /* make sure our cache is allocated */
5863 RExC_charnames = newHV();
6bda09f9 5864 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5865 }
5866 /* see if we have looked this one up before */
5867 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5868 if ( he_str ) {
5869 sv_str = HeVAL(he_str);
5870 cached = 1;
5871 } else {
5872 dSP ;
5873
5874 ENTER ;
5875 SAVETMPS ;
5876 PUSHMARK(SP) ;
5877
5878 XPUSHs(sv_name);
5879
5880 PUTBACK ;
5881
5882 count= call_sv(cv, G_SCALAR);
5883
5884 if (count == 1) { /* XXXX is this right? dmq */
5885 sv_str = POPs;
5886 SvREFCNT_inc_simple_void(sv_str);
5887 }
5888
5889 SPAGAIN ;
5890 PUTBACK ;
5891 FREETMPS ;
5892 LEAVE ;
5893
5894 if ( !sv_str || !SvOK(sv_str) ) {
5895 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5896 "did not return a defined value",SvPVX(sv_name));
5897 }
5898 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5899 cached = 1;
5900 }
5901 }
5902 if (valuep) {
5903 char *p = SvPV(sv_str, len);
5904 if (len) {
5905 STRLEN numlen = 1;
5906 if ( SvUTF8(sv_str) ) {
196f1508 5907 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5908 if (*valuep > 0x7F)
5909 RExC_utf8 = 1;
5910 /* XXXX
5911 We have to turn on utf8 for high bit chars otherwise
5912 we get failures with
5913
5914 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5915 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5916
5917 This is different from what \x{} would do with the same
5918 codepoint, where the condition is > 0xFF.
5919 - dmq
5920 */
5921
5922
5923 } else {
5924 *valuep = (UV)*p;
5925 /* warn if we havent used the whole string? */
5926 }
5927 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5928 vWARN2(RExC_parse,
5929 "Ignoring excess chars from \\N{%s} in character class",
5930 SvPVX(sv_name)
5931 );
5932 }
5933 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5934 vWARN2(RExC_parse,
5935 "Ignoring zero length \\N{%s} in character class",
5936 SvPVX(sv_name)
5937 );
5938 }
5939 if (sv_name)
5940 SvREFCNT_dec(sv_name);
5941 if (!cached)
5942 SvREFCNT_dec(sv_str);
5943 return len ? NULL : (regnode *)&len;
5944 } else if(SvCUR(sv_str)) {
5945
5946 char *s;
5947 char *p, *pend;
5948 STRLEN charlen = 1;
5949 char * parse_start = name-3; /* needed for the offsets */
5950 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5951
5952 ret = reg_node(pRExC_state,
5953 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5954 s= STRING(ret);
5955
5956 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5957 sv_utf8_upgrade(sv_str);
5958 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5959 RExC_utf8= 1;
5960 }
5961
5962 p = SvPV(sv_str, len);
5963 pend = p + len;
5964 /* len is the length written, charlen is the size the char read */
5965 for ( len = 0; p < pend; p += charlen ) {
5966 if (UTF) {
196f1508 5967 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5968 if (FOLD) {
5969 STRLEN foldlen,numlen;
5970 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5971 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5972 /* Emit all the Unicode characters. */
5973
5974 for (foldbuf = tmpbuf;
5975 foldlen;
5976 foldlen -= numlen)
5977 {
5978 uvc = utf8_to_uvchr(foldbuf, &numlen);
5979 if (numlen > 0) {
5980 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5981 s += unilen;
5982 len += unilen;
5983 /* In EBCDIC the numlen
5984 * and unilen can differ. */
5985 foldbuf += numlen;
5986 if (numlen >= foldlen)
5987 break;
5988 }
5989 else
5990 break; /* "Can't happen." */
5991 }
5992 } else {
5993 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5994 if (unilen > 0) {
5995 s += unilen;
5996 len += unilen;
5997 }
5998 }
5999 } else {
6000 len++;
6001 REGC(*p, s++);
6002 }
6003 }
6004 if (SIZE_ONLY) {
6005 RExC_size += STR_SZ(len);
6006 } else {
6007 STR_LEN(ret) = len;
6008 RExC_emit += STR_SZ(len);
6009 }
6010 Set_Node_Cur_Length(ret); /* MJD */
6011 RExC_parse--;
6012 nextchar(pRExC_state);
6013 } else {
6014 ret = reg_node(pRExC_state,NOTHING);
6015 }
6016 if (!cached) {
6017 SvREFCNT_dec(sv_str);
6018 }
6019 if (sv_name) {
6020 SvREFCNT_dec(sv_name);
6021 }
6022 return ret;
6023
6024}
6025
6026
9e08bc66
TS
6027/*
6028 * reg_recode
6029 *
6030 * It returns the code point in utf8 for the value in *encp.
6031 * value: a code value in the source encoding
6032 * encp: a pointer to an Encode object
6033 *
6034 * If the result from Encode is not a single character,
6035 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6036 */
6037STATIC UV
6038S_reg_recode(pTHX_ const char value, SV **encp)
6039{
6040 STRLEN numlen = 1;
6041 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6042 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6043 : SvPVX(sv);
6044 const STRLEN newlen = SvCUR(sv);
6045 UV uv = UNICODE_REPLACEMENT;
6046
6047 if (newlen)
6048 uv = SvUTF8(sv)
6049 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6050 : *(U8*)s;
6051
6052 if (!newlen || numlen != newlen) {
6053 uv = UNICODE_REPLACEMENT;
6054 if (encp)
6055 *encp = NULL;
6056 }
6057 return uv;
6058}
6059
fc8cd66c 6060
a687059c
LW
6061/*
6062 - regatom - the lowest level
6063 *
6064 * Optimization: gobbles an entire sequence of ordinary characters so that
6065 * it can turn them into a single node, which is smaller to store and
6066 * faster to run. Backslashed characters are exceptions, each becoming a
6067 * separate node; the code is simpler that way and it's not worth fixing.
6068 *
7f6f358c
YO
6069 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6070 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6071 */
76e3520e 6072STATIC regnode *
3dab1dad 6073S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6074{
97aff369 6075 dVAR;
cbbf8932 6076 register regnode *ret = NULL;
a0d0e21e 6077 I32 flags;
45948336 6078 char *parse_start = RExC_parse;
3dab1dad
YO
6079 GET_RE_DEBUG_FLAGS_DECL;
6080 DEBUG_PARSE("atom");
a0d0e21e
LW
6081 *flagp = WORST; /* Tentatively. */
6082
6083tryagain:
830247a4 6084 switch (*RExC_parse) {
a0d0e21e 6085 case '^':
830247a4
IZ
6086 RExC_seen_zerolen++;
6087 nextchar(pRExC_state);
bbe252da 6088 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6089 ret = reg_node(pRExC_state, MBOL);
bbe252da 6090 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6091 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6092 else
830247a4 6093 ret = reg_node(pRExC_state, BOL);
fac92740 6094 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6095 break;
6096 case '$':
830247a4 6097 nextchar(pRExC_state);
b81d288d 6098 if (*RExC_parse)
830247a4 6099 RExC_seen_zerolen++;
bbe252da 6100 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6101 ret = reg_node(pRExC_state, MEOL);
bbe252da 6102 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6103 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6104 else
830247a4 6105 ret = reg_node(pRExC_state, EOL);
fac92740 6106 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6107 break;
6108 case '.':
830247a4 6109 nextchar(pRExC_state);
bbe252da 6110 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6111 ret = reg_node(pRExC_state, SANY);
6112 else
6113 ret = reg_node(pRExC_state, REG_ANY);
6114 *flagp |= HASWIDTH|SIMPLE;
830247a4 6115 RExC_naughty++;
fac92740 6116 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6117 break;
6118 case '[':
b45f050a 6119 {
3dab1dad
YO
6120 char * const oregcomp_parse = ++RExC_parse;
6121 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6122 if (*RExC_parse != ']') {
6123 RExC_parse = oregcomp_parse;
b45f050a
JF
6124 vFAIL("Unmatched [");
6125 }
830247a4 6126 nextchar(pRExC_state);
a0d0e21e 6127 *flagp |= HASWIDTH|SIMPLE;
fac92740 6128 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6129 break;
b45f050a 6130 }
a0d0e21e 6131 case '(':
830247a4 6132 nextchar(pRExC_state);
3dab1dad 6133 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6134 if (ret == NULL) {
bf93d4cc 6135 if (flags & TRYAGAIN) {
830247a4 6136 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6137 /* Make parent create an empty node if needed. */
6138 *flagp |= TRYAGAIN;
6139 return(NULL);
6140 }
a0d0e21e 6141 goto tryagain;
bf93d4cc 6142 }
a0d0e21e
LW
6143 return(NULL);
6144 }
c277df42 6145 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6146 break;
6147 case '|':
6148 case ')':
6149 if (flags & TRYAGAIN) {
6150 *flagp |= TRYAGAIN;
6151 return NULL;
6152 }
b45f050a 6153 vFAIL("Internal urp");
a0d0e21e
LW
6154 /* Supposed to be caught earlier. */
6155 break;
85afd4ae 6156 case '{':
830247a4
IZ
6157 if (!regcurly(RExC_parse)) {
6158 RExC_parse++;
85afd4ae
CS
6159 goto defchar;
6160 }
6161 /* FALL THROUGH */
a0d0e21e
LW
6162 case '?':
6163 case '+':
6164 case '*':
830247a4 6165 RExC_parse++;
b45f050a 6166 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6167 break;
6168 case '\\':
830247a4 6169 switch (*++RExC_parse) {
a0d0e21e 6170 case 'A':
830247a4
IZ
6171 RExC_seen_zerolen++;
6172 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6173 *flagp |= SIMPLE;
830247a4 6174 nextchar(pRExC_state);
fac92740 6175 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6176 break;
6177 case 'G':
830247a4
IZ
6178 ret = reg_node(pRExC_state, GPOS);
6179 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6180 *flagp |= SIMPLE;
830247a4 6181 nextchar(pRExC_state);
fac92740 6182 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6183 break;
6184 case 'Z':
830247a4 6185 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6186 *flagp |= SIMPLE;
a1917ab9 6187 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6188 nextchar(pRExC_state);
a0d0e21e 6189 break;
b85d18e9 6190 case 'z':
830247a4 6191 ret = reg_node(pRExC_state, EOS);
b85d18e9 6192 *flagp |= SIMPLE;
830247a4
IZ
6193 RExC_seen_zerolen++; /* Do not optimize RE away */
6194 nextchar(pRExC_state);
fac92740 6195 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6196 break;
4a2d328f 6197 case 'C':
f33976b4
DB
6198 ret = reg_node(pRExC_state, CANY);
6199 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6200 *flagp |= HASWIDTH|SIMPLE;
830247a4 6201 nextchar(pRExC_state);
fac92740 6202 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6203 break;
6204 case 'X':
830247a4 6205 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6206 *flagp |= HASWIDTH;
830247a4 6207 nextchar(pRExC_state);
fac92740 6208 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6209 break;
a0d0e21e 6210 case 'w':
eb160463 6211 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6212 *flagp |= HASWIDTH|SIMPLE;
830247a4 6213 nextchar(pRExC_state);
fac92740 6214 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6215 break;
6216 case 'W':
eb160463 6217 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6218 *flagp |= HASWIDTH|SIMPLE;
830247a4 6219 nextchar(pRExC_state);
fac92740 6220 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6221 break;
6222 case 'b':
830247a4
IZ
6223 RExC_seen_zerolen++;
6224 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6225 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6226 *flagp |= SIMPLE;
830247a4 6227 nextchar(pRExC_state);
fac92740 6228 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6229 break;
6230 case 'B':
830247a4
IZ
6231 RExC_seen_zerolen++;
6232 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6233 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6234 *flagp |= SIMPLE;
830247a4 6235 nextchar(pRExC_state);
fac92740 6236 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6237 break;
6238 case 's':
eb160463 6239 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6240 *flagp |= HASWIDTH|SIMPLE;
830247a4 6241 nextchar(pRExC_state);
fac92740 6242 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6243 break;
6244 case 'S':
eb160463 6245 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6246 *flagp |= HASWIDTH|SIMPLE;
830247a4 6247 nextchar(pRExC_state);
fac92740 6248 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6249 break;
6250 case 'd':
ffc61ed2 6251 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6252 *flagp |= HASWIDTH|SIMPLE;
830247a4 6253 nextchar(pRExC_state);
fac92740 6254 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6255 break;
6256 case 'D':
ffc61ed2 6257 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6258 *flagp |= HASWIDTH|SIMPLE;
830247a4 6259 nextchar(pRExC_state);
fac92740 6260 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6261 break;
a14b48bc
LW
6262 case 'p':
6263 case 'P':
3568d838 6264 {
3dab1dad 6265 char* const oldregxend = RExC_end;
ccb2c380 6266 char* parse_start = RExC_parse - 2;
a14b48bc 6267
830247a4 6268 if (RExC_parse[1] == '{') {
3568d838 6269 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6270 RExC_end = strchr(RExC_parse, '}');
6271 if (!RExC_end) {
3dab1dad 6272 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6273 RExC_parse += 2;
6274 RExC_end = oldregxend;
0da60cf5 6275 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6276 }
830247a4 6277 RExC_end++;
a14b48bc 6278 }
af6f566e 6279 else {
830247a4 6280 RExC_end = RExC_parse + 2;
af6f566e
HS
6281 if (RExC_end > oldregxend)
6282 RExC_end = oldregxend;
6283 }
830247a4 6284 RExC_parse--;
a14b48bc 6285
3dab1dad 6286 ret = regclass(pRExC_state,depth+1);
a14b48bc 6287
830247a4
IZ
6288 RExC_end = oldregxend;
6289 RExC_parse--;
ccb2c380
MP
6290
6291 Set_Node_Offset(ret, parse_start + 2);
6292 Set_Node_Cur_Length(ret);
830247a4 6293 nextchar(pRExC_state);
a14b48bc
LW
6294 *flagp |= HASWIDTH|SIMPLE;
6295 }
6296 break;
fc8cd66c
YO
6297 case 'N':
6298 /* Handle \N{NAME} here and not below because it can be
6299 multicharacter. join_exact() will join them up later on.
6300 Also this makes sure that things like /\N{BLAH}+/ and
6301 \N{BLAH} being multi char Just Happen. dmq*/
6302 ++RExC_parse;
6303 ret= reg_namedseq(pRExC_state, NULL);
6304 break;
0a4db386 6305 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6306 {
6307 char ch= RExC_parse[1];
6308 if (ch != '<' && ch != '\'') {
6309 if (SIZE_ONLY)
6310 vWARN( RExC_parse + 1,
6311 "Possible broken named back reference treated as literal k");
6312 parse_start--;
6313 goto defchar;
6314 } else {
6315 char* name_start = (RExC_parse += 2);
2eccd3b2 6316 U32 num = 0;
0a4db386
YO
6317 SV *sv_dat = reg_scan_name(pRExC_state,
6318 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6319 ch= (ch == '<') ? '>' : '\'';
6320
81714fb9
YO
6321 if (RExC_parse == name_start || *RExC_parse != ch)
6322 vFAIL2("Sequence \\k%c... not terminated",
6323 (ch == '>' ? '<' : ch));
6324
6325 RExC_sawback = 1;
6326 ret = reganode(pRExC_state,
6327 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6328 num);
6329 *flagp |= HASWIDTH;
6330
6331
6332 if (!SIZE_ONLY) {
81714fb9
YO
6333 num = add_data( pRExC_state, 1, "S" );
6334 ARG_SET(ret,num);
f8fc2ecf 6335 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6336 SvREFCNT_inc(sv_dat);
6337 }
6338 /* override incorrect value set in reganode MJD */
6339 Set_Node_Offset(ret, parse_start+1);
6340 Set_Node_Cur_Length(ret); /* MJD */
6341 nextchar(pRExC_state);
6342
6343 }
6344 break;
6345 }
a0d0e21e
LW
6346 case 'n':
6347 case 'r':
6348 case 't':
6349 case 'f':
6350 case 'e':
6351 case 'a':
6352 case 'x':
6353 case 'c':
6354 case '0':
6355 goto defchar;
c74340f9 6356 case 'R':
a0d0e21e
LW
6357 case '1': case '2': case '3': case '4':
6358 case '5': case '6': case '7': case '8': case '9':
6359 {
c74340f9
YO
6360 I32 num;
6361 bool isrel=(*RExC_parse=='R');
6362 if (isrel)
6363 RExC_parse++;
6364 num = atoi(RExC_parse);
6365 if (isrel) {
6366 num = RExC_cpar - num;
6367 if (num < 1)
6368 vFAIL("Reference to nonexistent or unclosed group");
6369 }
830247a4 6370 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6371 goto defchar;
6372 else {
3dab1dad 6373 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6374 while (isDIGIT(*RExC_parse))
6375 RExC_parse++;
b45f050a 6376
c74340f9
YO
6377 if (!SIZE_ONLY) {
6378 if (num > (I32)RExC_rx->nparens)
6379 vFAIL("Reference to nonexistent group");
6380 /* People make this error all the time apparently.
6381 So we cant fail on it, even though we should
6382
6383 else if (num >= RExC_cpar)
6384 vFAIL("Reference to unclosed group will always match");
6385 */
6386 }
830247a4 6387 RExC_sawback = 1;
eb160463
GS
6388 ret = reganode(pRExC_state,
6389 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6390 num);
a0d0e21e 6391 *flagp |= HASWIDTH;
2af232bd 6392
fac92740 6393 /* override incorrect value set in reganode MJD */
2af232bd 6394 Set_Node_Offset(ret, parse_start+1);
fac92740 6395 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6396 RExC_parse--;
6397 nextchar(pRExC_state);
a0d0e21e
LW
6398 }
6399 }
6400 break;
6401 case '\0':
830247a4 6402 if (RExC_parse >= RExC_end)
b45f050a 6403 FAIL("Trailing \\");
a0d0e21e
LW
6404 /* FALL THROUGH */
6405 default:
a0288114 6406 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6407 back into the quick-grab loop below */
45948336 6408 parse_start--;
a0d0e21e
LW
6409 goto defchar;
6410 }
6411 break;
4633a7c4
LW
6412
6413 case '#':
bbe252da 6414 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6415 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6416 RExC_parse++;
830247a4 6417 if (RExC_parse < RExC_end)
4633a7c4
LW
6418 goto tryagain;
6419 }
6420 /* FALL THROUGH */
6421
a0d0e21e 6422 default: {
ba210ebe 6423 register STRLEN len;
58ae7d3f 6424 register UV ender;
a0d0e21e 6425 register char *p;
3dab1dad 6426 char *s;
80aecb99 6427 STRLEN foldlen;
89ebb4a3 6428 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6429
6430 parse_start = RExC_parse - 1;
a0d0e21e 6431
830247a4 6432 RExC_parse++;
a0d0e21e
LW
6433
6434 defchar:
58ae7d3f 6435 ender = 0;
eb160463
GS
6436 ret = reg_node(pRExC_state,
6437 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6438 s = STRING(ret);
830247a4
IZ
6439 for (len = 0, p = RExC_parse - 1;
6440 len < 127 && p < RExC_end;
a0d0e21e
LW
6441 len++)
6442 {
3dab1dad 6443 char * const oldp = p;
5b5a24f7 6444
bbe252da 6445 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6446 p = regwhite(p, RExC_end);
a0d0e21e
LW
6447 switch (*p) {
6448 case '^':
6449 case '$':
6450 case '.':
6451 case '[':
6452 case '(':
6453 case ')':
6454 case '|':
6455 goto loopdone;
6456 case '\\':
6457 switch (*++p) {
6458 case 'A':
1ed8eac0
JF
6459 case 'C':
6460 case 'X':
a0d0e21e
LW
6461 case 'G':
6462 case 'Z':
b85d18e9 6463 case 'z':
a0d0e21e
LW
6464 case 'w':
6465 case 'W':
6466 case 'b':
6467 case 'B':
6468 case 's':
6469 case 'S':
6470 case 'd':
6471 case 'D':
a14b48bc
LW
6472 case 'p':
6473 case 'P':
fc8cd66c 6474 case 'N':
c74340f9 6475 case 'R':
a0d0e21e
LW
6476 --p;
6477 goto loopdone;
6478 case 'n':
6479 ender = '\n';
6480 p++;
a687059c 6481 break;
a0d0e21e
LW
6482 case 'r':
6483 ender = '\r';
6484 p++;
a687059c 6485 break;
a0d0e21e
LW
6486 case 't':
6487 ender = '\t';
6488 p++;
a687059c 6489 break;
a0d0e21e
LW
6490 case 'f':
6491 ender = '\f';
6492 p++;
a687059c 6493 break;
a0d0e21e 6494 case 'e':
c7f1f016 6495 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6496 p++;
a687059c 6497 break;
a0d0e21e 6498 case 'a':
c7f1f016 6499 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6500 p++;
a687059c 6501 break;
a0d0e21e 6502 case 'x':
a0ed51b3 6503 if (*++p == '{') {
1df70142 6504 char* const e = strchr(p, '}');
b81d288d 6505
b45f050a 6506 if (!e) {
830247a4 6507 RExC_parse = p + 1;
b45f050a
JF
6508 vFAIL("Missing right brace on \\x{}");
6509 }
de5f0749 6510 else {
a4c04bdc
NC
6511 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6512 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6513 STRLEN numlen = e - p - 1;
53305cf1 6514 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6515 if (ender > 0xff)
6516 RExC_utf8 = 1;
a0ed51b3
LW
6517 p = e + 1;
6518 }
a0ed51b3
LW
6519 }
6520 else {
a4c04bdc 6521 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6522 STRLEN numlen = 2;
53305cf1 6523 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6524 p += numlen;
6525 }
9e08bc66
TS
6526 if (PL_encoding && ender < 0x100)
6527 goto recode_encoding;
a687059c 6528 break;
a0d0e21e
LW
6529 case 'c':
6530 p++;
bbce6d69 6531 ender = UCHARAT(p++);
6532 ender = toCTRL(ender);
a687059c 6533 break;
a0d0e21e
LW
6534 case '0': case '1': case '2': case '3':case '4':
6535 case '5': case '6': case '7': case '8':case '9':
6536 if (*p == '0' ||
830247a4 6537 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6538 I32 flags = 0;
1df70142 6539 STRLEN numlen = 3;
53305cf1 6540 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6541 p += numlen;
6542 }
6543 else {
6544 --p;
6545 goto loopdone;
a687059c 6546 }
9e08bc66
TS
6547 if (PL_encoding && ender < 0x100)
6548 goto recode_encoding;
6549 break;
6550 recode_encoding:
6551 {
6552 SV* enc = PL_encoding;
6553 ender = reg_recode((const char)(U8)ender, &enc);
6554 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6555 vWARN(p, "Invalid escape in the specified encoding");
6556 RExC_utf8 = 1;
6557 }
a687059c 6558 break;
a0d0e21e 6559 case '\0':
830247a4 6560 if (p >= RExC_end)
b45f050a 6561 FAIL("Trailing \\");
a687059c 6562 /* FALL THROUGH */
a0d0e21e 6563 default:
041457d9 6564 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6565 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6566 goto normal_default;
a0d0e21e
LW
6567 }
6568 break;
a687059c 6569 default:
a0ed51b3 6570 normal_default:
fd400ab9 6571 if (UTF8_IS_START(*p) && UTF) {
1df70142 6572 STRLEN numlen;
5e12f4fb 6573 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6574 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6575 p += numlen;
6576 }
6577 else
6578 ender = *p++;
a0d0e21e 6579 break;
a687059c 6580 }
bbe252da 6581 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6582 p = regwhite(p, RExC_end);
60a8b682
JH
6583 if (UTF && FOLD) {
6584 /* Prime the casefolded buffer. */
ac7e0132 6585 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6586 }
a0d0e21e
LW
6587 if (ISMULT2(p)) { /* Back off on ?+*. */
6588 if (len)
6589 p = oldp;
16ea2a2e 6590 else if (UTF) {
80aecb99 6591 if (FOLD) {
60a8b682 6592 /* Emit all the Unicode characters. */
1df70142 6593 STRLEN numlen;
80aecb99
JH
6594 for (foldbuf = tmpbuf;
6595 foldlen;
6596 foldlen -= numlen) {
6597 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6598 if (numlen > 0) {
71207a34 6599 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6600 s += unilen;
6601 len += unilen;
6602 /* In EBCDIC the numlen
6603 * and unilen can differ. */
9dc45d57 6604 foldbuf += numlen;
47654450
JH
6605 if (numlen >= foldlen)
6606 break;
9dc45d57
JH
6607 }
6608 else
6609 break; /* "Can't happen." */
80aecb99
JH
6610 }
6611 }
6612 else {
71207a34 6613 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6614 if (unilen > 0) {
0ebc6274
JH
6615 s += unilen;
6616 len += unilen;
9dc45d57 6617 }
80aecb99 6618 }
a0ed51b3 6619 }
a0d0e21e
LW
6620 else {
6621 len++;
eb160463 6622 REGC((char)ender, s++);
a0d0e21e
LW
6623 }
6624 break;
a687059c 6625 }
16ea2a2e 6626 if (UTF) {
80aecb99 6627 if (FOLD) {
60a8b682 6628 /* Emit all the Unicode characters. */
1df70142 6629 STRLEN numlen;
80aecb99
JH
6630 for (foldbuf = tmpbuf;
6631 foldlen;
6632 foldlen -= numlen) {
6633 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6634 if (numlen > 0) {
71207a34 6635 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6636 len += unilen;
6637 s += unilen;
6638 /* In EBCDIC the numlen
6639 * and unilen can differ. */
9dc45d57 6640 foldbuf += numlen;
47654450
JH
6641 if (numlen >= foldlen)
6642 break;
9dc45d57
JH
6643 }
6644 else
6645 break;
80aecb99
JH
6646 }
6647 }
6648 else {
71207a34 6649 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6650 if (unilen > 0) {
0ebc6274
JH
6651 s += unilen;
6652 len += unilen;
9dc45d57 6653 }
80aecb99
JH
6654 }
6655 len--;
a0ed51b3
LW
6656 }
6657 else
eb160463 6658 REGC((char)ender, s++);
a0d0e21e
LW
6659 }
6660 loopdone:
830247a4 6661 RExC_parse = p - 1;
fac92740 6662 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6663 nextchar(pRExC_state);
793db0cb
JH
6664 {
6665 /* len is STRLEN which is unsigned, need to copy to signed */
6666 IV iv = len;
6667 if (iv < 0)
6668 vFAIL("Internal disaster");
6669 }
a0d0e21e
LW
6670 if (len > 0)
6671 *flagp |= HASWIDTH;
090f7165 6672 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6673 *flagp |= SIMPLE;
3dab1dad 6674
cd439c50 6675 if (SIZE_ONLY)
830247a4 6676 RExC_size += STR_SZ(len);
3dab1dad
YO
6677 else {
6678 STR_LEN(ret) = len;
830247a4 6679 RExC_emit += STR_SZ(len);
07be1b83 6680 }
3dab1dad 6681 }
a0d0e21e
LW
6682 break;
6683 }
a687059c 6684
a0d0e21e 6685 return(ret);
a687059c
LW
6686}
6687
873ef191 6688STATIC char *
5f66b61c 6689S_regwhite(char *p, const char *e)
5b5a24f7
CS
6690{
6691 while (p < e) {
6692 if (isSPACE(*p))
6693 ++p;
6694 else if (*p == '#') {
6695 do {
6696 p++;
6697 } while (p < e && *p != '\n');
6698 }
6699 else
6700 break;
6701 }
6702 return p;
6703}
6704
b8c5462f
JH
6705/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6706 Character classes ([:foo:]) can also be negated ([:^foo:]).
6707 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6708 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6709 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6710
6711#define POSIXCC_DONE(c) ((c) == ':')
6712#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6713#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6714
b8c5462f 6715STATIC I32
830247a4 6716S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6717{
97aff369 6718 dVAR;
936ed897 6719 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6720
830247a4 6721 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6722 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6723 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6724 const char c = UCHARAT(RExC_parse);
097eb12c 6725 char* const s = RExC_parse++;
b81d288d 6726
9a86a77b 6727 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6728 RExC_parse++;
6729 if (RExC_parse == RExC_end)
620e46c5 6730 /* Grandfather lone [:, [=, [. */
830247a4 6731 RExC_parse = s;
620e46c5 6732 else {
3dab1dad 6733 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6734 assert(*t == c);
6735
9a86a77b 6736 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6737 const char *posixcc = s + 1;
830247a4 6738 RExC_parse++; /* skip over the ending ] */
3dab1dad 6739
b8c5462f 6740 if (*s == ':') {
1df70142
AL
6741 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6742 const I32 skip = t - posixcc;
80916619
NC
6743
6744 /* Initially switch on the length of the name. */
6745 switch (skip) {
6746 case 4:
3dab1dad
YO
6747 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6748 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6749 break;
80916619
NC
6750 case 5:
6751 /* Names all of length 5. */
6752 /* alnum alpha ascii blank cntrl digit graph lower
6753 print punct space upper */
6754 /* Offset 4 gives the best switch position. */
6755 switch (posixcc[4]) {
6756 case 'a':
3dab1dad
YO
6757 if (memEQ(posixcc, "alph", 4)) /* alpha */
6758 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6759 break;
6760 case 'e':
3dab1dad
YO
6761 if (memEQ(posixcc, "spac", 4)) /* space */
6762 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6763 break;
6764 case 'h':
3dab1dad
YO
6765 if (memEQ(posixcc, "grap", 4)) /* graph */
6766 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6767 break;
6768 case 'i':
3dab1dad
YO
6769 if (memEQ(posixcc, "asci", 4)) /* ascii */
6770 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6771 break;
6772 case 'k':
3dab1dad
YO
6773 if (memEQ(posixcc, "blan", 4)) /* blank */
6774 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6775 break;
6776 case 'l':
3dab1dad
YO
6777 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6778 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6779 break;
6780 case 'm':
3dab1dad
YO
6781 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6782 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6783 break;
6784 case 'r':
3dab1dad
YO
6785 if (memEQ(posixcc, "lowe", 4)) /* lower */
6786 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6787 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6788 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6789 break;
6790 case 't':
3dab1dad
YO
6791 if (memEQ(posixcc, "digi", 4)) /* digit */
6792 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6793 else if (memEQ(posixcc, "prin", 4)) /* print */
6794 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6795 else if (memEQ(posixcc, "punc", 4)) /* punct */
6796 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6797 break;
b8c5462f
JH
6798 }
6799 break;
80916619 6800 case 6:
3dab1dad
YO
6801 if (memEQ(posixcc, "xdigit", 6))
6802 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6803 break;
6804 }
80916619
NC
6805
6806 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6807 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6808 t - s - 1, s + 1);
80916619
NC
6809 assert (posixcc[skip] == ':');
6810 assert (posixcc[skip+1] == ']');
b45f050a 6811 } else if (!SIZE_ONLY) {
b8c5462f 6812 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6813
830247a4 6814 /* adjust RExC_parse so the warning shows after
b45f050a 6815 the class closes */
9a86a77b 6816 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6817 RExC_parse++;
b45f050a
JF
6818 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6819 }
b8c5462f
JH
6820 } else {
6821 /* Maternal grandfather:
6822 * "[:" ending in ":" but not in ":]" */
830247a4 6823 RExC_parse = s;
767d463e 6824 }
620e46c5
JH
6825 }
6826 }
6827
b8c5462f
JH
6828 return namedclass;
6829}
6830
6831STATIC void
830247a4 6832S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6833{
97aff369 6834 dVAR;
3dab1dad 6835 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6836 const char *s = RExC_parse;
6837 const char c = *s++;
b8c5462f 6838
3dab1dad 6839 while (isALNUM(*s))
b8c5462f
JH
6840 s++;
6841 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6842 if (ckWARN(WARN_REGEXP))
6843 vWARN3(s+2,
6844 "POSIX syntax [%c %c] belongs inside character classes",
6845 c, c);
b45f050a
JF
6846
6847 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6848 if (POSIXCC_NOTYET(c)) {
830247a4 6849 /* adjust RExC_parse so the error shows after
b45f050a 6850 the class closes */
9a86a77b 6851 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6852 NOOP;
b45f050a
JF
6853 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6854 }
b8c5462f
JH
6855 }
6856 }
620e46c5
JH
6857}
6858
7f6f358c
YO
6859
6860/*
6861 parse a class specification and produce either an ANYOF node that
6862 matches the pattern. If the pattern matches a single char only and
6863 that char is < 256 then we produce an EXACT node instead.
6864*/
76e3520e 6865STATIC regnode *
3dab1dad 6866S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6867{
97aff369 6868 dVAR;
9ef43ace 6869 register UV value = 0;
9a86a77b 6870 register UV nextvalue;
3568d838 6871 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6872 register IV range = 0;
c277df42 6873 register regnode *ret;
ba210ebe 6874 STRLEN numlen;
ffc61ed2 6875 IV namedclass;
cbbf8932 6876 char *rangebegin = NULL;
936ed897 6877 bool need_class = 0;
c445ea15 6878 SV *listsv = NULL;
ffc61ed2 6879 UV n;
9e55ce06 6880 bool optimize_invert = TRUE;
cbbf8932 6881 AV* unicode_alternate = NULL;
1b2d223b
JH
6882#ifdef EBCDIC
6883 UV literal_endpoint = 0;
6884#endif
7f6f358c 6885 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6886
3dab1dad 6887 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6888 case we need to change the emitted regop to an EXACT. */
07be1b83 6889 const char * orig_parse = RExC_parse;
72f13be8 6890 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6891#ifndef DEBUGGING
6892 PERL_UNUSED_ARG(depth);
6893#endif
72f13be8 6894
3dab1dad 6895 DEBUG_PARSE("clas");
7f6f358c
YO
6896
6897 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6898 ret = reganode(pRExC_state, ANYOF, 0);
6899
6900 if (!SIZE_ONLY)
6901 ANYOF_FLAGS(ret) = 0;
6902
9a86a77b 6903 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6904 RExC_naughty++;
6905 RExC_parse++;
6906 if (!SIZE_ONLY)
6907 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6908 }
a0d0e21e 6909
73060fc4 6910 if (SIZE_ONLY) {
830247a4 6911 RExC_size += ANYOF_SKIP;
73060fc4
JH
6912 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6913 }
936ed897 6914 else {
830247a4 6915 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6916 if (FOLD)
6917 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6918 if (LOC)
6919 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6920 ANYOF_BITMAP_ZERO(ret);
396482e1 6921 listsv = newSVpvs("# comment\n");
a0d0e21e 6922 }
b8c5462f 6923
9a86a77b
JH
6924 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6925
b938889d 6926 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6927 checkposixcc(pRExC_state);
b8c5462f 6928
f064b6ad
HS
6929 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6930 if (UCHARAT(RExC_parse) == ']')
6931 goto charclassloop;
ffc61ed2 6932
fc8cd66c 6933parseit:
9a86a77b 6934 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6935
6936 charclassloop:
6937
6938 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6939
73b437c8 6940 if (!range)
830247a4 6941 rangebegin = RExC_parse;
ffc61ed2 6942 if (UTF) {
5e12f4fb 6943 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6944 RExC_end - RExC_parse,
9f7f3913 6945 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6946 RExC_parse += numlen;
6947 }
6948 else
6949 value = UCHARAT(RExC_parse++);
7f6f358c 6950
9a86a77b
JH
6951 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6952 if (value == '[' && POSIXCC(nextvalue))
830247a4 6953 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6954 else if (value == '\\') {
ffc61ed2 6955 if (UTF) {
5e12f4fb 6956 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6957 RExC_end - RExC_parse,
9f7f3913 6958 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6959 RExC_parse += numlen;
6960 }
6961 else
6962 value = UCHARAT(RExC_parse++);
470c3474 6963 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6964 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6965 * be a problem later if we want switch on Unicode.
6966 * A similar issue a little bit later when switching on
6967 * namedclass. --jhi */
ffc61ed2 6968 switch ((I32)value) {
b8c5462f
JH
6969 case 'w': namedclass = ANYOF_ALNUM; break;
6970 case 'W': namedclass = ANYOF_NALNUM; break;
6971 case 's': namedclass = ANYOF_SPACE; break;
6972 case 'S': namedclass = ANYOF_NSPACE; break;
6973 case 'd': namedclass = ANYOF_DIGIT; break;
6974 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6975 case 'N': /* Handle \N{NAME} in class */
6976 {
6977 /* We only pay attention to the first char of
6978 multichar strings being returned. I kinda wonder
6979 if this makes sense as it does change the behaviour
6980 from earlier versions, OTOH that behaviour was broken
6981 as well. */
6982 UV v; /* value is register so we cant & it /grrr */
6983 if (reg_namedseq(pRExC_state, &v)) {
6984 goto parseit;
6985 }
6986 value= v;
6987 }
6988 break;
ffc61ed2
JH
6989 case 'p':
6990 case 'P':
3dab1dad
YO
6991 {
6992 char *e;
af6f566e 6993 if (RExC_parse >= RExC_end)
2a4859cd 6994 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6995 if (*RExC_parse == '{') {
1df70142 6996 const U8 c = (U8)value;
ffc61ed2
JH
6997 e = strchr(RExC_parse++, '}');
6998 if (!e)
0da60cf5 6999 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7000 while (isSPACE(UCHARAT(RExC_parse)))
7001 RExC_parse++;
7002 if (e == RExC_parse)
0da60cf5 7003 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7004 n = e - RExC_parse;
ab13f0c7
JH
7005 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7006 n--;
ffc61ed2
JH
7007 }
7008 else {
7009 e = RExC_parse;
7010 n = 1;
7011 }
7012 if (!SIZE_ONLY) {
ab13f0c7
JH
7013 if (UCHARAT(RExC_parse) == '^') {
7014 RExC_parse++;
7015 n--;
7016 value = value == 'p' ? 'P' : 'p'; /* toggle */
7017 while (isSPACE(UCHARAT(RExC_parse))) {
7018 RExC_parse++;
7019 n--;
7020 }
7021 }
097eb12c
AL
7022 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7023 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7024 }
7025 RExC_parse = e + 1;
7026 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7027 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7028 }
f81125e2 7029 break;
b8c5462f
JH
7030 case 'n': value = '\n'; break;
7031 case 'r': value = '\r'; break;
7032 case 't': value = '\t'; break;
7033 case 'f': value = '\f'; break;
7034 case 'b': value = '\b'; break;
c7f1f016
NIS
7035 case 'e': value = ASCII_TO_NATIVE('\033');break;
7036 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7037 case 'x':
ffc61ed2 7038 if (*RExC_parse == '{') {
a4c04bdc
NC
7039 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7040 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7041 char * const e = strchr(RExC_parse++, '}');
b81d288d 7042 if (!e)
ffc61ed2 7043 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7044
7045 numlen = e - RExC_parse;
7046 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7047 RExC_parse = e + 1;
7048 }
7049 else {
a4c04bdc 7050 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7051 numlen = 2;
7052 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7053 RExC_parse += numlen;
7054 }
9e08bc66
TS
7055 if (PL_encoding && value < 0x100)
7056 goto recode_encoding;
b8c5462f
JH
7057 break;
7058 case 'c':
830247a4 7059 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7060 value = toCTRL(value);
7061 break;
7062 case '0': case '1': case '2': case '3': case '4':
7063 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7064 {
7065 I32 flags = 0;
7066 numlen = 3;
7067 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7068 RExC_parse += numlen;
7069 if (PL_encoding && value < 0x100)
7070 goto recode_encoding;
7071 break;
7072 }
7073 recode_encoding:
7074 {
7075 SV* enc = PL_encoding;
7076 value = reg_recode((const char)(U8)value, &enc);
7077 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7078 vWARN(RExC_parse,
7079 "Invalid escape in the specified encoding");
7080 break;
7081 }
1028017a 7082 default:
041457d9 7083 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7084 vWARN2(RExC_parse,
7085 "Unrecognized escape \\%c in character class passed through",
7086 (int)value);
1028017a 7087 break;
b8c5462f 7088 }
ffc61ed2 7089 } /* end of \blah */
1b2d223b
JH
7090#ifdef EBCDIC
7091 else
7092 literal_endpoint++;
7093#endif
ffc61ed2
JH
7094
7095 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7096
7097 if (!SIZE_ONLY && !need_class)
936ed897 7098 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7099
936ed897 7100 need_class = 1;
ffc61ed2
JH
7101
7102 /* a bad range like a-\d, a-[:digit:] ? */
7103 if (range) {
73b437c8 7104 if (!SIZE_ONLY) {
afd78fd5 7105 if (ckWARN(WARN_REGEXP)) {
097eb12c 7106 const int w =
afd78fd5
JH
7107 RExC_parse >= rangebegin ?
7108 RExC_parse - rangebegin : 0;
830247a4 7109 vWARN4(RExC_parse,
b45f050a 7110 "False [] range \"%*.*s\"",
097eb12c 7111 w, w, rangebegin);
afd78fd5 7112 }
3568d838
JH
7113 if (prevvalue < 256) {
7114 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7115 ANYOF_BITMAP_SET(ret, '-');
7116 }
7117 else {
7118 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7119 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7120 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7121 }
b8c5462f 7122 }
ffc61ed2
JH
7123
7124 range = 0; /* this was not a true range */
73b437c8 7125 }
ffc61ed2 7126
73b437c8 7127 if (!SIZE_ONLY) {
c49a72a9
NC
7128 const char *what = NULL;
7129 char yesno = 0;
7130
3568d838
JH
7131 if (namedclass > OOB_NAMEDCLASS)
7132 optimize_invert = FALSE;
e2962f66
JH
7133 /* Possible truncation here but in some 64-bit environments
7134 * the compiler gets heartburn about switch on 64-bit values.
7135 * A similar issue a little earlier when switching on value.
98f323fa 7136 * --jhi */
e2962f66 7137 switch ((I32)namedclass) {
73b437c8
JH
7138 case ANYOF_ALNUM:
7139 if (LOC)
936ed897 7140 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7141 else {
7142 for (value = 0; value < 256; value++)
7143 if (isALNUM(value))
936ed897 7144 ANYOF_BITMAP_SET(ret, value);
73b437c8 7145 }
c49a72a9
NC
7146 yesno = '+';
7147 what = "Word";
73b437c8
JH
7148 break;
7149 case ANYOF_NALNUM:
7150 if (LOC)
936ed897 7151 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7152 else {
7153 for (value = 0; value < 256; value++)
7154 if (!isALNUM(value))
936ed897 7155 ANYOF_BITMAP_SET(ret, value);
73b437c8 7156 }
c49a72a9
NC
7157 yesno = '!';
7158 what = "Word";
73b437c8 7159 break;
ffc61ed2 7160 case ANYOF_ALNUMC:
73b437c8 7161 if (LOC)
ffc61ed2 7162 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7163 else {
7164 for (value = 0; value < 256; value++)
ffc61ed2 7165 if (isALNUMC(value))
936ed897 7166 ANYOF_BITMAP_SET(ret, value);
73b437c8 7167 }
c49a72a9
NC
7168 yesno = '+';
7169 what = "Alnum";
73b437c8
JH
7170 break;
7171 case ANYOF_NALNUMC:
7172 if (LOC)
936ed897 7173 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7174 else {
7175 for (value = 0; value < 256; value++)
7176 if (!isALNUMC(value))
936ed897 7177 ANYOF_BITMAP_SET(ret, value);
73b437c8 7178 }
c49a72a9
NC
7179 yesno = '!';
7180 what = "Alnum";
73b437c8
JH
7181 break;
7182 case ANYOF_ALPHA:
7183 if (LOC)
936ed897 7184 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7185 else {
7186 for (value = 0; value < 256; value++)
7187 if (isALPHA(value))
936ed897 7188 ANYOF_BITMAP_SET(ret, value);
73b437c8 7189 }
c49a72a9
NC
7190 yesno = '+';
7191 what = "Alpha";
73b437c8
JH
7192 break;
7193 case ANYOF_NALPHA:
7194 if (LOC)
936ed897 7195 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7196 else {
7197 for (value = 0; value < 256; value++)
7198 if (!isALPHA(value))
936ed897 7199 ANYOF_BITMAP_SET(ret, value);
73b437c8 7200 }
c49a72a9
NC
7201 yesno = '!';
7202 what = "Alpha";
73b437c8
JH
7203 break;
7204 case ANYOF_ASCII:
7205 if (LOC)
936ed897 7206 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7207 else {
c7f1f016 7208#ifndef EBCDIC
1ba5c669
JH
7209 for (value = 0; value < 128; value++)
7210 ANYOF_BITMAP_SET(ret, value);
7211#else /* EBCDIC */
ffbc6a93 7212 for (value = 0; value < 256; value++) {
3a3c4447
JH
7213 if (isASCII(value))
7214 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7215 }
1ba5c669 7216#endif /* EBCDIC */
73b437c8 7217 }
c49a72a9
NC
7218 yesno = '+';
7219 what = "ASCII";
73b437c8
JH
7220 break;
7221 case ANYOF_NASCII:
7222 if (LOC)
936ed897 7223 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7224 else {
c7f1f016 7225#ifndef EBCDIC
1ba5c669
JH
7226 for (value = 128; value < 256; value++)
7227 ANYOF_BITMAP_SET(ret, value);
7228#else /* EBCDIC */
ffbc6a93 7229 for (value = 0; value < 256; value++) {
3a3c4447
JH
7230 if (!isASCII(value))
7231 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7232 }
1ba5c669 7233#endif /* EBCDIC */
73b437c8 7234 }
c49a72a9
NC
7235 yesno = '!';
7236 what = "ASCII";
73b437c8 7237 break;
aaa51d5e
JF
7238 case ANYOF_BLANK:
7239 if (LOC)
7240 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7241 else {
7242 for (value = 0; value < 256; value++)
7243 if (isBLANK(value))
7244 ANYOF_BITMAP_SET(ret, value);
7245 }
c49a72a9
NC
7246 yesno = '+';
7247 what = "Blank";
aaa51d5e
JF
7248 break;
7249 case ANYOF_NBLANK:
7250 if (LOC)
7251 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7252 else {
7253 for (value = 0; value < 256; value++)
7254 if (!isBLANK(value))
7255 ANYOF_BITMAP_SET(ret, value);
7256 }
c49a72a9
NC
7257 yesno = '!';
7258 what = "Blank";
aaa51d5e 7259 break;
73b437c8
JH
7260 case ANYOF_CNTRL:
7261 if (LOC)
936ed897 7262 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7263 else {
7264 for (value = 0; value < 256; value++)
7265 if (isCNTRL(value))
936ed897 7266 ANYOF_BITMAP_SET(ret, value);
73b437c8 7267 }
c49a72a9
NC
7268 yesno = '+';
7269 what = "Cntrl";
73b437c8
JH
7270 break;
7271 case ANYOF_NCNTRL:
7272 if (LOC)
936ed897 7273 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7274 else {
7275 for (value = 0; value < 256; value++)
7276 if (!isCNTRL(value))
936ed897 7277 ANYOF_BITMAP_SET(ret, value);
73b437c8 7278 }
c49a72a9
NC
7279 yesno = '!';
7280 what = "Cntrl";
ffc61ed2
JH
7281 break;
7282 case ANYOF_DIGIT:
7283 if (LOC)
7284 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7285 else {
7286 /* consecutive digits assumed */
7287 for (value = '0'; value <= '9'; value++)
7288 ANYOF_BITMAP_SET(ret, value);
7289 }
c49a72a9
NC
7290 yesno = '+';
7291 what = "Digit";
ffc61ed2
JH
7292 break;
7293 case ANYOF_NDIGIT:
7294 if (LOC)
7295 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7296 else {
7297 /* consecutive digits assumed */
7298 for (value = 0; value < '0'; value++)
7299 ANYOF_BITMAP_SET(ret, value);
7300 for (value = '9' + 1; value < 256; value++)
7301 ANYOF_BITMAP_SET(ret, value);
7302 }
c49a72a9
NC
7303 yesno = '!';
7304 what = "Digit";
73b437c8
JH
7305 break;
7306 case ANYOF_GRAPH:
7307 if (LOC)
936ed897 7308 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7309 else {
7310 for (value = 0; value < 256; value++)
7311 if (isGRAPH(value))
936ed897 7312 ANYOF_BITMAP_SET(ret, value);
73b437c8 7313 }
c49a72a9
NC
7314 yesno = '+';
7315 what = "Graph";
73b437c8
JH
7316 break;
7317 case ANYOF_NGRAPH:
7318 if (LOC)
936ed897 7319 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7320 else {
7321 for (value = 0; value < 256; value++)
7322 if (!isGRAPH(value))
936ed897 7323 ANYOF_BITMAP_SET(ret, value);
73b437c8 7324 }
c49a72a9
NC
7325 yesno = '!';
7326 what = "Graph";
73b437c8
JH
7327 break;
7328 case ANYOF_LOWER:
7329 if (LOC)
936ed897 7330 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7331 else {
7332 for (value = 0; value < 256; value++)
7333 if (isLOWER(value))
936ed897 7334 ANYOF_BITMAP_SET(ret, value);
73b437c8 7335 }
c49a72a9
NC
7336 yesno = '+';
7337 what = "Lower";
73b437c8
JH
7338 break;
7339 case ANYOF_NLOWER:
7340 if (LOC)
936ed897 7341 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7342 else {
7343 for (value = 0; value < 256; value++)
7344 if (!isLOWER(value))
936ed897 7345 ANYOF_BITMAP_SET(ret, value);
73b437c8 7346 }
c49a72a9
NC
7347 yesno = '!';
7348 what = "Lower";
73b437c8
JH
7349 break;
7350 case ANYOF_PRINT:
7351 if (LOC)
936ed897 7352 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7353 else {
7354 for (value = 0; value < 256; value++)
7355 if (isPRINT(value))
936ed897 7356 ANYOF_BITMAP_SET(ret, value);
73b437c8 7357 }
c49a72a9
NC
7358 yesno = '+';
7359 what = "Print";
73b437c8
JH
7360 break;
7361 case ANYOF_NPRINT:
7362 if (LOC)
936ed897 7363 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7364 else {
7365 for (value = 0; value < 256; value++)
7366 if (!isPRINT(value))
936ed897 7367 ANYOF_BITMAP_SET(ret, value);
73b437c8 7368 }
c49a72a9
NC
7369 yesno = '!';
7370 what = "Print";
73b437c8 7371 break;
aaa51d5e
JF
7372 case ANYOF_PSXSPC:
7373 if (LOC)
7374 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7375 else {
7376 for (value = 0; value < 256; value++)
7377 if (isPSXSPC(value))
7378 ANYOF_BITMAP_SET(ret, value);
7379 }
c49a72a9
NC
7380 yesno = '+';
7381 what = "Space";
aaa51d5e
JF
7382 break;
7383 case ANYOF_NPSXSPC:
7384 if (LOC)
7385 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7386 else {
7387 for (value = 0; value < 256; value++)
7388 if (!isPSXSPC(value))
7389 ANYOF_BITMAP_SET(ret, value);
7390 }
c49a72a9
NC
7391 yesno = '!';
7392 what = "Space";
aaa51d5e 7393 break;
73b437c8
JH
7394 case ANYOF_PUNCT:
7395 if (LOC)
936ed897 7396 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7397 else {
7398 for (value = 0; value < 256; value++)
7399 if (isPUNCT(value))
936ed897 7400 ANYOF_BITMAP_SET(ret, value);
73b437c8 7401 }
c49a72a9
NC
7402 yesno = '+';
7403 what = "Punct";
73b437c8
JH
7404 break;
7405 case ANYOF_NPUNCT:
7406 if (LOC)
936ed897 7407 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7408 else {
7409 for (value = 0; value < 256; value++)
7410 if (!isPUNCT(value))
936ed897 7411 ANYOF_BITMAP_SET(ret, value);
73b437c8 7412 }
c49a72a9
NC
7413 yesno = '!';
7414 what = "Punct";
ffc61ed2
JH
7415 break;
7416 case ANYOF_SPACE:
7417 if (LOC)
7418 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7419 else {
7420 for (value = 0; value < 256; value++)
7421 if (isSPACE(value))
7422 ANYOF_BITMAP_SET(ret, value);
7423 }
c49a72a9
NC
7424 yesno = '+';
7425 what = "SpacePerl";
ffc61ed2
JH
7426 break;
7427 case ANYOF_NSPACE:
7428 if (LOC)
7429 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7430 else {
7431 for (value = 0; value < 256; value++)
7432 if (!isSPACE(value))
7433 ANYOF_BITMAP_SET(ret, value);
7434 }
c49a72a9
NC
7435 yesno = '!';
7436 what = "SpacePerl";
73b437c8
JH
7437 break;
7438 case ANYOF_UPPER:
7439 if (LOC)
936ed897 7440 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7441 else {
7442 for (value = 0; value < 256; value++)
7443 if (isUPPER(value))
936ed897 7444 ANYOF_BITMAP_SET(ret, value);
73b437c8 7445 }
c49a72a9
NC
7446 yesno = '+';
7447 what = "Upper";
73b437c8
JH
7448 break;
7449 case ANYOF_NUPPER:
7450 if (LOC)
936ed897 7451 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7452 else {
7453 for (value = 0; value < 256; value++)
7454 if (!isUPPER(value))
936ed897 7455 ANYOF_BITMAP_SET(ret, value);
73b437c8 7456 }
c49a72a9
NC
7457 yesno = '!';
7458 what = "Upper";
73b437c8
JH
7459 break;
7460 case ANYOF_XDIGIT:
7461 if (LOC)
936ed897 7462 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7463 else {
7464 for (value = 0; value < 256; value++)
7465 if (isXDIGIT(value))
936ed897 7466 ANYOF_BITMAP_SET(ret, value);
73b437c8 7467 }
c49a72a9
NC
7468 yesno = '+';
7469 what = "XDigit";
73b437c8
JH
7470 break;
7471 case ANYOF_NXDIGIT:
7472 if (LOC)
936ed897 7473 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7474 else {
7475 for (value = 0; value < 256; value++)
7476 if (!isXDIGIT(value))
936ed897 7477 ANYOF_BITMAP_SET(ret, value);
73b437c8 7478 }
c49a72a9
NC
7479 yesno = '!';
7480 what = "XDigit";
73b437c8 7481 break;
f81125e2
JP
7482 case ANYOF_MAX:
7483 /* this is to handle \p and \P */
7484 break;
73b437c8 7485 default:
b45f050a 7486 vFAIL("Invalid [::] class");
73b437c8 7487 break;
b8c5462f 7488 }
c49a72a9
NC
7489 if (what) {
7490 /* Strings such as "+utf8::isWord\n" */
7491 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7492 }
b8c5462f 7493 if (LOC)
936ed897 7494 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7495 continue;
a0d0e21e 7496 }
ffc61ed2
JH
7497 } /* end of namedclass \blah */
7498
a0d0e21e 7499 if (range) {
eb160463 7500 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7501 const int w = RExC_parse - rangebegin;
7502 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7503 range = 0; /* not a valid range */
73b437c8 7504 }
a0d0e21e
LW
7505 }
7506 else {
3568d838 7507 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7508 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7509 RExC_parse[1] != ']') {
7510 RExC_parse++;
ffc61ed2
JH
7511
7512 /* a bad range like \w-, [:word:]- ? */
7513 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7514 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7515 const int w =
afd78fd5
JH
7516 RExC_parse >= rangebegin ?
7517 RExC_parse - rangebegin : 0;
830247a4 7518 vWARN4(RExC_parse,
b45f050a 7519 "False [] range \"%*.*s\"",
097eb12c 7520 w, w, rangebegin);
afd78fd5 7521 }
73b437c8 7522 if (!SIZE_ONLY)
936ed897 7523 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7524 } else
ffc61ed2
JH
7525 range = 1; /* yeah, it's a range! */
7526 continue; /* but do it the next time */
a0d0e21e 7527 }
a687059c 7528 }
ffc61ed2 7529
93733859 7530 /* now is the next time */
07be1b83 7531 /*stored += (value - prevvalue + 1);*/
ae5c130c 7532 if (!SIZE_ONLY) {
3568d838 7533 if (prevvalue < 256) {
1df70142 7534 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7535 IV i;
3568d838 7536#ifdef EBCDIC
1b2d223b
JH
7537 /* In EBCDIC [\x89-\x91] should include
7538 * the \x8e but [i-j] should not. */
7539 if (literal_endpoint == 2 &&
7540 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7541 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7542 {
3568d838
JH
7543 if (isLOWER(prevvalue)) {
7544 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7545 if (isLOWER(i))
7546 ANYOF_BITMAP_SET(ret, i);
7547 } else {
3568d838 7548 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7549 if (isUPPER(i))
7550 ANYOF_BITMAP_SET(ret, i);
7551 }
8ada0baa 7552 }
ffc61ed2 7553 else
8ada0baa 7554#endif
07be1b83
YO
7555 for (i = prevvalue; i <= ceilvalue; i++) {
7556 if (!ANYOF_BITMAP_TEST(ret,i)) {
7557 stored++;
7558 ANYOF_BITMAP_SET(ret, i);
7559 }
7560 }
3568d838 7561 }
a5961de5 7562 if (value > 255 || UTF) {
1df70142
AL
7563 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7564 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7565 stored+=2; /* can't optimize this class */
ffc61ed2 7566 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7567 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7568 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7569 prevnatvalue, natvalue);
7570 }
7571 else if (prevnatvalue == natvalue) {
7572 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7573 if (FOLD) {
89ebb4a3 7574 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7575 STRLEN foldlen;
1df70142 7576 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7577
e294cc5d
JH
7578#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7579 if (RExC_precomp[0] == ':' &&
7580 RExC_precomp[1] == '[' &&
7581 (f == 0xDF || f == 0x92)) {
7582 f = NATIVE_TO_UNI(f);
7583 }
7584#endif
c840d2a2
JH
7585 /* If folding and foldable and a single
7586 * character, insert also the folded version
7587 * to the charclass. */
9e55ce06 7588 if (f != value) {
e294cc5d
JH
7589#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7590 if ((RExC_precomp[0] == ':' &&
7591 RExC_precomp[1] == '[' &&
7592 (f == 0xA2 &&
7593 (value == 0xFB05 || value == 0xFB06))) ?
7594 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7595 foldlen == (STRLEN)UNISKIP(f) )
7596#else
eb160463 7597 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7598#endif
9e55ce06
JH
7599 Perl_sv_catpvf(aTHX_ listsv,
7600 "%04"UVxf"\n", f);
7601 else {
7602 /* Any multicharacter foldings
7603 * require the following transform:
7604 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7605 * where E folds into "pq" and F folds
7606 * into "rst", all other characters
7607 * fold to single characters. We save
7608 * away these multicharacter foldings,
7609 * to be later saved as part of the
7610 * additional "s" data. */
7611 SV *sv;
7612
7613 if (!unicode_alternate)
7614 unicode_alternate = newAV();
7615 sv = newSVpvn((char*)foldbuf, foldlen);
7616 SvUTF8_on(sv);
7617 av_push(unicode_alternate, sv);
7618 }
7619 }
254ba52a 7620
60a8b682
JH
7621 /* If folding and the value is one of the Greek
7622 * sigmas insert a few more sigmas to make the
7623 * folding rules of the sigmas to work right.
7624 * Note that not all the possible combinations
7625 * are handled here: some of them are handled
9e55ce06
JH
7626 * by the standard folding rules, and some of
7627 * them (literal or EXACTF cases) are handled
7628 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7629 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7630 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7631 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7632 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7633 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7634 }
7635 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7636 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7637 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7638 }
7639 }
ffc61ed2 7640 }
1b2d223b
JH
7641#ifdef EBCDIC
7642 literal_endpoint = 0;
7643#endif
8ada0baa 7644 }
ffc61ed2
JH
7645
7646 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7647 }
ffc61ed2 7648
936ed897 7649 if (need_class) {
4f66b38d 7650 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7651 if (SIZE_ONLY)
830247a4 7652 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7653 else
830247a4 7654 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7655 }
ffc61ed2 7656
7f6f358c
YO
7657
7658 if (SIZE_ONLY)
7659 return ret;
7660 /****** !SIZE_ONLY AFTER HERE *********/
7661
7662 if( stored == 1 && value < 256
7663 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7664 ) {
7665 /* optimize single char class to an EXACT node
7666 but *only* when its not a UTF/high char */
07be1b83
YO
7667 const char * cur_parse= RExC_parse;
7668 RExC_emit = (regnode *)orig_emit;
7669 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7670 ret = reg_node(pRExC_state,
7671 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7672 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7673 *STRING(ret)= (char)value;
7674 STR_LEN(ret)= 1;
7675 RExC_emit += STR_SZ(1);
7676 return ret;
7677 }
ae5c130c 7678 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7679 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7680 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7681 ) {
a0ed51b3 7682 for (value = 0; value < 256; ++value) {
936ed897 7683 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7684 UV fold = PL_fold[value];
ffc61ed2
JH
7685
7686 if (fold != value)
7687 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7688 }
7689 }
936ed897 7690 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7691 }
ffc61ed2 7692
ae5c130c 7693 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7694 if (optimize_invert &&
ffc61ed2
JH
7695 /* If the only flag is inversion. */
7696 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7697 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7698 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7699 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7700 }
7f6f358c 7701 {
097eb12c 7702 AV * const av = newAV();
ffc61ed2 7703 SV *rv;
9e55ce06 7704 /* The 0th element stores the character class description
6a0407ee 7705 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7706 * to initialize the appropriate swash (which gets stored in
7707 * the 1st element), and also useful for dumping the regnode.
7708 * The 2nd element stores the multicharacter foldings,
6a0407ee 7709 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7710 av_store(av, 0, listsv);
7711 av_store(av, 1, NULL);
9e55ce06 7712 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7713 rv = newRV_noinc((SV*)av);
19860706 7714 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7715 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7716 ARG_SET(ret, n);
a0ed51b3 7717 }
a0ed51b3
LW
7718 return ret;
7719}
7720
76e3520e 7721STATIC char*
830247a4 7722S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7723{
097eb12c 7724 char* const retval = RExC_parse++;
a0d0e21e 7725
4633a7c4 7726 for (;;) {
830247a4
IZ
7727 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7728 RExC_parse[2] == '#') {
e994fd66
AE
7729 while (*RExC_parse != ')') {
7730 if (RExC_parse == RExC_end)
7731 FAIL("Sequence (?#... not terminated");
830247a4 7732 RExC_parse++;
e994fd66 7733 }
830247a4 7734 RExC_parse++;
4633a7c4
LW
7735 continue;
7736 }
bbe252da 7737 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7738 if (isSPACE(*RExC_parse)) {
7739 RExC_parse++;
748a9306
LW
7740 continue;
7741 }
830247a4 7742 else if (*RExC_parse == '#') {
e994fd66
AE
7743 while (RExC_parse < RExC_end)
7744 if (*RExC_parse++ == '\n') break;
748a9306
LW
7745 continue;
7746 }
748a9306 7747 }
4633a7c4 7748 return retval;
a0d0e21e 7749 }
a687059c
LW
7750}
7751
7752/*
c277df42 7753- reg_node - emit a node
a0d0e21e 7754*/
76e3520e 7755STATIC regnode * /* Location. */
830247a4 7756S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7757{
97aff369 7758 dVAR;
c277df42 7759 register regnode *ptr;
504618e9 7760 regnode * const ret = RExC_emit;
07be1b83 7761 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7762
c277df42 7763 if (SIZE_ONLY) {
830247a4
IZ
7764 SIZE_ALIGN(RExC_size);
7765 RExC_size += 1;
a0d0e21e
LW
7766 return(ret);
7767 }
e2e6a0f1
YO
7768#ifdef DEBUGGING
7769 if (OP(RExC_emit) == 255)
7770 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7771 reg_name[op], OP(RExC_emit));
7772#endif
c277df42 7773 NODE_ALIGN_FILL(ret);
a0d0e21e 7774 ptr = ret;
c277df42 7775 FILL_ADVANCE_NODE(ptr, op);
fac92740 7776 if (RExC_offsets) { /* MJD */
07be1b83 7777 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7778 "reg_node", __LINE__,
7779 reg_name[op],
07be1b83
YO
7780 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7781 ? "Overwriting end of array!\n" : "OK",
7782 (UV)(RExC_emit - RExC_emit_start),
7783 (UV)(RExC_parse - RExC_start),
7784 (UV)RExC_offsets[0]));
ccb2c380 7785 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7786 }
07be1b83 7787
830247a4 7788 RExC_emit = ptr;
a0d0e21e 7789 return(ret);
a687059c
LW
7790}
7791
7792/*
a0d0e21e
LW
7793- reganode - emit a node with an argument
7794*/
76e3520e 7795STATIC regnode * /* Location. */
830247a4 7796S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7797{
97aff369 7798 dVAR;
c277df42 7799 register regnode *ptr;
504618e9 7800 regnode * const ret = RExC_emit;
07be1b83 7801 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7802
c277df42 7803 if (SIZE_ONLY) {
830247a4
IZ
7804 SIZE_ALIGN(RExC_size);
7805 RExC_size += 2;
6bda09f9
YO
7806 /*
7807 We can't do this:
7808
7809 assert(2==regarglen[op]+1);
7810
7811 Anything larger than this has to allocate the extra amount.
7812 If we changed this to be:
7813
7814 RExC_size += (1 + regarglen[op]);
7815
7816 then it wouldn't matter. Its not clear what side effect
7817 might come from that so its not done so far.
7818 -- dmq
7819 */
a0d0e21e
LW
7820 return(ret);
7821 }
e2e6a0f1
YO
7822#ifdef DEBUGGING
7823 if (OP(RExC_emit) == 255)
7824 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7825#endif
c277df42 7826 NODE_ALIGN_FILL(ret);
a0d0e21e 7827 ptr = ret;
c277df42 7828 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7829 if (RExC_offsets) { /* MJD */
07be1b83 7830 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7831 "reganode",
ccb2c380
MP
7832 __LINE__,
7833 reg_name[op],
07be1b83 7834 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7835 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7836 (UV)(RExC_emit - RExC_emit_start),
7837 (UV)(RExC_parse - RExC_start),
7838 (UV)RExC_offsets[0]));
ccb2c380 7839 Set_Cur_Node_Offset;
fac92740
MJD
7840 }
7841
830247a4 7842 RExC_emit = ptr;
a0d0e21e 7843 return(ret);
fe14fcc3
LW
7844}
7845
7846/*
cd439c50 7847- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7848*/
71207a34
AL
7849STATIC STRLEN
7850S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7851{
97aff369 7852 dVAR;
71207a34 7853 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7854}
7855
7856/*
a0d0e21e
LW
7857- reginsert - insert an operator in front of already-emitted operand
7858*
7859* Means relocating the operand.
7860*/
76e3520e 7861STATIC void
6bda09f9 7862S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7863{
97aff369 7864 dVAR;
c277df42
IZ
7865 register regnode *src;
7866 register regnode *dst;
7867 register regnode *place;
504618e9 7868 const int offset = regarglen[(U8)op];
6bda09f9 7869 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7870 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7871/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7872 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7873 if (SIZE_ONLY) {
6bda09f9 7874 RExC_size += size;
a0d0e21e
LW
7875 return;
7876 }
a687059c 7877
830247a4 7878 src = RExC_emit;
6bda09f9 7879 RExC_emit += size;
830247a4 7880 dst = RExC_emit;
40d049e4 7881 if (RExC_open_parens) {
6bda09f9 7882 int paren;
6d99fb9b 7883 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7884 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7885 if ( RExC_open_parens[paren] >= opnd ) {
7886 DEBUG_PARSE_FMT("open"," - %d",size);
7887 RExC_open_parens[paren] += size;
7888 } else {
7889 DEBUG_PARSE_FMT("open"," - %s","ok");
7890 }
7891 if ( RExC_close_parens[paren] >= opnd ) {
7892 DEBUG_PARSE_FMT("close"," - %d",size);
7893 RExC_close_parens[paren] += size;
7894 } else {
7895 DEBUG_PARSE_FMT("close"," - %s","ok");
7896 }
7897 }
6bda09f9 7898 }
40d049e4 7899
fac92740 7900 while (src > opnd) {
c277df42 7901 StructCopy(--src, --dst, regnode);
fac92740 7902 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7903 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7904 "reg_insert",
ccb2c380
MP
7905 __LINE__,
7906 reg_name[op],
07be1b83
YO
7907 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7908 ? "Overwriting end of array!\n" : "OK",
7909 (UV)(src - RExC_emit_start),
7910 (UV)(dst - RExC_emit_start),
7911 (UV)RExC_offsets[0]));
ccb2c380
MP
7912 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7913 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7914 }
7915 }
7916
a0d0e21e
LW
7917
7918 place = opnd; /* Op node, where operand used to be. */
fac92740 7919 if (RExC_offsets) { /* MJD */
07be1b83 7920 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7921 "reginsert",
ccb2c380
MP
7922 __LINE__,
7923 reg_name[op],
07be1b83 7924 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7925 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7926 (UV)(place - RExC_emit_start),
7927 (UV)(RExC_parse - RExC_start),
786e8c11 7928 (UV)RExC_offsets[0]));
ccb2c380 7929 Set_Node_Offset(place, RExC_parse);
45948336 7930 Set_Node_Length(place, 1);
fac92740 7931 }
c277df42
IZ
7932 src = NEXTOPER(place);
7933 FILL_ADVANCE_NODE(place, op);
7934 Zero(src, offset, regnode);
a687059c
LW
7935}
7936
7937/*
c277df42 7938- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7939- SEE ALSO: regtail_study
a0d0e21e 7940*/
097eb12c 7941/* TODO: All three parms should be const */
76e3520e 7942STATIC void
3dab1dad 7943S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7944{
97aff369 7945 dVAR;
c277df42 7946 register regnode *scan;
72f13be8 7947 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7948#ifndef DEBUGGING
7949 PERL_UNUSED_ARG(depth);
7950#endif
a0d0e21e 7951
c277df42 7952 if (SIZE_ONLY)
a0d0e21e
LW
7953 return;
7954
7955 /* Find last node. */
7956 scan = p;
7957 for (;;) {
504618e9 7958 regnode * const temp = regnext(scan);
3dab1dad
YO
7959 DEBUG_PARSE_r({
7960 SV * const mysv=sv_newmortal();
7961 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7962 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7963 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7964 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7965 (temp == NULL ? "->" : ""),
7966 (temp == NULL ? reg_name[OP(val)] : "")
7967 );
3dab1dad
YO
7968 });
7969 if (temp == NULL)
7970 break;
7971 scan = temp;
7972 }
7973
7974 if (reg_off_by_arg[OP(scan)]) {
7975 ARG_SET(scan, val - scan);
7976 }
7977 else {
7978 NEXT_OFF(scan) = val - scan;
7979 }
7980}
7981
07be1b83 7982#ifdef DEBUGGING
3dab1dad
YO
7983/*
7984- regtail_study - set the next-pointer at the end of a node chain of p to val.
7985- Look for optimizable sequences at the same time.
7986- currently only looks for EXACT chains.
07be1b83
YO
7987
7988This is expermental code. The idea is to use this routine to perform
7989in place optimizations on branches and groups as they are constructed,
7990with the long term intention of removing optimization from study_chunk so
7991that it is purely analytical.
7992
7993Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7994to control which is which.
7995
3dab1dad
YO
7996*/
7997/* TODO: All four parms should be const */
07be1b83 7998
3dab1dad
YO
7999STATIC U8
8000S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8001{
8002 dVAR;
8003 register regnode *scan;
07be1b83
YO
8004 U8 exact = PSEUDO;
8005#ifdef EXPERIMENTAL_INPLACESCAN
8006 I32 min = 0;
8007#endif
8008
3dab1dad
YO
8009 GET_RE_DEBUG_FLAGS_DECL;
8010
07be1b83 8011
3dab1dad
YO
8012 if (SIZE_ONLY)
8013 return exact;
8014
8015 /* Find last node. */
8016
8017 scan = p;
8018 for (;;) {
8019 regnode * const temp = regnext(scan);
07be1b83
YO
8020#ifdef EXPERIMENTAL_INPLACESCAN
8021 if (PL_regkind[OP(scan)] == EXACT)
8022 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8023 return EXACT;
8024#endif
3dab1dad
YO
8025 if ( exact ) {
8026 switch (OP(scan)) {
8027 case EXACT:
8028 case EXACTF:
8029 case EXACTFL:
8030 if( exact == PSEUDO )
8031 exact= OP(scan);
07be1b83
YO
8032 else if ( exact != OP(scan) )
8033 exact= 0;
3dab1dad
YO
8034 case NOTHING:
8035 break;
8036 default:
8037 exact= 0;
8038 }
8039 }
8040 DEBUG_PARSE_r({
8041 SV * const mysv=sv_newmortal();
8042 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8043 regprop(RExC_rx, mysv, scan);
eaf3ca90 8044 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8045 SvPV_nolen_const(mysv),
eaf3ca90
YO
8046 REG_NODE_NUM(scan),
8047 reg_name[exact]);
3dab1dad 8048 });
a0d0e21e
LW
8049 if (temp == NULL)
8050 break;
8051 scan = temp;
8052 }
07be1b83
YO
8053 DEBUG_PARSE_r({
8054 SV * const mysv_val=sv_newmortal();
8055 DEBUG_PARSE_MSG("");
8056 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8057 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8058 SvPV_nolen_const(mysv_val),
8059 (IV)REG_NODE_NUM(val),
8060 (IV)(val - scan)
07be1b83
YO
8061 );
8062 });
c277df42
IZ
8063 if (reg_off_by_arg[OP(scan)]) {
8064 ARG_SET(scan, val - scan);
a0ed51b3
LW
8065 }
8066 else {
c277df42
IZ
8067 NEXT_OFF(scan) = val - scan;
8068 }
3dab1dad
YO
8069
8070 return exact;
a687059c 8071}
07be1b83 8072#endif
a687059c
LW
8073
8074/*
a687059c
LW
8075 - regcurly - a little FSA that accepts {\d+,?\d*}
8076 */
79072805 8077STATIC I32
5f66b61c 8078S_regcurly(register const char *s)
a687059c
LW
8079{
8080 if (*s++ != '{')
8081 return FALSE;
f0fcb552 8082 if (!isDIGIT(*s))
a687059c 8083 return FALSE;
f0fcb552 8084 while (isDIGIT(*s))
a687059c
LW
8085 s++;
8086 if (*s == ',')
8087 s++;
f0fcb552 8088 while (isDIGIT(*s))
a687059c
LW
8089 s++;
8090 if (*s != '}')
8091 return FALSE;
8092 return TRUE;
8093}
8094
a687059c
LW
8095
8096/*
fd181c75 8097 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8098 */
8099void
097eb12c 8100Perl_regdump(pTHX_ const regexp *r)
a687059c 8101{
35ff7856 8102#ifdef DEBUGGING
97aff369 8103 dVAR;
c445ea15 8104 SV * const sv = sv_newmortal();
ab3bbdeb 8105 SV *dsv= sv_newmortal();
f8fc2ecf 8106 RXi_GET_DECL(r,ri);
a687059c 8107
f8fc2ecf 8108 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8109
8110 /* Header fields of interest. */
ab3bbdeb
YO
8111 if (r->anchored_substr) {
8112 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8113 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8114 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8115 "anchored %s%s at %"IVdf" ",
8116 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8117 (IV)r->anchored_offset);
ab3bbdeb
YO
8118 } else if (r->anchored_utf8) {
8119 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8120 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8121 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8122 "anchored utf8 %s%s at %"IVdf" ",
8123 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8124 (IV)r->anchored_offset);
ab3bbdeb
YO
8125 }
8126 if (r->float_substr) {
8127 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8128 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8129 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8130 "floating %s%s at %"IVdf"..%"UVuf" ",
8131 s, RE_SV_TAIL(r->float_substr),
7b0972df 8132 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8133 } else if (r->float_utf8) {
8134 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8135 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8136 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8137 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8138 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8139 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8140 }
33b8afdf 8141 if (r->check_substr || r->check_utf8)
b81d288d 8142 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8143 (const char *)
8144 (r->check_substr == r->float_substr
8145 && r->check_utf8 == r->float_utf8
8146 ? "(checking floating" : "(checking anchored"));
bbe252da 8147 if (r->extflags & RXf_NOSCAN)
c277df42 8148 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8149 if (r->extflags & RXf_CHECK_ALL)
c277df42 8150 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8151 if (r->check_substr || r->check_utf8)
c277df42
IZ
8152 PerlIO_printf(Perl_debug_log, ") ");
8153
f8fc2ecf
YO
8154 if (ri->regstclass) {
8155 regprop(r, sv, ri->regstclass);
1de06328 8156 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8157 }
bbe252da 8158 if (r->extflags & RXf_ANCH) {
774d564b 8159 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8160 if (r->extflags & RXf_ANCH_BOL)
774d564b 8161 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8162 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8163 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8164 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8165 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8166 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8167 PerlIO_printf(Perl_debug_log, "(GPOS)");
8168 PerlIO_putc(Perl_debug_log, ' ');
8169 }
bbe252da 8170 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8171 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8172 if (r->intflags & PREGf_SKIP)
760ac839 8173 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8174 if (r->intflags & PREGf_IMPLICIT)
760ac839 8175 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8176 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8177 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8178 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8179 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8180#else
96a5add6 8181 PERL_UNUSED_CONTEXT;
65e66c80 8182 PERL_UNUSED_ARG(r);
17c3b450 8183#endif /* DEBUGGING */
a687059c
LW
8184}
8185
8186/*
a0d0e21e
LW
8187- regprop - printable representation of opcode
8188*/
46fc3d4c 8189void
32fc9b6a 8190Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8191{
35ff7856 8192#ifdef DEBUGGING
97aff369 8193 dVAR;
9b155405 8194 register int k;
f8fc2ecf 8195 RXi_GET_DECL(prog,progi);
1de06328 8196 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8197
a0d0e21e 8198
54dc92de 8199 sv_setpvn(sv, "", 0);
8aa23a47 8200
03363afd 8201 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8202 /* It would be nice to FAIL() here, but this may be called from
8203 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8204 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8205 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8206
3dab1dad 8207 k = PL_regkind[OP(o)];
9b155405 8208
2a782b5b 8209 if (k == EXACT) {
396482e1 8210 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8211 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8212 * is a crude hack but it may be the best for now since
8213 * we have no flag "this EXACTish node was UTF-8"
8214 * --jhi */
8215 const char * const s =
ddc5bc0f 8216 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8217 PL_colors[0], PL_colors[1],
8218 PERL_PV_ESCAPE_UNI_DETECT |
8219 PERL_PV_PRETTY_ELIPSES |
8220 PERL_PV_PRETTY_LTGT
8221 );
8222 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8223 } else if (k == TRIE) {
3dab1dad 8224 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8225 * progi->data isn't available here */
1de06328
YO
8226 const char op = OP(o);
8227 const I32 n = ARG(o);
8228 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8229 (reg_ac_data *)progi->data->data[n] :
1de06328
YO
8230 NULL;
8231 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
f8fc2ecf 8232 (reg_trie_data*)progi->data->data[n] :
1de06328
YO
8233 ac->trie;
8234
8235 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8236 DEBUG_TRIE_COMPILE_r(
8237 Perl_sv_catpvf(aTHX_ sv,
8238 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8239 (UV)trie->startstate,
1e2e3d02 8240 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8241 (UV)trie->wordcount,
8242 (UV)trie->minlen,
8243 (UV)trie->maxlen,
8244 (UV)TRIE_CHARCOUNT(trie),
8245 (UV)trie->uniquecharcount
8246 )
8247 );
8248 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8249 int i;
8250 int rangestart = -1;
f46cb337 8251 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8252 Perl_sv_catpvf(aTHX_ sv, "[");
8253 for (i = 0; i <= 256; i++) {
8254 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8255 if (rangestart == -1)
8256 rangestart = i;
8257 } else if (rangestart != -1) {
8258 if (i <= rangestart + 3)
8259 for (; rangestart < i; rangestart++)
8260 put_byte(sv, rangestart);
8261 else {
8262 put_byte(sv, rangestart);
8263 sv_catpvs(sv, "-");
8264 put_byte(sv, i - 1);
8265 }
8266 rangestart = -1;
8267 }
8268 }
8269 Perl_sv_catpvf(aTHX_ sv, "]");
8270 }
8271
a3621e74 8272 } else if (k == CURLY) {
cb434fcc 8273 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8274 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8275 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8276 }
2c2d71f5
JH
8277 else if (k == WHILEM && o->flags) /* Ordinal/of */
8278 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8279 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8280 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8281 else if (k == GOSUB)
6bda09f9 8282 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8283 else if (k == VERB) {
8284 if (!o->flags)
8285 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8286 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8287 } else if (k == LOGICAL)
04ebc1ab 8288 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8289 else if (k == ANYOF) {
8290 int i, rangestart = -1;
2d03de9c 8291 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8292
8293 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8294 static const char * const anyofs[] = {
653099ff
GS
8295 "\\w",
8296 "\\W",
8297 "\\s",
8298 "\\S",
8299 "\\d",
8300 "\\D",
8301 "[:alnum:]",
8302 "[:^alnum:]",
8303 "[:alpha:]",
8304 "[:^alpha:]",
8305 "[:ascii:]",
8306 "[:^ascii:]",
8307 "[:ctrl:]",
8308 "[:^ctrl:]",
8309 "[:graph:]",
8310 "[:^graph:]",
8311 "[:lower:]",
8312 "[:^lower:]",
8313 "[:print:]",
8314 "[:^print:]",
8315 "[:punct:]",
8316 "[:^punct:]",
8317 "[:upper:]",
aaa51d5e 8318 "[:^upper:]",
653099ff 8319 "[:xdigit:]",
aaa51d5e
JF
8320 "[:^xdigit:]",
8321 "[:space:]",
8322 "[:^space:]",
8323 "[:blank:]",
8324 "[:^blank:]"
653099ff
GS
8325 };
8326
19860706 8327 if (flags & ANYOF_LOCALE)
396482e1 8328 sv_catpvs(sv, "{loc}");
19860706 8329 if (flags & ANYOF_FOLD)
396482e1 8330 sv_catpvs(sv, "{i}");
653099ff 8331 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8332 if (flags & ANYOF_INVERT)
396482e1 8333 sv_catpvs(sv, "^");
ffc61ed2
JH
8334 for (i = 0; i <= 256; i++) {
8335 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8336 if (rangestart == -1)
8337 rangestart = i;
8338 } else if (rangestart != -1) {
8339 if (i <= rangestart + 3)
8340 for (; rangestart < i; rangestart++)
653099ff 8341 put_byte(sv, rangestart);
ffc61ed2
JH
8342 else {
8343 put_byte(sv, rangestart);
396482e1 8344 sv_catpvs(sv, "-");
ffc61ed2 8345 put_byte(sv, i - 1);
653099ff 8346 }
ffc61ed2 8347 rangestart = -1;
653099ff 8348 }
847a199f 8349 }
ffc61ed2
JH
8350
8351 if (o->flags & ANYOF_CLASS)
bb7a0f54 8352 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8353 if (ANYOF_CLASS_TEST(o,i))
8354 sv_catpv(sv, anyofs[i]);
8355
8356 if (flags & ANYOF_UNICODE)
396482e1 8357 sv_catpvs(sv, "{unicode}");
1aa99e6b 8358 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8359 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8360
8361 {
8362 SV *lv;
32fc9b6a 8363 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8364
ffc61ed2
JH
8365 if (lv) {
8366 if (sw) {
89ebb4a3 8367 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8368
ffc61ed2 8369 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8370 uvchr_to_utf8(s, i);
ffc61ed2 8371
3568d838 8372 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8373 if (rangestart == -1)
8374 rangestart = i;
8375 } else if (rangestart != -1) {
ffc61ed2
JH
8376 if (i <= rangestart + 3)
8377 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8378 const U8 * const e = uvchr_to_utf8(s,rangestart);
8379 U8 *p;
8380 for(p = s; p < e; p++)
ffc61ed2
JH
8381 put_byte(sv, *p);
8382 }
8383 else {
2d03de9c
AL
8384 const U8 *e = uvchr_to_utf8(s,rangestart);
8385 U8 *p;
8386 for (p = s; p < e; p++)
ffc61ed2 8387 put_byte(sv, *p);
396482e1 8388 sv_catpvs(sv, "-");
2d03de9c
AL
8389 e = uvchr_to_utf8(s, i-1);
8390 for (p = s; p < e; p++)
1df70142 8391 put_byte(sv, *p);
ffc61ed2
JH
8392 }
8393 rangestart = -1;
8394 }
19860706 8395 }
ffc61ed2 8396
396482e1 8397 sv_catpvs(sv, "..."); /* et cetera */
19860706 8398 }
fde631ed 8399
ffc61ed2 8400 {
2e0de35c 8401 char *s = savesvpv(lv);
c445ea15 8402 char * const origs = s;
b81d288d 8403
3dab1dad
YO
8404 while (*s && *s != '\n')
8405 s++;
b81d288d 8406
ffc61ed2 8407 if (*s == '\n') {
2d03de9c 8408 const char * const t = ++s;
ffc61ed2
JH
8409
8410 while (*s) {
8411 if (*s == '\n')
8412 *s = ' ';
8413 s++;
8414 }
8415 if (s[-1] == ' ')
8416 s[-1] = 0;
8417
8418 sv_catpv(sv, t);
fde631ed 8419 }
b81d288d 8420
ffc61ed2 8421 Safefree(origs);
fde631ed
JH
8422 }
8423 }
653099ff 8424 }
ffc61ed2 8425
653099ff
GS
8426 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8427 }
9b155405 8428 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8429 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8430#else
96a5add6 8431 PERL_UNUSED_CONTEXT;
65e66c80
SP
8432 PERL_UNUSED_ARG(sv);
8433 PERL_UNUSED_ARG(o);
f9049ba1 8434 PERL_UNUSED_ARG(prog);
17c3b450 8435#endif /* DEBUGGING */
35ff7856 8436}
a687059c 8437
cad2e5aa
JH
8438SV *
8439Perl_re_intuit_string(pTHX_ regexp *prog)
8440{ /* Assume that RE_INTUIT is set */
97aff369 8441 dVAR;
a3621e74 8442 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8443 PERL_UNUSED_CONTEXT;
8444
a3621e74 8445 DEBUG_COMPILE_r(
cfd0369c 8446 {
2d03de9c 8447 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8448 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8449
8450 if (!PL_colorset) reginitcolors();
8451 PerlIO_printf(Perl_debug_log,
a0288114 8452 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8453 PL_colors[4],
8454 prog->check_substr ? "" : "utf8 ",
8455 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8456 s,
8457 PL_colors[1],
8458 (strlen(s) > 60 ? "..." : ""));
8459 } );
8460
33b8afdf 8461 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8462}
8463
84da74a7
YO
8464/*
8465 pregfree - free a regexp
8466
8467 See regdupe below if you change anything here.
8468*/
8469
2b69d0c2 8470void
864dbfa3 8471Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8472{
27da23d5 8473 dVAR;
f8fc2ecf 8474 RXi_GET_DECL(r,ri);
fc32ee4a 8475 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8476
7821416a
IZ
8477 if (!r || (--r->refcnt > 0))
8478 return;
ab3bbdeb 8479 DEBUG_COMPILE_r({
0df25f3d
YO
8480 if (!PL_colorset)
8481 reginitcolors();
5c9f2f80
YO
8482 {
8483 SV *dsv= sv_newmortal();
bbe252da 8484 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
ab3bbdeb
YO
8485 dsv, r->precomp, r->prelen, 60);
8486 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8487 PL_colors[4],PL_colors[5],s);
8488 }
9e55ce06 8489 });
cad2e5aa 8490
43c5f42d
NC
8491 /* gcov results gave these as non-null 100% of the time, so there's no
8492 optimisation in checking them before calling Safefree */
8493 Safefree(r->precomp);
f8fc2ecf 8494 Safefree(ri->offsets); /* 20010421 MJD */
ed252734 8495 RX_MATCH_COPY_FREE(r);
f8c7b90f 8496#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8497 if (r->saved_copy)
8498 SvREFCNT_dec(r->saved_copy);
8499#endif
a193d654
GS
8500 if (r->substrs) {
8501 if (r->anchored_substr)
8502 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8503 if (r->anchored_utf8)
8504 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8505 if (r->float_substr)
8506 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8507 if (r->float_utf8)
8508 SvREFCNT_dec(r->float_utf8);
2779dcf1 8509 Safefree(r->substrs);
a193d654 8510 }
81714fb9
YO
8511 if (r->paren_names)
8512 SvREFCNT_dec(r->paren_names);
f8fc2ecf
YO
8513 if (ri->data) {
8514 int n = ri->data->count;
f3548bdc
DM
8515 PAD* new_comppad = NULL;
8516 PAD* old_comppad;
4026c95a 8517 PADOFFSET refcnt;
dfad63ad 8518
c277df42 8519 while (--n >= 0) {
261faec3 8520 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8521 switch (ri->data->what[n]) {
c277df42 8522 case 's':
81714fb9 8523 case 'S':
f8fc2ecf 8524 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8525 break;
653099ff 8526 case 'f':
f8fc2ecf 8527 Safefree(ri->data->data[n]);
653099ff 8528 break;
dfad63ad 8529 case 'p':
f8fc2ecf 8530 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8531 break;
c277df42 8532 case 'o':
dfad63ad 8533 if (new_comppad == NULL)
cea2e8a9 8534 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8535 PAD_SAVE_LOCAL(old_comppad,
8536 /* Watch out for global destruction's random ordering. */
c445ea15 8537 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8538 );
b34c0dd4 8539 OP_REFCNT_LOCK;
f8fc2ecf 8540 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8541 OP_REFCNT_UNLOCK;
8542 if (!refcnt)
f8fc2ecf 8543 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8544
f3548bdc 8545 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8546 SvREFCNT_dec((SV*)new_comppad);
8547 new_comppad = NULL;
c277df42
IZ
8548 break;
8549 case 'n':
9e55ce06 8550 break;
07be1b83 8551 case 'T':
be8e71aa
YO
8552 { /* Aho Corasick add-on structure for a trie node.
8553 Used in stclass optimization only */
07be1b83 8554 U32 refcount;
f8fc2ecf 8555 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8556 OP_REFCNT_LOCK;
8557 refcount = --aho->refcount;
8558 OP_REFCNT_UNLOCK;
8559 if ( !refcount ) {
446bd890
NC
8560 PerlMemShared_free(aho->states);
8561 PerlMemShared_free(aho->fail);
07be1b83
YO
8562 aho->trie=NULL; /* not necessary to free this as it is
8563 handled by the 't' case */
446bd890
NC
8564 /* do this last!!!! */
8565 PerlMemShared_free(ri->data->data[n]);
8566 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8567 }
8568 }
8569 break;
a3621e74 8570 case 't':
07be1b83 8571 {
be8e71aa 8572 /* trie structure. */
07be1b83 8573 U32 refcount;
f8fc2ecf 8574 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8575 OP_REFCNT_LOCK;
8576 refcount = --trie->refcount;
8577 OP_REFCNT_UNLOCK;
8578 if ( !refcount ) {
446bd890 8579 PerlMemShared_free(trie->charmap);
07be1b83
YO
8580 if (trie->widecharmap)
8581 SvREFCNT_dec((SV*)trie->widecharmap);
446bd890
NC
8582 PerlMemShared_free(trie->states);
8583 PerlMemShared_free(trie->trans);
07be1b83 8584 if (trie->bitmap)
446bd890 8585 PerlMemShared_free(trie->bitmap);
07be1b83 8586 if (trie->wordlen)
446bd890 8587 PerlMemShared_free(trie->wordlen);
786e8c11 8588 if (trie->jump)
446bd890 8589 PerlMemShared_free(trie->jump);
786e8c11 8590 if (trie->nextword)
446bd890 8591 PerlMemShared_free(trie->nextword);
a3621e74 8592#ifdef DEBUGGING
5c9f2f80
YO
8593 if (trie->words)
8594 SvREFCNT_dec((SV*)trie->words);
8595 if (trie->revcharmap)
8596 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8597#endif
446bd890
NC
8598 /* do this last!!!! */
8599 PerlMemShared_free(ri->data->data[n]);
a3621e74 8600 }
07be1b83
YO
8601 }
8602 break;
c277df42 8603 default:
f8fc2ecf 8604 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8605 }
8606 }
f8fc2ecf
YO
8607 Safefree(ri->data->what);
8608 Safefree(ri->data);
a0d0e21e
LW
8609 }
8610 Safefree(r->startp);
8611 Safefree(r->endp);
f8fc2ecf
YO
8612 if (ri->swap) {
8613 Safefree(ri->swap->startp);
8614 Safefree(ri->swap->endp);
8615 Safefree(ri->swap);
c74340f9 8616 }
f8fc2ecf 8617 Safefree(ri);
a0d0e21e 8618 Safefree(r);
a687059c 8619}
c277df42 8620
84da74a7
YO
8621#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8622#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8623#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8624#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8625
8626/*
8627 regdupe - duplicate a regexp.
8628
8629 This routine is called by sv.c's re_dup and is expected to clone a
8630 given regexp structure. It is a no-op when not under USE_ITHREADS.
8631 (Originally this *was* re_dup() for change history see sv.c)
8632
8633 See pregfree() above if you change anything here.
8634*/
a3c0e9ca 8635#if defined(USE_ITHREADS)
84da74a7
YO
8636regexp *
8637Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8638{
84da74a7 8639 dVAR;
f8fc2ecf
YO
8640 regexp *ret;
8641 regexp_internal *reti;
84da74a7
YO
8642 int i, len, npar;
8643 struct reg_substr_datum *s;
f8fc2ecf
YO
8644 RXi_GET_DECL(r,ri);
8645
84da74a7
YO
8646 if (!r)
8647 return (REGEXP *)NULL;
8648
8649 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8650 return ret;
8651
f8fc2ecf 8652 len = ri->offsets[0];
84da74a7
YO
8653 npar = r->nparens+1;
8654
f8fc2ecf
YO
8655 Newxz(ret, 1, regexp);
8656 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8657 RXi_SET(ret,reti);
8658 Copy(ri->program, reti->program, len+1, regnode);
84da74a7
YO
8659
8660 Newx(ret->startp, npar, I32);
8661 Copy(r->startp, ret->startp, npar, I32);
8662 Newx(ret->endp, npar, I32);
8663 Copy(r->startp, ret->startp, npar, I32);
f8fc2ecf
YO
8664 if(ri->swap) {
8665 Newx(reti->swap, 1, regexp_paren_ofs);
c74340f9 8666 /* no need to copy these */
f8fc2ecf
YO
8667 Newx(reti->swap->startp, npar, I32);
8668 Newx(reti->swap->endp, npar, I32);
c74340f9 8669 } else {
f8fc2ecf 8670 reti->swap = NULL;
c74340f9 8671 }
84da74a7
YO
8672
8673 Newx(ret->substrs, 1, struct reg_substr_data);
8674 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8675 s->min_offset = r->substrs->data[i].min_offset;
8676 s->max_offset = r->substrs->data[i].max_offset;
8677 s->end_shift = r->substrs->data[i].end_shift;
8678 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8679 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8680 }
8681
f8fc2ecf
YO
8682 reti->regstclass = NULL;
8683 if (ri->data) {
84da74a7 8684 struct reg_data *d;
f8fc2ecf 8685 const int count = ri->data->count;
84da74a7
YO
8686 int i;
8687
8688 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8689 char, struct reg_data);
8690 Newx(d->what, count, U8);
8691
8692 d->count = count;
8693 for (i = 0; i < count; i++) {
f8fc2ecf 8694 d->what[i] = ri->data->what[i];
84da74a7 8695 switch (d->what[i]) {
bbe252da 8696 /* legal options are one of: sSfpontT
84da74a7
YO
8697 see also regcomp.h and pregfree() */
8698 case 's':
81714fb9 8699 case 'S':
f8fc2ecf 8700 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7
YO
8701 break;
8702 case 'p':
f8fc2ecf 8703 d->data[i] = av_dup_inc((AV *)ri->data->data[i], param);
84da74a7
YO
8704 break;
8705 case 'f':
8706 /* This is cheating. */
8707 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8708 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8709 struct regnode_charclass_class);
f8fc2ecf 8710 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8711 break;
8712 case 'o':
bbe252da
YO
8713 /* Compiled op trees are readonly and in shared memory,
8714 and can thus be shared without duplication. */
84da74a7 8715 OP_REFCNT_LOCK;
f8fc2ecf 8716 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8717 OP_REFCNT_UNLOCK;
8718 break;
8719 case 'n':
f8fc2ecf 8720 d->data[i] = ri->data->data[i];
84da74a7
YO
8721 break;
8722 case 't':
f8fc2ecf 8723 d->data[i] = ri->data->data[i];
84da74a7
YO
8724 OP_REFCNT_LOCK;
8725 ((reg_trie_data*)d->data[i])->refcount++;
8726 OP_REFCNT_UNLOCK;
8727 break;
8728 case 'T':
f8fc2ecf 8729 d->data[i] = ri->data->data[i];
84da74a7
YO
8730 OP_REFCNT_LOCK;
8731 ((reg_ac_data*)d->data[i])->refcount++;
8732 OP_REFCNT_UNLOCK;
8733 /* Trie stclasses are readonly and can thus be shared
8734 * without duplication. We free the stclass in pregfree
8735 * when the corresponding reg_ac_data struct is freed.
8736 */
f8fc2ecf 8737 reti->regstclass= ri->regstclass;
84da74a7
YO
8738 break;
8739 default:
f8fc2ecf 8740 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8741 }
8742 }
8743
f8fc2ecf 8744 reti->data = d;
84da74a7
YO
8745 }
8746 else
f8fc2ecf 8747 reti->data = NULL;
84da74a7 8748
f8fc2ecf
YO
8749 Newx(reti->offsets, 2*len+1, U32);
8750 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
84da74a7
YO
8751
8752 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8753 ret->refcnt = r->refcnt;
8754 ret->minlen = r->minlen;
de8c5301 8755 ret->minlenret = r->minlenret;
84da74a7
YO
8756 ret->prelen = r->prelen;
8757 ret->nparens = r->nparens;
8758 ret->lastparen = r->lastparen;
8759 ret->lastcloseparen = r->lastcloseparen;
bbe252da
YO
8760 ret->intflags = r->intflags;
8761 ret->extflags = r->extflags;
84da74a7
YO
8762
8763 ret->sublen = r->sublen;
8764
f9f4320a 8765 ret->engine = r->engine;
81714fb9
YO
8766
8767 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8768
84da74a7
YO
8769 if (RX_MATCH_COPIED(ret))
8770 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8771 else
8772 ret->subbeg = NULL;
8773#ifdef PERL_OLD_COPY_ON_WRITE
8774 ret->saved_copy = NULL;
8775#endif
8776
8777 ptr_table_store(PL_ptr_table, r, ret);
8778 return ret;
84da74a7 8779}
a3c0e9ca 8780#endif
84da74a7 8781
de8c5301
YO
8782/*
8783 reg_stringify()
8784
8785 converts a regexp embedded in a MAGIC struct to its stringified form,
8786 caching the converted form in the struct and returns the cached
8787 string.
8788
8789 If lp is nonnull then it is used to return the length of the
8790 resulting string
8791
8792 If flags is nonnull and the returned string contains UTF8 then
8793 (flags & 1) will be true.
8794
8795 If haseval is nonnull then it is used to return whether the pattern
8796 contains evals.
8797
8798 Normally called via macro:
8799
8800 CALLREG_STRINGIFY(mg,0,0);
8801
8802 And internally with
8803
8804 CALLREG_AS_STR(mg,lp,flags,haseval)
8805
8806 See sv_2pv_flags() in sv.c for an example of internal usage.
8807
8808 */
8809
8810char *
8811Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8812 dVAR;
8813 const regexp * const re = (regexp *)mg->mg_obj;
f8fc2ecf
YO
8814 RXi_GET_DECL(re,ri);
8815
de8c5301
YO
8816 if (!mg->mg_ptr) {
8817 const char *fptr = "msix";
8818 char reflags[6];
8819 char ch;
8820 int left = 0;
8821 int right = 4;
8822 bool need_newline = 0;
bbe252da 8823 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8824
8825 while((ch = *fptr++)) {
8826 if(reganch & 1) {
8827 reflags[left++] = ch;
8828 }
8829 else {
8830 reflags[right--] = ch;
8831 }
8832 reganch >>= 1;
8833 }
8834 if(left != 4) {
8835 reflags[left] = '-';
8836 left = 5;
8837 }
8838
8839 mg->mg_len = re->prelen + 4 + left;
8840 /*
8841 * If /x was used, we have to worry about a regex ending with a
8842 * comment later being embedded within another regex. If so, we don't
8843 * want this regex's "commentization" to leak out to the right part of
8844 * the enclosing regex, we must cap it with a newline.
8845 *
8846 * So, if /x was used, we scan backwards from the end of the regex. If
8847 * we find a '#' before we find a newline, we need to add a newline
8848 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8849 * we don't need to add anything. -jfriedl
8850 */
bbe252da 8851 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8852 const char *endptr = re->precomp + re->prelen;
8853 while (endptr >= re->precomp) {
8854 const char c = *(endptr--);
8855 if (c == '\n')
8856 break; /* don't need another */
8857 if (c == '#') {
8858 /* we end while in a comment, so we need a newline */
8859 mg->mg_len++; /* save space for it */
8860 need_newline = 1; /* note to add it */
8861 break;
8862 }
8863 }
8864 }
8865
8866 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8867 mg->mg_ptr[0] = '(';
8868 mg->mg_ptr[1] = '?';
8869 Copy(reflags, mg->mg_ptr+2, left, char);
8870 *(mg->mg_ptr+left+2) = ':';
8871 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8872 if (need_newline)
8873 mg->mg_ptr[mg->mg_len - 2] = '\n';
8874 mg->mg_ptr[mg->mg_len - 1] = ')';
8875 mg->mg_ptr[mg->mg_len] = 0;
8876 }
8877 if (haseval)
f8fc2ecf 8878 *haseval = ri->program[0].next_off;
de8c5301 8879 if (flags)
bbe252da 8880 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8881
8882 if (lp)
8883 *lp = mg->mg_len;
8884 return mg->mg_ptr;
8885}
8886
8887
76234dfb 8888#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8889/*
8890 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8891 */
8892regnode *
864dbfa3 8893Perl_regnext(pTHX_ register regnode *p)
c277df42 8894{
97aff369 8895 dVAR;
c277df42
IZ
8896 register I32 offset;
8897
f8fc2ecf 8898 if (!p)
c277df42
IZ
8899 return(NULL);
8900
8901 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8902 if (offset == 0)
8903 return(NULL);
8904
c277df42 8905 return(p+offset);
c277df42 8906}
76234dfb 8907#endif
c277df42 8908
01f988be 8909STATIC void
cea2e8a9 8910S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8911{
8912 va_list args;
8913 STRLEN l1 = strlen(pat1);
8914 STRLEN l2 = strlen(pat2);
8915 char buf[512];
06bf62c7 8916 SV *msv;
73d840c0 8917 const char *message;
c277df42
IZ
8918
8919 if (l1 > 510)
8920 l1 = 510;
8921 if (l1 + l2 > 510)
8922 l2 = 510 - l1;
8923 Copy(pat1, buf, l1 , char);
8924 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8925 buf[l1 + l2] = '\n';
8926 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8927#ifdef I_STDARG
8928 /* ANSI variant takes additional second argument */
c277df42 8929 va_start(args, pat2);
8736538c
AS
8930#else
8931 va_start(args);
8932#endif
5a844595 8933 msv = vmess(buf, &args);
c277df42 8934 va_end(args);
cfd0369c 8935 message = SvPV_const(msv,l1);
c277df42
IZ
8936 if (l1 > 512)
8937 l1 = 512;
8938 Copy(message, buf, l1 , char);
197cf9b9 8939 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8940 Perl_croak(aTHX_ "%s", buf);
c277df42 8941}
a0ed51b3
LW
8942
8943/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8944
76234dfb 8945#ifndef PERL_IN_XSUB_RE
a0ed51b3 8946void
864dbfa3 8947Perl_save_re_context(pTHX)
b81d288d 8948{
97aff369 8949 dVAR;
1ade1aa1
NC
8950
8951 struct re_save_state *state;
8952
8953 SAVEVPTR(PL_curcop);
8954 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8955
8956 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8957 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8958 SSPUSHINT(SAVEt_RE_STATE);
8959
46ab3289 8960 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8961
a0ed51b3 8962 PL_reg_start_tmp = 0;
a0ed51b3 8963 PL_reg_start_tmpl = 0;
c445ea15 8964 PL_reg_oldsaved = NULL;
a5db57d6 8965 PL_reg_oldsavedlen = 0;
a5db57d6 8966 PL_reg_maxiter = 0;
a5db57d6 8967 PL_reg_leftiter = 0;
c445ea15 8968 PL_reg_poscache = NULL;
a5db57d6 8969 PL_reg_poscache_size = 0;
1ade1aa1
NC
8970#ifdef PERL_OLD_COPY_ON_WRITE
8971 PL_nrs = NULL;
8972#endif
ada6e8a9 8973
c445ea15
AL
8974 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8975 if (PL_curpm) {
8976 const REGEXP * const rx = PM_GETRE(PL_curpm);
8977 if (rx) {
1df70142 8978 U32 i;
ada6e8a9 8979 for (i = 1; i <= rx->nparens; i++) {
1df70142 8980 char digits[TYPE_CHARS(long)];
d9fad198 8981 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8982 GV *const *const gvp
8983 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8984
b37c2d43
AL
8985 if (gvp) {
8986 GV * const gv = *gvp;
8987 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8988 save_scalar(gv);
49f27e4b 8989 }
ada6e8a9
AMS
8990 }
8991 }
8992 }
a0ed51b3 8993}
76234dfb 8994#endif
51371543 8995
51371543 8996static void
acfe0abc 8997clear_re(pTHX_ void *r)
51371543 8998{
97aff369 8999 dVAR;
51371543
GS
9000 ReREFCNT_dec((regexp *)r);
9001}
ffbc6a93 9002
a28509cc
AL
9003#ifdef DEBUGGING
9004
9005STATIC void
9006S_put_byte(pTHX_ SV *sv, int c)
9007{
9008 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9009 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9010 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9011 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9012 else
9013 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9014}
9015
786e8c11 9016
3dab1dad
YO
9017#define CLEAR_OPTSTART \
9018 if (optstart) STMT_START { \
70685ca0 9019 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9020 optstart=NULL; \
9021 } STMT_END
9022
786e8c11 9023#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9024
b5a2f8d8
NC
9025STATIC const regnode *
9026S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9027 const regnode *last, const regnode *plast,
9028 SV* sv, I32 indent, U32 depth)
a28509cc 9029{
97aff369 9030 dVAR;
786e8c11 9031 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9032 register const regnode *next;
3dab1dad 9033 const regnode *optstart= NULL;
f8fc2ecf 9034 RXi_GET_DECL(r,ri);
3dab1dad 9035 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9036
786e8c11
YO
9037#ifdef DEBUG_DUMPUNTIL
9038 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9039 last ? last-start : 0,plast ? plast-start : 0);
9040#endif
9041
9042 if (plast && plast < last)
9043 last= plast;
9044
9045 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9046 /* While that wasn't END last time... */
9047
9048 NODE_ALIGN(node);
9049 op = OP(node);
de734bd5 9050 if (op == CLOSE || op == WHILEM)
786e8c11 9051 indent--;
b5a2f8d8 9052 next = regnext((regnode *)node);
07be1b83 9053
a28509cc 9054 /* Where, what. */
8e11feef 9055 if (OP(node) == OPTIMIZED) {
e68ec53f 9056 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9057 optstart = node;
3dab1dad 9058 else
8e11feef 9059 goto after_print;
3dab1dad
YO
9060 } else
9061 CLEAR_OPTSTART;
07be1b83 9062
32fc9b6a 9063 regprop(r, sv, node);
a28509cc 9064 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9065 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9066
9067 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9068 if (next == NULL) /* Next ptr. */
9069 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9070 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9071 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9072 else
9073 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9074
1de06328 9075 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9076 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9077 }
9078
a28509cc
AL
9079 after_print:
9080 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9081 assert(next);
9082 {
9083 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9084 ? regnext((regnode *)next)
9085 : next);
be8e71aa
YO
9086 if (last && nnode > last)
9087 nnode = last;
786e8c11 9088 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9089 }
a28509cc
AL
9090 }
9091 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9092 assert(next);
786e8c11 9093 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9094 }
9095 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9096 const regnode *this_trie = node;
1de06328 9097 const char op = OP(node);
a28509cc 9098 const I32 n = ARG(node);
1de06328 9099 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9100 (reg_ac_data *)ri->data->data[n] :
1de06328
YO
9101 NULL;
9102 const reg_trie_data * const trie = op<AHOCORASICK ?
f8fc2ecf 9103 (reg_trie_data*)ri->data->data[n] :
1de06328 9104 ac->trie;
786e8c11 9105 const regnode *nextbranch= NULL;
a28509cc 9106 I32 word_idx;
1de06328 9107 sv_setpvn(sv, "", 0);
786e8c11 9108 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9109 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9110
9111 PerlIO_printf(Perl_debug_log, "%*s%s ",
9112 (int)(2*(indent+3)), "",
9113 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9114 PL_colors[0], PL_colors[1],
9115 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9116 PERL_PV_PRETTY_ELIPSES |
7f69552c 9117 PERL_PV_PRETTY_LTGT
786e8c11
YO
9118 )
9119 : "???"
9120 );
9121 if (trie->jump) {
40d049e4 9122 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9123 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9124 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9125 if (dist) {
9126 if (!nextbranch)
24b23f37 9127 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9128 DUMPUNTIL(this_trie + dist, nextbranch);
9129 }
786e8c11
YO
9130 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9131 nextbranch= regnext((regnode *)nextbranch);
9132 } else {
9133 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9134 }
786e8c11
YO
9135 }
9136 if (last && next > last)
9137 node= last;
9138 else
9139 node= next;
a28509cc 9140 }
786e8c11
YO
9141 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9142 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9143 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9144 }
9145 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9146 assert(next);
786e8c11 9147 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9148 }
9149 else if ( op == PLUS || op == STAR) {
786e8c11 9150 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9151 }
9152 else if (op == ANYOF) {
9153 /* arglen 1 + class block */
9154 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9155 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9156 node = NEXTOPER(node);
9157 }
9158 else if (PL_regkind[(U8)op] == EXACT) {
9159 /* Literal string, where present. */
9160 node += NODE_SZ_STR(node) - 1;
9161 node = NEXTOPER(node);
9162 }
9163 else {
9164 node = NEXTOPER(node);
9165 node += regarglen[(U8)op];
9166 }
9167 if (op == CURLYX || op == OPEN)
786e8c11 9168 indent++;
a28509cc 9169 }
3dab1dad 9170 CLEAR_OPTSTART;
786e8c11 9171#ifdef DEBUG_DUMPUNTIL
70685ca0 9172 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9173#endif
1de06328 9174 return node;
a28509cc
AL
9175}
9176
9177#endif /* DEBUGGING */
9178
241d1a3b
NC
9179/*
9180 * Local variables:
9181 * c-indentation-style: bsd
9182 * c-basic-offset: 4
9183 * indent-tabs-mode: t
9184 * End:
9185 *
37442d52
RGS
9186 * ex: set ts=8 sts=4 sw=4 noet:
9187 */