This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Swap _reg_ac_data.trie to U32 offset into the regdata array, as
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
3dab1dad
YO
801 dump_trie(trie)
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
816 dump_trie(trie)
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
819*/
820
821STATIC void
822S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823{
824 U32 state;
ab3bbdeb
YO
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
902*/
903STATIC void
904S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905{
906 U32 state;
ab3bbdeb
YO
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
3dab1dad
YO
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
929 if ( tmp ) {
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
931 colwidth,
ddc5bc0f 932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
936 ) ,
1e2e3d02
YO
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 );
940 if (!(charid % 10))
664e119d
RGS
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
1e2e3d02 943 }
ab3bbdeb
YO
944 }
945 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
946 }
947}
948
949/*
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
958{
959 U32 state;
960 U16 charid;
ab3bbdeb
YO
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
963 GET_RE_DEBUG_FLAGS_DECL;
964
965 /*
966 print out the table precompression so that we can do a visual check
967 that they are identical.
968 */
969
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
971
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 974 if ( tmp ) {
ab3bbdeb
YO
975 PerlIO_printf( Perl_debug_log, "%*s",
976 colwidth,
ddc5bc0f 977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
981 )
982 );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
987
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n" );
993
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
995
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1002 if (v)
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1004 else
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1006 }
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1009 } else {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1012 }
1013 }
07be1b83 1014}
3dab1dad
YO
1015
1016#endif
1017
786e8c11
YO
1018/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
3dab1dad 1028
786e8c11 1029Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1030
786e8c11
YO
1031A trie is an N'ary tree where the branches are determined by digital
1032decomposition of the key. IE, at the root node you look up the 1st character and
1033follow that branch repeat until you find the end of the branches. Nodes can be
1034marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1035
786e8c11 1036 /he|she|his|hers/
72f13be8 1037
786e8c11
YO
1038would convert into the following structure. Numbers represent states, letters
1039following numbers represent valid transitions on the letter from that state, if
1040the number is in square brackets it represents an accepting state, otherwise it
1041will be in parenthesis.
07be1b83 1042
786e8c11
YO
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1044 | |
1045 | (2)
1046 | |
1047 (1) +-i->(6)-+-s->[7]
1048 |
1049 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1050
786e8c11
YO
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1052
1053This shows that when matching against the string 'hers' we will begin at state 1
1054read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057single traverse. We store a mapping from accepting to state to which word was
1058matched, and then when we have multiple possibilities we try to complete the
1059rest of the regex in the order in which they occured in the alternation.
1060
1061The only prior NFA like behaviour that would be changed by the TRIE support is
1062the silent ignoring of duplicate alternations which are of the form:
1063
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1065
1066Thus EVAL blocks follwing a trie may be called a different number of times with
1067and without the optimisation. With the optimisations dupes will be silently
1068ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069the following demonstrates:
1070
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1072
1073which prints out 'word' three times, but
1074
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1076
1077which doesnt print it out at all. This is due to other optimisations kicking in.
1078
1079Example of what happens on a structural level:
1080
1081The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1082
1083 1: CURLYM[1] {1,32767}(18)
1084 5: BRANCH(8)
1085 6: EXACT <ac>(16)
1086 8: BRANCH(11)
1087 9: EXACT <ad>(16)
1088 11: BRANCH(14)
1089 12: EXACT <ab>(16)
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095and should turn into:
1096
1097 1: CURLYM[1] {1,32767}(18)
1098 5: TRIE(16)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1100 <ac>
1101 <ad>
1102 <ab>
1103 16: SUCCEED(0)
1104 17: NOTHING(18)
1105 18: END(0)
1106
1107Cases where tail != last would be like /(?foo|bar)baz/:
1108
1109 1: BRANCH(4)
1110 2: EXACT <foo>(8)
1111 4: BRANCH(7)
1112 5: EXACT <bar>(8)
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118and would end up looking like:
1119
1120 1: TRIE(8)
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1122 <foo>
1123 <bar>
1124 7: TAIL(8)
1125 8: EXACT <baz>(10)
1126 10: END(0)
1127
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1129
1130is the recommended Unicode-aware way of saying
1131
1132 *(d++) = uv;
1133*/
1134
1e2e3d02 1135#define TRIE_STORE_REVCHAR \
786e8c11 1136 STMT_START { \
1e2e3d02 1137 SV *tmp = newSVpvs(""); \
786e8c11 1138 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 } STMT_END
1142
1143#define TRIE_READ_CHAR STMT_START { \
1144 wordlen++; \
1145 if ( UTF ) { \
1146 if ( folder ) { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1149 foldlen -= len; \
1150 scan += len; \
1151 len = 0; \
1152 } else { \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1157 } \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 } \
1161 } else { \
1162 uvc = (U32)*uc; \
1163 len = 1; \
1164 } \
1165} STMT_END
1166
1167
1168
1169#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1173 } \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1177} STMT_END
07be1b83 1178
786e8c11
YO
1179#define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1184} STMT_END
07be1b83 1185
786e8c11
YO
1186#define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1189 \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1192 DEBUG_r({ \
1193 /* store the word for dumping */ \
1194 SV* tmp; \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1197 else \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1201 }); \
1202 \
1203 curword++; \
1204 \
1205 if ( noper_next < tail ) { \
1206 if (!trie->jump) \
446bd890 1207 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1208 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1209 if (!jumper) \
1210 jumper = noper_next; \
1211 if (!nextbranch) \
1212 nextbranch= regnext(cur); \
1213 } \
1214 \
1215 if ( dupe ) { \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
446bd890
NC
1221 trie->nextword = \
1222 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1223 while ( trie->nextword[dupe] ) \
1224 dupe= trie->nextword[dupe]; \
1225 trie->nextword[dupe]= curword; \
1226 } else { \
1227 /* we haven't inserted this word yet. */ \
1228 trie->states[ state ].wordnum = curword; \
1229 } \
1230} STMT_END
07be1b83 1231
3dab1dad 1232
786e8c11
YO
1233#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1234 ( ( base + charid >= ucharcount \
1235 && base + charid < ubound \
1236 && state == trie->trans[ base - ucharcount + charid ].check \
1237 && trie->trans[ base - ucharcount + charid ].next ) \
1238 ? trie->trans[ base - ucharcount + charid ].next \
1239 : ( state==1 ? special : 0 ) \
1240 )
3dab1dad 1241
786e8c11
YO
1242#define MADE_TRIE 1
1243#define MADE_JUMP_TRIE 2
1244#define MADE_EXACT_TRIE 4
3dab1dad 1245
a3621e74 1246STATIC I32
786e8c11 1247S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1248{
27da23d5 1249 dVAR;
a3621e74
YO
1250 /* first pass, loop through and scan words */
1251 reg_trie_data *trie;
1252 regnode *cur;
9f7f3913 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1254 STRLEN len = 0;
1255 UV uvc = 0;
1256 U16 curword = 0;
1257 U32 next_alloc = 0;
786e8c11
YO
1258 regnode *jumper = NULL;
1259 regnode *nextbranch = NULL;
7f69552c 1260 regnode *convert = NULL;
a3621e74 1261 /* we just use folder as a flag in utf8 */
e1ec3a88 1262 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1263 ? PL_fold
1264 : ( flags == EXACTFL
1265 ? PL_fold_locale
1266 : NULL
1267 )
1268 );
1269
e1ec3a88 1270 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1271 SV *re_trie_maxbuff;
3dab1dad
YO
1272#ifndef DEBUGGING
1273 /* these are only used during construction but are useful during
8e11feef 1274 * debugging so we store them in the struct when debugging.
8e11feef 1275 */
3dab1dad 1276 STRLEN trie_charcount=0;
3dab1dad
YO
1277 AV *trie_revcharmap;
1278#endif
a3621e74 1279 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1280#ifndef DEBUGGING
1281 PERL_UNUSED_ARG(depth);
1282#endif
a3621e74 1283
446bd890 1284 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1285 trie->refcount = 1;
3dab1dad 1286 trie->startstate = 1;
786e8c11 1287 trie->wordcount = word_count;
f8fc2ecf 1288 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1289 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1290 if (!(UTF && folder))
446bd890 1291 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74
YO
1292 DEBUG_r({
1293 trie->words = newAV();
a3621e74 1294 });
3dab1dad 1295 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1296
0111c4fd 1297 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1298 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1299 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1300 }
3dab1dad
YO
1301 DEBUG_OPTIMISE_r({
1302 PerlIO_printf( Perl_debug_log,
786e8c11 1303 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1304 (int)depth * 2 + 2, "",
1305 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1306 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1307 (int)depth);
3dab1dad 1308 });
7f69552c
YO
1309
1310 /* Find the node we are going to overwrite */
1311 if ( first == startbranch && OP( last ) != BRANCH ) {
1312 /* whole branch chain */
1313 convert = first;
1314 } else {
1315 /* branch sub-chain */
1316 convert = NEXTOPER( first );
1317 }
1318
a3621e74
YO
1319 /* -- First loop and Setup --
1320
1321 We first traverse the branches and scan each word to determine if it
1322 contains widechars, and how many unique chars there are, this is
1323 important as we have to build a table with at least as many columns as we
1324 have unique chars.
1325
1326 We use an array of integers to represent the character codes 0..255
1327 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1328 native representation of the character value as the key and IV's for the
1329 coded index.
1330
1331 *TODO* If we keep track of how many times each character is used we can
1332 remap the columns so that the table compression later on is more
1333 efficient in terms of memory by ensuring most common value is in the
1334 middle and the least common are on the outside. IMO this would be better
1335 than a most to least common mapping as theres a decent chance the most
1336 common letter will share a node with the least common, meaning the node
1337 will not be compressable. With a middle is most common approach the worst
1338 case is when we have the least common nodes twice.
1339
1340 */
1341
a3621e74 1342 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1343 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1344 const U8 *uc = (U8*)STRING( noper );
a28509cc 1345 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1346 STRLEN foldlen = 0;
1347 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1348 const U8 *scan = (U8*)NULL;
07be1b83 1349 U32 wordlen = 0; /* required init */
3dab1dad 1350 STRLEN chars=0;
a3621e74 1351
3dab1dad
YO
1352 if (OP(noper) == NOTHING) {
1353 trie->minlen= 0;
1354 continue;
1355 }
1356 if (trie->bitmap) {
1357 TRIE_BITMAP_SET(trie,*uc);
1358 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 }
a3621e74 1360 for ( ; uc < e ; uc += len ) {
3dab1dad 1361 TRIE_CHARCOUNT(trie)++;
a3621e74 1362 TRIE_READ_CHAR;
3dab1dad 1363 chars++;
a3621e74
YO
1364 if ( uvc < 256 ) {
1365 if ( !trie->charmap[ uvc ] ) {
1366 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 if ( folder )
1368 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 } else {
1372 SV** svpp;
1373 if ( !trie->widecharmap )
1374 trie->widecharmap = newHV();
1375
1376 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1377
1378 if ( !svpp )
e4584336 1379 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1380
1381 if ( !SvTRUE( *svpp ) ) {
1382 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1383 TRIE_STORE_REVCHAR;
a3621e74
YO
1384 }
1385 }
1386 }
3dab1dad
YO
1387 if( cur == first ) {
1388 trie->minlen=chars;
1389 trie->maxlen=chars;
1390 } else if (chars < trie->minlen) {
1391 trie->minlen=chars;
1392 } else if (chars > trie->maxlen) {
1393 trie->maxlen=chars;
1394 }
1395
a3621e74
YO
1396 } /* end first pass */
1397 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1398 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1399 (int)depth * 2 + 2,"",
85c3142d 1400 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1401 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1402 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1403 );
446bd890 1404 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1405
1406 /*
1407 We now know what we are dealing with in terms of unique chars and
1408 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1409 representation using a flat table will take. If it's over a reasonable
1410 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1411 conservative but potentially much slower representation using an array
1412 of lists.
1413
1414 At the end we convert both representations into the same compressed
1415 form that will be used in regexec.c for matching with. The latter
1416 is a form that cannot be used to construct with but has memory
1417 properties similar to the list form and access properties similar
1418 to the table form making it both suitable for fast searches and
1419 small enough that its feasable to store for the duration of a program.
1420
1421 See the comment in the code where the compressed table is produced
1422 inplace from the flat tabe representation for an explanation of how
1423 the compression works.
1424
1425 */
1426
1427
3dab1dad 1428 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1429 /*
1430 Second Pass -- Array Of Lists Representation
1431
1432 Each state will be represented by a list of charid:state records
1433 (reg_trie_trans_le) the first such element holds the CUR and LEN
1434 points of the allocated array. (See defines above).
1435
1436 We build the initial structure using the lists, and then convert
1437 it into the compressed table form which allows faster lookups
1438 (but cant be modified once converted).
a3621e74
YO
1439 */
1440
a3621e74
YO
1441 STRLEN transcount = 1;
1442
1e2e3d02
YO
1443 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1444 "%*sCompiling trie using list compiler\n",
1445 (int)depth * 2 + 2, ""));
446bd890
NC
1446
1447 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1448 sizeof(reg_trie_state) );
a3621e74
YO
1449 TRIE_LIST_NEW(1);
1450 next_alloc = 2;
1451
1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453
c445ea15
AL
1454 regnode * const noper = NEXTOPER( cur );
1455 U8 *uc = (U8*)STRING( noper );
1456 const U8 * const e = uc + STR_LEN( noper );
1457 U32 state = 1; /* required init */
1458 U16 charid = 0; /* sanity init */
1459 U8 *scan = (U8*)NULL; /* sanity init */
1460 STRLEN foldlen = 0; /* required init */
07be1b83 1461 U32 wordlen = 0; /* required init */
c445ea15
AL
1462 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463
3dab1dad 1464 if (OP(noper) != NOTHING) {
786e8c11 1465 for ( ; uc < e ; uc += len ) {
c445ea15 1466
786e8c11 1467 TRIE_READ_CHAR;
c445ea15 1468
786e8c11
YO
1469 if ( uvc < 256 ) {
1470 charid = trie->charmap[ uvc ];
c445ea15 1471 } else {
786e8c11
YO
1472 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1473 if ( !svpp ) {
1474 charid = 0;
1475 } else {
1476 charid=(U16)SvIV( *svpp );
1477 }
c445ea15 1478 }
786e8c11
YO
1479 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1480 if ( charid ) {
a3621e74 1481
786e8c11
YO
1482 U16 check;
1483 U32 newstate = 0;
a3621e74 1484
786e8c11
YO
1485 charid--;
1486 if ( !trie->states[ state ].trans.list ) {
1487 TRIE_LIST_NEW( state );
c445ea15 1488 }
786e8c11
YO
1489 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1490 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1491 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 break;
1493 }
1494 }
1495 if ( ! newstate ) {
1496 newstate = next_alloc++;
1497 TRIE_LIST_PUSH( state, charid, newstate );
1498 transcount++;
1499 }
1500 state = newstate;
1501 } else {
1502 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1503 }
a28509cc 1504 }
c445ea15 1505 }
3dab1dad 1506 TRIE_HANDLE_WORD(state);
a3621e74
YO
1507
1508 } /* end second pass */
1509
1e2e3d02
YO
1510 /* next alloc is the NEXT state to be allocated */
1511 trie->statecount = next_alloc;
446bd890
NC
1512 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1513 * sizeof(reg_trie_state) );
a3621e74 1514
3dab1dad
YO
1515 /* and now dump it out before we compress it */
1516 DEBUG_TRIE_COMPILE_MORE_r(
1517 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1518 );
a3621e74 1519
446bd890
NC
1520 trie->trans
1521 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1522 {
1523 U32 state;
a3621e74
YO
1524 U32 tp = 0;
1525 U32 zp = 0;
1526
1527
1528 for( state=1 ; state < next_alloc ; state ++ ) {
1529 U32 base=0;
1530
1531 /*
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1534 );
1535 */
1536
1537 if (trie->states[state].trans.list) {
1538 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1539 U16 maxid=minid;
a28509cc 1540 U16 idx;
a3621e74
YO
1541
1542 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1543 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1544 if ( forid < minid ) {
1545 minid=forid;
1546 } else if ( forid > maxid ) {
1547 maxid=forid;
1548 }
a3621e74
YO
1549 }
1550 if ( transcount < tp + maxid - minid + 1) {
1551 transcount *= 2;
446bd890
NC
1552 trie->trans
1553 = PerlMemShared_realloc( trie->trans,
1554 transcount
1555 * sizeof(reg_trie_trans) );
a3621e74
YO
1556 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1557 }
1558 base = trie->uniquecharcount + tp - minid;
1559 if ( maxid == minid ) {
1560 U32 set = 0;
1561 for ( ; zp < tp ; zp++ ) {
1562 if ( ! trie->trans[ zp ].next ) {
1563 base = trie->uniquecharcount + zp - minid;
1564 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ zp ].check = state;
1566 set = 1;
1567 break;
1568 }
1569 }
1570 if ( !set ) {
1571 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1572 trie->trans[ tp ].check = state;
1573 tp++;
1574 zp = tp;
1575 }
1576 } else {
1577 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1578 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1579 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1580 trie->trans[ tid ].check = state;
1581 }
1582 tp += ( maxid - minid + 1 );
1583 }
1584 Safefree(trie->states[ state ].trans.list);
1585 }
1586 /*
1587 DEBUG_TRIE_COMPILE_MORE_r(
1588 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1589 );
1590 */
1591 trie->states[ state ].trans.base=base;
1592 }
cc601c31 1593 trie->lasttrans = tp + 1;
a3621e74
YO
1594 }
1595 } else {
1596 /*
1597 Second Pass -- Flat Table Representation.
1598
1599 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1600 We know that we will need Charcount+1 trans at most to store the data
1601 (one row per char at worst case) So we preallocate both structures
1602 assuming worst case.
1603
1604 We then construct the trie using only the .next slots of the entry
1605 structs.
1606
1607 We use the .check field of the first entry of the node temporarily to
1608 make compression both faster and easier by keeping track of how many non
1609 zero fields are in the node.
1610
1611 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1612 transition.
1613
1614 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1615 number representing the first entry of the node, and state as a
1616 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1617 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1618 are 2 entrys per node. eg:
1619
1620 A B A B
1621 1. 2 4 1. 3 7
1622 2. 0 3 3. 0 5
1623 3. 0 0 5. 0 0
1624 4. 0 0 7. 0 0
1625
1626 The table is internally in the right hand, idx form. However as we also
1627 have to deal with the states array which is indexed by nodenum we have to
1628 use TRIE_NODENUM() to convert.
1629
1630 */
1e2e3d02
YO
1631 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1632 "%*sCompiling trie using table compiler\n",
1633 (int)depth * 2 + 2, ""));
3dab1dad 1634
446bd890
NC
1635 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1636 * trie->uniquecharcount + 1,
1637 sizeof(reg_trie_trans) );
1638 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1639 sizeof(reg_trie_state) );
a3621e74
YO
1640 next_alloc = trie->uniquecharcount + 1;
1641
3dab1dad 1642
a3621e74
YO
1643 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1644
c445ea15 1645 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1646 const U8 *uc = (U8*)STRING( noper );
1647 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1648
1649 U32 state = 1; /* required init */
1650
1651 U16 charid = 0; /* sanity init */
1652 U32 accept_state = 0; /* sanity init */
1653 U8 *scan = (U8*)NULL; /* sanity init */
1654
1655 STRLEN foldlen = 0; /* required init */
07be1b83 1656 U32 wordlen = 0; /* required init */
a3621e74
YO
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658
3dab1dad 1659 if ( OP(noper) != NOTHING ) {
786e8c11 1660 for ( ; uc < e ; uc += len ) {
a3621e74 1661
786e8c11 1662 TRIE_READ_CHAR;
a3621e74 1663
786e8c11
YO
1664 if ( uvc < 256 ) {
1665 charid = trie->charmap[ uvc ];
1666 } else {
1667 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1668 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1669 }
786e8c11
YO
1670 if ( charid ) {
1671 charid--;
1672 if ( !trie->trans[ state + charid ].next ) {
1673 trie->trans[ state + charid ].next = next_alloc;
1674 trie->trans[ state ].check++;
1675 next_alloc += trie->uniquecharcount;
1676 }
1677 state = trie->trans[ state + charid ].next;
1678 } else {
1679 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1680 }
1681 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1682 }
a3621e74 1683 }
3dab1dad
YO
1684 accept_state = TRIE_NODENUM( state );
1685 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1686
1687 } /* end second pass */
1688
3dab1dad
YO
1689 /* and now dump it out before we compress it */
1690 DEBUG_TRIE_COMPILE_MORE_r(
1691 dump_trie_interim_table(trie,next_alloc,depth+1)
1692 );
a3621e74 1693
a3621e74
YO
1694 {
1695 /*
1696 * Inplace compress the table.*
1697
1698 For sparse data sets the table constructed by the trie algorithm will
1699 be mostly 0/FAIL transitions or to put it another way mostly empty.
1700 (Note that leaf nodes will not contain any transitions.)
1701
1702 This algorithm compresses the tables by eliminating most such
1703 transitions, at the cost of a modest bit of extra work during lookup:
1704
1705 - Each states[] entry contains a .base field which indicates the
1706 index in the state[] array wheres its transition data is stored.
1707
1708 - If .base is 0 there are no valid transitions from that node.
1709
1710 - If .base is nonzero then charid is added to it to find an entry in
1711 the trans array.
1712
1713 -If trans[states[state].base+charid].check!=state then the
1714 transition is taken to be a 0/Fail transition. Thus if there are fail
1715 transitions at the front of the node then the .base offset will point
1716 somewhere inside the previous nodes data (or maybe even into a node
1717 even earlier), but the .check field determines if the transition is
1718 valid.
1719
786e8c11 1720 XXX - wrong maybe?
a3621e74
YO
1721 The following process inplace converts the table to the compressed
1722 table: We first do not compress the root node 1,and mark its all its
1723 .check pointers as 1 and set its .base pointer as 1 as well. This
1724 allows to do a DFA construction from the compressed table later, and
1725 ensures that any .base pointers we calculate later are greater than
1726 0.
1727
1728 - We set 'pos' to indicate the first entry of the second node.
1729
1730 - We then iterate over the columns of the node, finding the first and
1731 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1732 and set the .check pointers accordingly, and advance pos
1733 appropriately and repreat for the next node. Note that when we copy
1734 the next pointers we have to convert them from the original
1735 NODEIDX form to NODENUM form as the former is not valid post
1736 compression.
1737
1738 - If a node has no transitions used we mark its base as 0 and do not
1739 advance the pos pointer.
1740
1741 - If a node only has one transition we use a second pointer into the
1742 structure to fill in allocated fail transitions from other states.
1743 This pointer is independent of the main pointer and scans forward
1744 looking for null transitions that are allocated to a state. When it
1745 finds one it writes the single transition into the "hole". If the
786e8c11 1746 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1747
1748 - Once compressed we can Renew/realloc the structures to release the
1749 excess space.
1750
1751 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1752 specifically Fig 3.47 and the associated pseudocode.
1753
1754 demq
1755 */
a3b680e6 1756 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1757 U32 state, charid;
a3621e74 1758 U32 pos = 0, zp=0;
1e2e3d02 1759 trie->statecount = laststate;
a3621e74
YO
1760
1761 for ( state = 1 ; state < laststate ; state++ ) {
1762 U8 flag = 0;
a28509cc
AL
1763 const U32 stateidx = TRIE_NODEIDX( state );
1764 const U32 o_used = trie->trans[ stateidx ].check;
1765 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1766 trie->trans[ stateidx ].check = 0;
1767
1768 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1769 if ( flag || trie->trans[ stateidx + charid ].next ) {
1770 if ( trie->trans[ stateidx + charid ].next ) {
1771 if (o_used == 1) {
1772 for ( ; zp < pos ; zp++ ) {
1773 if ( ! trie->trans[ zp ].next ) {
1774 break;
1775 }
1776 }
1777 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1778 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ zp ].check = state;
1780 if ( ++zp > pos ) pos = zp;
1781 break;
1782 }
1783 used--;
1784 }
1785 if ( !flag ) {
1786 flag = 1;
1787 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1788 }
1789 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ pos ].check = state;
1791 pos++;
1792 }
1793 }
1794 }
cc601c31 1795 trie->lasttrans = pos + 1;
446bd890
NC
1796 trie->states = PerlMemShared_realloc( trie->states, laststate
1797 * sizeof(reg_trie_state) );
a3621e74 1798 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1799 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1800 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1801 (int)depth * 2 + 2,"",
1802 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1803 (IV)next_alloc,
1804 (IV)pos,
a3621e74
YO
1805 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1806 );
1807
1808 } /* end table compress */
1809 }
1e2e3d02
YO
1810 DEBUG_TRIE_COMPILE_MORE_r(
1811 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1812 (int)depth * 2 + 2, "",
1813 (UV)trie->statecount,
1814 (UV)trie->lasttrans)
1815 );
cc601c31 1816 /* resize the trans array to remove unused space */
446bd890
NC
1817 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1818 * sizeof(reg_trie_trans) );
a3621e74 1819
3dab1dad
YO
1820 /* and now dump out the compressed format */
1821 DEBUG_TRIE_COMPILE_r(
1822 dump_trie(trie,depth+1)
1823 );
07be1b83 1824
3dab1dad 1825 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1826 U8 nodetype =(U8)(flags & 0xFF);
1827 char *str=NULL;
786e8c11 1828
07be1b83 1829#ifdef DEBUGGING
e62cc96a 1830 regnode *optimize = NULL;
b57a0404
JH
1831 U32 mjd_offset = 0;
1832 U32 mjd_nodelen = 0;
07be1b83 1833#endif
a3621e74 1834 /*
3dab1dad
YO
1835 This means we convert either the first branch or the first Exact,
1836 depending on whether the thing following (in 'last') is a branch
1837 or not and whther first is the startbranch (ie is it a sub part of
1838 the alternation or is it the whole thing.)
1839 Assuming its a sub part we conver the EXACT otherwise we convert
1840 the whole branch sequence, including the first.
a3621e74 1841 */
3dab1dad 1842 /* Find the node we are going to overwrite */
7f69552c 1843 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1844 /* branch sub-chain */
3dab1dad 1845 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1846 DEBUG_r({
1847 mjd_offset= Node_Offset((convert));
1848 mjd_nodelen= Node_Length((convert));
1849 });
7f69552c
YO
1850 /* whole branch chain */
1851 } else {
1852 DEBUG_r({
1853 const regnode *nop = NEXTOPER( convert );
1854 mjd_offset= Node_Offset((nop));
1855 mjd_nodelen= Node_Length((nop));
1856 });
07be1b83 1857 }
7f69552c 1858
07be1b83
YO
1859 DEBUG_OPTIMISE_r(
1860 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1861 (int)depth * 2 + 2, "",
786e8c11 1862 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1863 );
a3621e74 1864
3dab1dad
YO
1865 /* But first we check to see if there is a common prefix we can
1866 split out as an EXACT and put in front of the TRIE node. */
1867 trie->startstate= 1;
786e8c11 1868 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1869 U32 state;
1e2e3d02 1870 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1871 U32 ofs = 0;
8e11feef
RGS
1872 I32 idx = -1;
1873 U32 count = 0;
1874 const U32 base = trie->states[ state ].trans.base;
a3621e74 1875
3dab1dad 1876 if ( trie->states[state].wordnum )
8e11feef 1877 count = 1;
a3621e74 1878
8e11feef 1879 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1880 if ( ( base + ofs >= trie->uniquecharcount ) &&
1881 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1882 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1883 {
3dab1dad 1884 if ( ++count > 1 ) {
8e11feef 1885 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1886 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887 if ( state == 1 ) break;
3dab1dad
YO
1888 if ( count == 2 ) {
1889 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1890 DEBUG_OPTIMISE_r(
8e11feef
RGS
1891 PerlIO_printf(Perl_debug_log,
1892 "%*sNew Start State=%"UVuf" Class: [",
1893 (int)depth * 2 + 2, "",
786e8c11 1894 (UV)state));
be8e71aa
YO
1895 if (idx >= 0) {
1896 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898
3dab1dad 1899 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1900 if ( folder )
1901 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1902 DEBUG_OPTIMISE_r(
07be1b83 1903 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1904 );
8e11feef
RGS
1905 }
1906 }
1907 TRIE_BITMAP_SET(trie,*ch);
1908 if ( folder )
1909 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1910 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1911 }
1912 idx = ofs;
1913 }
3dab1dad
YO
1914 }
1915 if ( count == 1 ) {
1916 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1917 char *ch = SvPV_nolen( *tmp );
1918 DEBUG_OPTIMISE_r({
1919 SV *sv=sv_newmortal();
8e11feef
RGS
1920 PerlIO_printf( Perl_debug_log,
1921 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1922 (int)depth * 2 + 2, "",
de734bd5
A
1923 (UV)state, (UV)idx,
1924 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1925 PL_colors[0], PL_colors[1],
1926 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1927 PERL_PV_ESCAPE_FIRSTCHAR
1928 )
1929 );
1930 });
3dab1dad
YO
1931 if ( state==1 ) {
1932 OP( convert ) = nodetype;
1933 str=STRING(convert);
1934 STR_LEN(convert)=0;
1935 }
de734bd5
A
1936 while (*ch) {
1937 *str++ = *ch++;
1938 STR_LEN(convert)++;
1939 }
1940
8e11feef 1941 } else {
f9049ba1 1942#ifdef DEBUGGING
8e11feef
RGS
1943 if (state>1)
1944 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1945#endif
8e11feef
RGS
1946 break;
1947 }
1948 }
3dab1dad 1949 if (str) {
8e11feef 1950 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1951 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1952 trie->startstate = state;
07be1b83
YO
1953 trie->minlen -= (state - 1);
1954 trie->maxlen -= (state - 1);
1955 DEBUG_r({
1956 regnode *fix = convert;
de734bd5 1957 U32 word = trie->wordcount;
07be1b83
YO
1958 mjd_nodelen++;
1959 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1960 while( ++fix < n ) {
1961 Set_Node_Offset_Length(fix, 0, 0);
1962 }
de734bd5
A
1963 while (word--) {
1964 SV ** const tmp = av_fetch( trie->words, word, 0 );
1965 if (tmp) {
1966 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1967 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1968 else
1969 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1970 }
1971 }
07be1b83 1972 });
8e11feef
RGS
1973 if (trie->maxlen) {
1974 convert = n;
1975 } else {
3dab1dad 1976 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1977 DEBUG_r(optimize= n);
3dab1dad
YO
1978 }
1979 }
1980 }
a5ca303d
YO
1981 if (!jumper)
1982 jumper = last;
3dab1dad 1983 if ( trie->maxlen ) {
8e11feef
RGS
1984 NEXT_OFF( convert ) = (U16)(tail - convert);
1985 ARG_SET( convert, data_slot );
786e8c11
YO
1986 /* Store the offset to the first unabsorbed branch in
1987 jump[0], which is otherwise unused by the jump logic.
1988 We use this when dumping a trie and during optimisation. */
1989 if (trie->jump)
7f69552c 1990 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1991
786e8c11
YO
1992 /* XXXX */
1993 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1994 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1995 {
1996 OP( convert ) = TRIEC;
1997 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 1998 PerlMemShared_free(trie->bitmap);
786e8c11
YO
1999 trie->bitmap= NULL;
2000 } else
2001 OP( convert ) = TRIE;
a3621e74 2002
3dab1dad
YO
2003 /* store the type in the flags */
2004 convert->flags = nodetype;
a5ca303d
YO
2005 DEBUG_r({
2006 optimize = convert
2007 + NODE_STEP_REGNODE
2008 + regarglen[ OP( convert ) ];
2009 });
2010 /* XXX We really should free up the resource in trie now,
2011 as we won't use them - (which resources?) dmq */
3dab1dad 2012 }
a3621e74 2013 /* needed for dumping*/
e62cc96a 2014 DEBUG_r(if (optimize) {
07be1b83 2015 regnode *opt = convert;
e62cc96a 2016 while ( ++opt < optimize) {
07be1b83
YO
2017 Set_Node_Offset_Length(opt,0,0);
2018 }
786e8c11
YO
2019 /*
2020 Try to clean up some of the debris left after the
2021 optimisation.
a3621e74 2022 */
786e8c11 2023 while( optimize < jumper ) {
07be1b83 2024 mjd_nodelen += Node_Length((optimize));
a3621e74 2025 OP( optimize ) = OPTIMIZED;
07be1b83 2026 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2027 optimize++;
2028 }
07be1b83 2029 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2030 });
2031 } /* end node insert */
07be1b83 2032#ifndef DEBUGGING
6e8b4190 2033 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2034#endif
786e8c11
YO
2035 return trie->jump
2036 ? MADE_JUMP_TRIE
2037 : trie->startstate>1
2038 ? MADE_EXACT_TRIE
2039 : MADE_TRIE;
2040}
2041
2042STATIC void
2043S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2044{
2045/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2046
2047 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2048 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2049 ISBN 0-201-10088-6
2050
2051 We find the fail state for each state in the trie, this state is the longest proper
2052 suffix of the current states 'word' that is also a proper prefix of another word in our
2053 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2054 the DFA not to have to restart after its tried and failed a word at a given point, it
2055 simply continues as though it had been matching the other word in the first place.
2056 Consider
2057 'abcdgu'=~/abcdefg|cdgu/
2058 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2059 fail, which would bring use to the state representing 'd' in the second word where we would
2060 try 'g' and succeed, prodceding to match 'cdgu'.
2061 */
2062 /* add a fail transition */
3251b653
NC
2063 const U32 trie_offset = ARG(source);
2064 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2065 U32 *q;
2066 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2067 const U32 numstates = trie->statecount;
786e8c11
YO
2068 const U32 ubound = trie->lasttrans + ucharcount;
2069 U32 q_read = 0;
2070 U32 q_write = 0;
2071 U32 charid;
2072 U32 base = trie->states[ 1 ].trans.base;
2073 U32 *fail;
2074 reg_ac_data *aho;
2075 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2076 GET_RE_DEBUG_FLAGS_DECL;
2077#ifndef DEBUGGING
2078 PERL_UNUSED_ARG(depth);
2079#endif
2080
2081
2082 ARG_SET( stclass, data_slot );
446bd890 2083 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2084 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2085 aho->trie=trie_offset;
446bd890
NC
2086 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2087 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2088 Newxz( q, numstates, U32);
446bd890 2089 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2090 aho->refcount = 1;
2091 fail = aho->fail;
2092 /* initialize fail[0..1] to be 1 so that we always have
2093 a valid final fail state */
2094 fail[ 0 ] = fail[ 1 ] = 1;
2095
2096 for ( charid = 0; charid < ucharcount ; charid++ ) {
2097 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2098 if ( newstate ) {
2099 q[ q_write ] = newstate;
2100 /* set to point at the root */
2101 fail[ q[ q_write++ ] ]=1;
2102 }
2103 }
2104 while ( q_read < q_write) {
2105 const U32 cur = q[ q_read++ % numstates ];
2106 base = trie->states[ cur ].trans.base;
2107
2108 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2109 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2110 if (ch_state) {
2111 U32 fail_state = cur;
2112 U32 fail_base;
2113 do {
2114 fail_state = fail[ fail_state ];
2115 fail_base = aho->states[ fail_state ].trans.base;
2116 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2117
2118 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2119 fail[ ch_state ] = fail_state;
2120 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2121 {
2122 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2123 }
2124 q[ q_write++ % numstates] = ch_state;
2125 }
2126 }
2127 }
2128 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2129 when we fail in state 1, this allows us to use the
2130 charclass scan to find a valid start char. This is based on the principle
2131 that theres a good chance the string being searched contains lots of stuff
2132 that cant be a start char.
2133 */
2134 fail[ 0 ] = fail[ 1 ] = 0;
2135 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2136 PerlIO_printf(Perl_debug_log,
2137 "%*sStclass Failtable (%"UVuf" states): 0",
2138 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2139 );
786e8c11
YO
2140 for( q_read=1; q_read<numstates; q_read++ ) {
2141 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2142 }
2143 PerlIO_printf(Perl_debug_log, "\n");
2144 });
2145 Safefree(q);
2146 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2147}
2148
786e8c11 2149
a3621e74 2150/*
5d1c421c
JH
2151 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2152 * These need to be revisited when a newer toolchain becomes available.
2153 */
2154#if defined(__sparc64__) && defined(__GNUC__)
2155# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2156# undef SPARC64_GCC_WORKAROUND
2157# define SPARC64_GCC_WORKAROUND 1
2158# endif
2159#endif
2160
07be1b83 2161#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2162 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2163 SV * const mysv=sv_newmortal(); \
2164 regnode *Next = regnext(scan); \
2165 regprop(RExC_rx, mysv, scan); \
7f69552c 2166 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2167 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2168 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2169 }});
07be1b83 2170
1de06328
YO
2171
2172
2173
2174
07be1b83
YO
2175#define JOIN_EXACT(scan,min,flags) \
2176 if (PL_regkind[OP(scan)] == EXACT) \
2177 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2178
be8e71aa 2179STATIC U32
07be1b83
YO
2180S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2181 /* Merge several consecutive EXACTish nodes into one. */
2182 regnode *n = regnext(scan);
2183 U32 stringok = 1;
2184 regnode *next = scan + NODE_SZ_STR(scan);
2185 U32 merged = 0;
2186 U32 stopnow = 0;
2187#ifdef DEBUGGING
2188 regnode *stop = scan;
72f13be8 2189 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2190#else
d47053eb
RGS
2191 PERL_UNUSED_ARG(depth);
2192#endif
2193#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2194 PERL_UNUSED_ARG(flags);
2195 PERL_UNUSED_ARG(val);
07be1b83 2196#endif
07be1b83
YO
2197 DEBUG_PEEP("join",scan,depth);
2198
2199 /* Skip NOTHING, merge EXACT*. */
2200 while (n &&
2201 ( PL_regkind[OP(n)] == NOTHING ||
2202 (stringok && (OP(n) == OP(scan))))
2203 && NEXT_OFF(n)
2204 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2205
2206 if (OP(n) == TAIL || n > next)
2207 stringok = 0;
2208 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2209 DEBUG_PEEP("skip:",n,depth);
2210 NEXT_OFF(scan) += NEXT_OFF(n);
2211 next = n + NODE_STEP_REGNODE;
2212#ifdef DEBUGGING
2213 if (stringok)
2214 stop = n;
2215#endif
2216 n = regnext(n);
2217 }
2218 else if (stringok) {
786e8c11 2219 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2220 regnode * const nnext = regnext(n);
2221
2222 DEBUG_PEEP("merg",n,depth);
2223
2224 merged++;
2225 if (oldl + STR_LEN(n) > U8_MAX)
2226 break;
2227 NEXT_OFF(scan) += NEXT_OFF(n);
2228 STR_LEN(scan) += STR_LEN(n);
2229 next = n + NODE_SZ_STR(n);
2230 /* Now we can overwrite *n : */
2231 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2232#ifdef DEBUGGING
2233 stop = next - 1;
2234#endif
2235 n = nnext;
2236 if (stopnow) break;
2237 }
2238
d47053eb
RGS
2239#ifdef EXPERIMENTAL_INPLACESCAN
2240 if (flags && !NEXT_OFF(n)) {
2241 DEBUG_PEEP("atch", val, depth);
2242 if (reg_off_by_arg[OP(n)]) {
2243 ARG_SET(n, val - n);
2244 }
2245 else {
2246 NEXT_OFF(n) = val - n;
2247 }
2248 stopnow = 1;
2249 }
07be1b83
YO
2250#endif
2251 }
2252
2253 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2254 /*
2255 Two problematic code points in Unicode casefolding of EXACT nodes:
2256
2257 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2258 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2259
2260 which casefold to
2261
2262 Unicode UTF-8
2263
2264 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2265 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2266
2267 This means that in case-insensitive matching (or "loose matching",
2268 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2269 length of the above casefolded versions) can match a target string
2270 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2271 This would rather mess up the minimum length computation.
2272
2273 What we'll do is to look for the tail four bytes, and then peek
2274 at the preceding two bytes to see whether we need to decrease
2275 the minimum length by four (six minus two).
2276
2277 Thanks to the design of UTF-8, there cannot be false matches:
2278 A sequence of valid UTF-8 bytes cannot be a subsequence of
2279 another valid sequence of UTF-8 bytes.
2280
2281 */
2282 char * const s0 = STRING(scan), *s, *t;
2283 char * const s1 = s0 + STR_LEN(scan) - 1;
2284 char * const s2 = s1 - 4;
e294cc5d
JH
2285#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2286 const char t0[] = "\xaf\x49\xaf\x42";
2287#else
07be1b83 2288 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2289#endif
07be1b83
YO
2290 const char * const t1 = t0 + 3;
2291
2292 for (s = s0 + 2;
2293 s < s2 && (t = ninstr(s, s1, t0, t1));
2294 s = t + 4) {
e294cc5d
JH
2295#ifdef EBCDIC
2296 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2297 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2298#else
07be1b83
YO
2299 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2300 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2301#endif
07be1b83
YO
2302 *min -= 4;
2303 }
2304 }
2305
2306#ifdef DEBUGGING
2307 /* Allow dumping */
2308 n = scan + NODE_SZ_STR(scan);
2309 while (n <= stop) {
2310 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2311 OP(n) = OPTIMIZED;
2312 NEXT_OFF(n) = 0;
2313 }
2314 n++;
2315 }
2316#endif
2317 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2318 return stopnow;
2319}
2320
653099ff
GS
2321/* REx optimizer. Converts nodes into quickier variants "in place".
2322 Finds fixed substrings. */
2323
a0288114 2324/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2325 to the position after last scanned or to NULL. */
2326
40d049e4
YO
2327#define INIT_AND_WITHP \
2328 assert(!and_withp); \
2329 Newx(and_withp,1,struct regnode_charclass_class); \
2330 SAVEFREEPV(and_withp)
07be1b83 2331
b515a41d
YO
2332/* this is a chain of data about sub patterns we are processing that
2333 need to be handled seperately/specially in study_chunk. Its so
2334 we can simulate recursion without losing state. */
2335struct scan_frame;
2336typedef struct scan_frame {
2337 regnode *last; /* last node to process in this frame */
2338 regnode *next; /* next node to process when last is reached */
2339 struct scan_frame *prev; /*previous frame*/
2340 I32 stop; /* what stopparen do we use */
2341} scan_frame;
2342
76e3520e 2343STATIC I32
40d049e4 2344S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2345 I32 *minlenp, I32 *deltap,
40d049e4
YO
2346 regnode *last,
2347 scan_data_t *data,
2348 I32 stopparen,
2349 U8* recursed,
2350 struct regnode_charclass_class *and_withp,
2351 U32 flags, U32 depth)
c277df42
IZ
2352 /* scanp: Start here (read-write). */
2353 /* deltap: Write maxlen-minlen here. */
2354 /* last: Stop before this one. */
40d049e4
YO
2355 /* data: string data about the pattern */
2356 /* stopparen: treat close N as END */
2357 /* recursed: which subroutines have we recursed into */
2358 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2359{
97aff369 2360 dVAR;
c277df42
IZ
2361 I32 min = 0, pars = 0, code;
2362 regnode *scan = *scanp, *next;
2363 I32 delta = 0;
2364 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2365 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2366 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2367 scan_data_t data_fake;
a3621e74 2368 SV *re_trie_maxbuff = NULL;
786e8c11 2369 regnode *first_non_open = scan;
e2e6a0f1 2370 I32 stopmin = I32_MAX;
8aa23a47
YO
2371 scan_frame *frame = NULL;
2372
a3621e74 2373 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2374
13a24bad 2375#ifdef DEBUGGING
40d049e4 2376 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2377#endif
40d049e4 2378
786e8c11 2379 if ( depth == 0 ) {
40d049e4 2380 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2381 first_non_open=regnext(first_non_open);
2382 }
2383
b81d288d 2384
8aa23a47
YO
2385 fake_study_recurse:
2386 while ( scan && OP(scan) != END && scan < last ){
2387 /* Peephole optimizer: */
2388 DEBUG_STUDYDATA(data,depth);
2389 DEBUG_PEEP("Peep",scan,depth);
2390 JOIN_EXACT(scan,&min,0);
2391
2392 /* Follow the next-chain of the current node and optimize
2393 away all the NOTHINGs from it. */
2394 if (OP(scan) != CURLYX) {
2395 const int max = (reg_off_by_arg[OP(scan)]
2396 ? I32_MAX
2397 /* I32 may be smaller than U16 on CRAYs! */
2398 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2399 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2400 int noff;
2401 regnode *n = scan;
2402
2403 /* Skip NOTHING and LONGJMP. */
2404 while ((n = regnext(n))
2405 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2406 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2407 && off + noff < max)
2408 off += noff;
2409 if (reg_off_by_arg[OP(scan)])
2410 ARG(scan) = off;
2411 else
2412 NEXT_OFF(scan) = off;
2413 }
a3621e74 2414
c277df42 2415
8aa23a47
YO
2416
2417 /* The principal pseudo-switch. Cannot be a switch, since we
2418 look into several different things. */
2419 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2420 || OP(scan) == IFTHEN) {
2421 next = regnext(scan);
2422 code = OP(scan);
2423 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2424
2425 if (OP(next) == code || code == IFTHEN) {
2426 /* NOTE - There is similar code to this block below for handling
2427 TRIE nodes on a re-study. If you change stuff here check there
2428 too. */
2429 I32 max1 = 0, min1 = I32_MAX, num = 0;
2430 struct regnode_charclass_class accum;
2431 regnode * const startbranch=scan;
2432
2433 if (flags & SCF_DO_SUBSTR)
2434 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2435 if (flags & SCF_DO_STCLASS)
2436 cl_init_zero(pRExC_state, &accum);
2437
2438 while (OP(scan) == code) {
2439 I32 deltanext, minnext, f = 0, fake;
2440 struct regnode_charclass_class this_class;
2441
2442 num++;
2443 data_fake.flags = 0;
2444 if (data) {
2445 data_fake.whilem_c = data->whilem_c;
2446 data_fake.last_closep = data->last_closep;
2447 }
2448 else
2449 data_fake.last_closep = &fake;
58e23c8d
YO
2450
2451 data_fake.pos_delta = delta;
8aa23a47
YO
2452 next = regnext(scan);
2453 scan = NEXTOPER(scan);
2454 if (code != BRANCH)
c277df42 2455 scan = NEXTOPER(scan);
8aa23a47
YO
2456 if (flags & SCF_DO_STCLASS) {
2457 cl_init(pRExC_state, &this_class);
2458 data_fake.start_class = &this_class;
2459 f = SCF_DO_STCLASS_AND;
58e23c8d 2460 }
8aa23a47
YO
2461 if (flags & SCF_WHILEM_VISITED_POS)
2462 f |= SCF_WHILEM_VISITED_POS;
2463
2464 /* we suppose the run is continuous, last=next...*/
2465 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2466 next, &data_fake,
2467 stopparen, recursed, NULL, f,depth+1);
2468 if (min1 > minnext)
2469 min1 = minnext;
2470 if (max1 < minnext + deltanext)
2471 max1 = minnext + deltanext;
2472 if (deltanext == I32_MAX)
2473 is_inf = is_inf_internal = 1;
2474 scan = next;
2475 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2476 pars++;
2477 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2478 if ( stopmin > minnext)
2479 stopmin = min + min1;
2480 flags &= ~SCF_DO_SUBSTR;
2481 if (data)
2482 data->flags |= SCF_SEEN_ACCEPT;
2483 }
2484 if (data) {
2485 if (data_fake.flags & SF_HAS_EVAL)
2486 data->flags |= SF_HAS_EVAL;
2487 data->whilem_c = data_fake.whilem_c;
3dab1dad 2488 }
8aa23a47
YO
2489 if (flags & SCF_DO_STCLASS)
2490 cl_or(pRExC_state, &accum, &this_class);
2491 }
2492 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2493 min1 = 0;
2494 if (flags & SCF_DO_SUBSTR) {
2495 data->pos_min += min1;
2496 data->pos_delta += max1 - min1;
2497 if (max1 != min1 || is_inf)
2498 data->longest = &(data->longest_float);
2499 }
2500 min += min1;
2501 delta += max1 - min1;
2502 if (flags & SCF_DO_STCLASS_OR) {
2503 cl_or(pRExC_state, data->start_class, &accum);
2504 if (min1) {
2505 cl_and(data->start_class, and_withp);
2506 flags &= ~SCF_DO_STCLASS;
653099ff 2507 }
8aa23a47
YO
2508 }
2509 else if (flags & SCF_DO_STCLASS_AND) {
2510 if (min1) {
2511 cl_and(data->start_class, &accum);
2512 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2513 }
8aa23a47
YO
2514 else {
2515 /* Switch to OR mode: cache the old value of
2516 * data->start_class */
2517 INIT_AND_WITHP;
2518 StructCopy(data->start_class, and_withp,
2519 struct regnode_charclass_class);
2520 flags &= ~SCF_DO_STCLASS_AND;
2521 StructCopy(&accum, data->start_class,
2522 struct regnode_charclass_class);
2523 flags |= SCF_DO_STCLASS_OR;
2524 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2525 }
8aa23a47 2526 }
a3621e74 2527
8aa23a47
YO
2528 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2529 /* demq.
a3621e74 2530
8aa23a47
YO
2531 Assuming this was/is a branch we are dealing with: 'scan' now
2532 points at the item that follows the branch sequence, whatever
2533 it is. We now start at the beginning of the sequence and look
2534 for subsequences of
a3621e74 2535
8aa23a47
YO
2536 BRANCH->EXACT=>x1
2537 BRANCH->EXACT=>x2
2538 tail
a3621e74 2539
8aa23a47 2540 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2541
8aa23a47
YO
2542 If we can find such a subseqence we need to turn the first
2543 element into a trie and then add the subsequent branch exact
2544 strings to the trie.
a3621e74 2545
8aa23a47 2546 We have two cases
a3621e74 2547
8aa23a47 2548 1. patterns where the whole set of branch can be converted.
a3621e74 2549
8aa23a47 2550 2. patterns where only a subset can be converted.
a3621e74 2551
8aa23a47
YO
2552 In case 1 we can replace the whole set with a single regop
2553 for the trie. In case 2 we need to keep the start and end
2554 branchs so
a3621e74 2555
8aa23a47
YO
2556 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2557 becomes BRANCH TRIE; BRANCH X;
786e8c11 2558
8aa23a47
YO
2559 There is an additional case, that being where there is a
2560 common prefix, which gets split out into an EXACT like node
2561 preceding the TRIE node.
a3621e74 2562
8aa23a47
YO
2563 If x(1..n)==tail then we can do a simple trie, if not we make
2564 a "jump" trie, such that when we match the appropriate word
2565 we "jump" to the appopriate tail node. Essentailly we turn
2566 a nested if into a case structure of sorts.
b515a41d 2567
8aa23a47
YO
2568 */
2569
2570 int made=0;
2571 if (!re_trie_maxbuff) {
2572 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2573 if (!SvIOK(re_trie_maxbuff))
2574 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2575 }
2576 if ( SvIV(re_trie_maxbuff)>=0 ) {
2577 regnode *cur;
2578 regnode *first = (regnode *)NULL;
2579 regnode *last = (regnode *)NULL;
2580 regnode *tail = scan;
2581 U8 optype = 0;
2582 U32 count=0;
a3621e74
YO
2583
2584#ifdef DEBUGGING
8aa23a47 2585 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2586#endif
8aa23a47
YO
2587 /* var tail is used because there may be a TAIL
2588 regop in the way. Ie, the exacts will point to the
2589 thing following the TAIL, but the last branch will
2590 point at the TAIL. So we advance tail. If we
2591 have nested (?:) we may have to move through several
2592 tails.
2593 */
2594
2595 while ( OP( tail ) == TAIL ) {
2596 /* this is the TAIL generated by (?:) */
2597 tail = regnext( tail );
2598 }
a3621e74 2599
8aa23a47
YO
2600
2601 DEBUG_OPTIMISE_r({
2602 regprop(RExC_rx, mysv, tail );
2603 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2604 (int)depth * 2 + 2, "",
2605 "Looking for TRIE'able sequences. Tail node is: ",
2606 SvPV_nolen_const( mysv )
2607 );
2608 });
2609
2610 /*
2611
2612 step through the branches, cur represents each
2613 branch, noper is the first thing to be matched
2614 as part of that branch and noper_next is the
2615 regnext() of that node. if noper is an EXACT
2616 and noper_next is the same as scan (our current
2617 position in the regex) then the EXACT branch is
2618 a possible optimization target. Once we have
2619 two or more consequetive such branches we can
2620 create a trie of the EXACT's contents and stich
2621 it in place. If the sequence represents all of
2622 the branches we eliminate the whole thing and
2623 replace it with a single TRIE. If it is a
2624 subsequence then we need to stitch it in. This
2625 means the first branch has to remain, and needs
2626 to be repointed at the item on the branch chain
2627 following the last branch optimized. This could
2628 be either a BRANCH, in which case the
2629 subsequence is internal, or it could be the
2630 item following the branch sequence in which
2631 case the subsequence is at the end.
2632
2633 */
2634
2635 /* dont use tail as the end marker for this traverse */
2636 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2637 regnode * const noper = NEXTOPER( cur );
b515a41d 2638#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2639 regnode * const noper_next = regnext( noper );
b515a41d
YO
2640#endif
2641
8aa23a47
YO
2642 DEBUG_OPTIMISE_r({
2643 regprop(RExC_rx, mysv, cur);
2644 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2645 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2646
2647 regprop(RExC_rx, mysv, noper);
2648 PerlIO_printf( Perl_debug_log, " -> %s",
2649 SvPV_nolen_const(mysv));
2650
2651 if ( noper_next ) {
2652 regprop(RExC_rx, mysv, noper_next );
2653 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2654 SvPV_nolen_const(mysv));
2655 }
2656 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2657 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2658 });
2659 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2660 : PL_regkind[ OP( noper ) ] == EXACT )
2661 || OP(noper) == NOTHING )
786e8c11 2662#ifdef NOJUMPTRIE
8aa23a47 2663 && noper_next == tail
786e8c11 2664#endif
8aa23a47
YO
2665 && count < U16_MAX)
2666 {
2667 count++;
2668 if ( !first || optype == NOTHING ) {
2669 if (!first) first = cur;
2670 optype = OP( noper );
2671 } else {
2672 last = cur;
2673 }
2674 } else {
2675 if ( last ) {
2676 make_trie( pRExC_state,
2677 startbranch, first, cur, tail, count,
2678 optype, depth+1 );
2679 }
2680 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2681#ifdef NOJUMPTRIE
8aa23a47 2682 && noper_next == tail
786e8c11 2683#endif
8aa23a47
YO
2684 ){
2685 count = 1;
2686 first = cur;
2687 optype = OP( noper );
2688 } else {
2689 count = 0;
2690 first = NULL;
2691 optype = 0;
2692 }
2693 last = NULL;
2694 }
2695 }
2696 DEBUG_OPTIMISE_r({
2697 regprop(RExC_rx, mysv, cur);
2698 PerlIO_printf( Perl_debug_log,
2699 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2700 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2701
2702 });
2703 if ( last ) {
2704 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2705#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2706 if ( ((made == MADE_EXACT_TRIE &&
2707 startbranch == first)
2708 || ( first_non_open == first )) &&
2709 depth==0 ) {
2710 flags |= SCF_TRIE_RESTUDY;
2711 if ( startbranch == first
2712 && scan == tail )
2713 {
2714 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2715 }
2716 }
3dab1dad 2717#endif
8aa23a47
YO
2718 }
2719 }
2720
2721 } /* do trie */
2722
653099ff 2723 }
8aa23a47
YO
2724 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2725 scan = NEXTOPER(NEXTOPER(scan));
2726 } else /* single branch is optimized. */
2727 scan = NEXTOPER(scan);
2728 continue;
2729 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2730 scan_frame *newframe = NULL;
2731 I32 paren;
2732 regnode *start;
2733 regnode *end;
2734
2735 if (OP(scan) != SUSPEND) {
2736 /* set the pointer */
2737 if (OP(scan) == GOSUB) {
2738 paren = ARG(scan);
2739 RExC_recurse[ARG2L(scan)] = scan;
2740 start = RExC_open_parens[paren-1];
2741 end = RExC_close_parens[paren-1];
2742 } else {
2743 paren = 0;
f8fc2ecf 2744 start = RExC_rxi->program + 1;
8aa23a47
YO
2745 end = RExC_opend;
2746 }
2747 if (!recursed) {
2748 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2749 SAVEFREEPV(recursed);
2750 }
2751 if (!PAREN_TEST(recursed,paren+1)) {
2752 PAREN_SET(recursed,paren+1);
2753 Newx(newframe,1,scan_frame);
2754 } else {
2755 if (flags & SCF_DO_SUBSTR) {
2756 scan_commit(pRExC_state,data,minlenp);
2757 data->longest = &(data->longest_float);
2758 }
2759 is_inf = is_inf_internal = 1;
2760 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2761 cl_anything(pRExC_state, data->start_class);
2762 flags &= ~SCF_DO_STCLASS;
2763 }
2764 } else {
2765 Newx(newframe,1,scan_frame);
2766 paren = stopparen;
2767 start = scan+2;
2768 end = regnext(scan);
2769 }
2770 if (newframe) {
2771 assert(start);
2772 assert(end);
2773 SAVEFREEPV(newframe);
2774 newframe->next = regnext(scan);
2775 newframe->last = last;
2776 newframe->stop = stopparen;
2777 newframe->prev = frame;
2778
2779 frame = newframe;
2780 scan = start;
2781 stopparen = paren;
2782 last = end;
2783
2784 continue;
2785 }
2786 }
2787 else if (OP(scan) == EXACT) {
2788 I32 l = STR_LEN(scan);
2789 UV uc;
2790 if (UTF) {
2791 const U8 * const s = (U8*)STRING(scan);
2792 l = utf8_length(s, s + l);
2793 uc = utf8_to_uvchr(s, NULL);
2794 } else {
2795 uc = *((U8*)STRING(scan));
2796 }
2797 min += l;
2798 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2799 /* The code below prefers earlier match for fixed
2800 offset, later match for variable offset. */
2801 if (data->last_end == -1) { /* Update the start info. */
2802 data->last_start_min = data->pos_min;
2803 data->last_start_max = is_inf
2804 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2805 }
8aa23a47
YO
2806 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2807 if (UTF)
2808 SvUTF8_on(data->last_found);
2809 {
2810 SV * const sv = data->last_found;
2811 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2812 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2813 if (mg && mg->mg_len >= 0)
2814 mg->mg_len += utf8_length((U8*)STRING(scan),
2815 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2816 }
8aa23a47
YO
2817 data->last_end = data->pos_min + l;
2818 data->pos_min += l; /* As in the first entry. */
2819 data->flags &= ~SF_BEFORE_EOL;
2820 }
2821 if (flags & SCF_DO_STCLASS_AND) {
2822 /* Check whether it is compatible with what we know already! */
2823 int compat = 1;
2824
2825 if (uc >= 0x100 ||
2826 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2827 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2828 && (!(data->start_class->flags & ANYOF_FOLD)
2829 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2830 )
2831 compat = 0;
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2834 if (compat)
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 if (uc < 0x100)
2838 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2839 }
2840 else if (flags & SCF_DO_STCLASS_OR) {
2841 /* false positive possible if the class is case-folded */
2842 if (uc < 0x100)
2843 ANYOF_BITMAP_SET(data->start_class, uc);
2844 else
2845 data->start_class->flags |= ANYOF_UNICODE_ALL;
2846 data->start_class->flags &= ~ANYOF_EOS;
2847 cl_and(data->start_class, and_withp);
2848 }
2849 flags &= ~SCF_DO_STCLASS;
2850 }
2851 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2852 I32 l = STR_LEN(scan);
2853 UV uc = *((U8*)STRING(scan));
2854
2855 /* Search for fixed substrings supports EXACT only. */
2856 if (flags & SCF_DO_SUBSTR) {
2857 assert(data);
2858 scan_commit(pRExC_state, data, minlenp);
2859 }
2860 if (UTF) {
2861 const U8 * const s = (U8 *)STRING(scan);
2862 l = utf8_length(s, s + l);
2863 uc = utf8_to_uvchr(s, NULL);
2864 }
2865 min += l;
2866 if (flags & SCF_DO_SUBSTR)
2867 data->pos_min += l;
2868 if (flags & SCF_DO_STCLASS_AND) {
2869 /* Check whether it is compatible with what we know already! */
2870 int compat = 1;
2871
2872 if (uc >= 0x100 ||
2873 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2874 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2875 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2876 compat = 0;
2877 ANYOF_CLASS_ZERO(data->start_class);
2878 ANYOF_BITMAP_ZERO(data->start_class);
2879 if (compat) {
2880 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2881 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2882 data->start_class->flags |= ANYOF_FOLD;
2883 if (OP(scan) == EXACTFL)
2884 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2885 }
8aa23a47
YO
2886 }
2887 else if (flags & SCF_DO_STCLASS_OR) {
2888 if (data->start_class->flags & ANYOF_FOLD) {
2889 /* false positive possible if the class is case-folded.
2890 Assume that the locale settings are the same... */
1aa99e6b
IH
2891 if (uc < 0x100)
2892 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2893 data->start_class->flags &= ~ANYOF_EOS;
2894 }
8aa23a47 2895 cl_and(data->start_class, and_withp);
653099ff 2896 }
8aa23a47
YO
2897 flags &= ~SCF_DO_STCLASS;
2898 }
2899 else if (strchr((const char*)PL_varies,OP(scan))) {
2900 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2901 I32 f = flags, pos_before = 0;
2902 regnode * const oscan = scan;
2903 struct regnode_charclass_class this_class;
2904 struct regnode_charclass_class *oclass = NULL;
2905 I32 next_is_eval = 0;
2906
2907 switch (PL_regkind[OP(scan)]) {
2908 case WHILEM: /* End of (?:...)* . */
2909 scan = NEXTOPER(scan);
2910 goto finish;
2911 case PLUS:
2912 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2913 next = NEXTOPER(scan);
2914 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2915 mincount = 1;
2916 maxcount = REG_INFTY;
2917 next = regnext(scan);
2918 scan = NEXTOPER(scan);
2919 goto do_curly;
2920 }
2921 }
2922 if (flags & SCF_DO_SUBSTR)
2923 data->pos_min++;
2924 min++;
2925 /* Fall through. */
2926 case STAR:
2927 if (flags & SCF_DO_STCLASS) {
2928 mincount = 0;
2929 maxcount = REG_INFTY;
2930 next = regnext(scan);
2931 scan = NEXTOPER(scan);
2932 goto do_curly;
2933 }
2934 is_inf = is_inf_internal = 1;
2935 scan = regnext(scan);
c277df42 2936 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2937 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2938 data->longest = &(data->longest_float);
c277df42 2939 }
8aa23a47
YO
2940 goto optimize_curly_tail;
2941 case CURLY:
2942 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2943 && (scan->flags == stopparen))
2944 {
2945 mincount = 1;
2946 maxcount = 1;
2947 } else {
2948 mincount = ARG1(scan);
2949 maxcount = ARG2(scan);
653099ff 2950 }
8aa23a47
YO
2951 next = regnext(scan);
2952 if (OP(scan) == CURLYX) {
2953 I32 lp = (data ? *(data->last_closep) : 0);
2954 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2955 }
8aa23a47
YO
2956 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2957 next_is_eval = (OP(scan) == EVAL);
2958 do_curly:
2959 if (flags & SCF_DO_SUBSTR) {
2960 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2961 pos_before = data->pos_min;
b45f050a 2962 }
8aa23a47
YO
2963 if (data) {
2964 fl = data->flags;
2965 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2966 if (is_inf)
2967 data->flags |= SF_IS_INF;
2968 }
2969 if (flags & SCF_DO_STCLASS) {
2970 cl_init(pRExC_state, &this_class);
2971 oclass = data->start_class;
2972 data->start_class = &this_class;
2973 f |= SCF_DO_STCLASS_AND;
2974 f &= ~SCF_DO_STCLASS_OR;
2975 }
2976 /* These are the cases when once a subexpression
2977 fails at a particular position, it cannot succeed
2978 even after backtracking at the enclosing scope.
2979
2980 XXXX what if minimal match and we are at the
2981 initial run of {n,m}? */
2982 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2983 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2984
8aa23a47
YO
2985 /* This will finish on WHILEM, setting scan, or on NULL: */
2986 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2987 last, data, stopparen, recursed, NULL,
2988 (mincount == 0
2989 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2990
8aa23a47
YO
2991 if (flags & SCF_DO_STCLASS)
2992 data->start_class = oclass;
2993 if (mincount == 0 || minnext == 0) {
2994 if (flags & SCF_DO_STCLASS_OR) {
2995 cl_or(pRExC_state, data->start_class, &this_class);
2996 }
2997 else if (flags & SCF_DO_STCLASS_AND) {
2998 /* Switch to OR mode: cache the old value of
2999 * data->start_class */
3000 INIT_AND_WITHP;
3001 StructCopy(data->start_class, and_withp,
3002 struct regnode_charclass_class);
3003 flags &= ~SCF_DO_STCLASS_AND;
3004 StructCopy(&this_class, data->start_class,
3005 struct regnode_charclass_class);
3006 flags |= SCF_DO_STCLASS_OR;
3007 data->start_class->flags |= ANYOF_EOS;
3008 }
3009 } else { /* Non-zero len */
3010 if (flags & SCF_DO_STCLASS_OR) {
3011 cl_or(pRExC_state, data->start_class, &this_class);
3012 cl_and(data->start_class, and_withp);
3013 }
3014 else if (flags & SCF_DO_STCLASS_AND)
3015 cl_and(data->start_class, &this_class);
3016 flags &= ~SCF_DO_STCLASS;
3017 }
3018 if (!scan) /* It was not CURLYX, but CURLY. */
3019 scan = next;
3020 if ( /* ? quantifier ok, except for (?{ ... }) */
3021 (next_is_eval || !(mincount == 0 && maxcount == 1))
3022 && (minnext == 0) && (deltanext == 0)
3023 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3024 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3025 && ckWARN(WARN_REGEXP))
3026 {
3027 vWARN(RExC_parse,
3028 "Quantifier unexpected on zero-length expression");
3029 }
3030
3031 min += minnext * mincount;
3032 is_inf_internal |= ((maxcount == REG_INFTY
3033 && (minnext + deltanext) > 0)
3034 || deltanext == I32_MAX);
3035 is_inf |= is_inf_internal;
3036 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3037
3038 /* Try powerful optimization CURLYX => CURLYN. */
3039 if ( OP(oscan) == CURLYX && data
3040 && data->flags & SF_IN_PAR
3041 && !(data->flags & SF_HAS_EVAL)
3042 && !deltanext && minnext == 1 ) {
3043 /* Try to optimize to CURLYN. */
3044 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3045 regnode * const nxt1 = nxt;
497b47a8 3046#ifdef DEBUGGING
8aa23a47 3047 regnode *nxt2;
497b47a8 3048#endif
c277df42 3049
8aa23a47
YO
3050 /* Skip open. */
3051 nxt = regnext(nxt);
3052 if (!strchr((const char*)PL_simple,OP(nxt))
3053 && !(PL_regkind[OP(nxt)] == EXACT
3054 && STR_LEN(nxt) == 1))
3055 goto nogo;
497b47a8 3056#ifdef DEBUGGING
8aa23a47 3057 nxt2 = nxt;
497b47a8 3058#endif
8aa23a47
YO
3059 nxt = regnext(nxt);
3060 if (OP(nxt) != CLOSE)
3061 goto nogo;
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3065 }
3066 /* Now we know that nxt2 is the only contents: */
3067 oscan->flags = (U8)ARG(nxt);
3068 OP(oscan) = CURLYN;
3069 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3070
c277df42 3071#ifdef DEBUGGING
8aa23a47
YO
3072 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3073 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3074 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3075 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3076 OP(nxt + 1) = OPTIMIZED; /* was count. */
3077 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3078#endif
8aa23a47
YO
3079 }
3080 nogo:
3081
3082 /* Try optimization CURLYX => CURLYM. */
3083 if ( OP(oscan) == CURLYX && data
3084 && !(data->flags & SF_HAS_PAR)
3085 && !(data->flags & SF_HAS_EVAL)
3086 && !deltanext /* atom is fixed width */
3087 && minnext != 0 /* CURLYM can't handle zero width */
3088 ) {
3089 /* XXXX How to optimize if data == 0? */
3090 /* Optimize to a simpler form. */
3091 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3092 regnode *nxt2;
3093
3094 OP(oscan) = CURLYM;
3095 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3096 && (OP(nxt2) != WHILEM))
3097 nxt = nxt2;
3098 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3099 /* Need to optimize away parenths. */
3100 if (data->flags & SF_IN_PAR) {
3101 /* Set the parenth number. */
3102 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3103
3104 if (OP(nxt) != CLOSE)
3105 FAIL("Panic opt close");
3106 oscan->flags = (U8)ARG(nxt);
3107 if (RExC_open_parens) {
3108 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3109 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3110 }
8aa23a47
YO
3111 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3112 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3113
c277df42 3114#ifdef DEBUGGING
8aa23a47
YO
3115 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3116 OP(nxt + 1) = OPTIMIZED; /* was count. */
3117 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3118 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3119#endif
c277df42 3120#if 0
8aa23a47
YO
3121 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3122 regnode *nnxt = regnext(nxt1);
3123
3124 if (nnxt == nxt) {
3125 if (reg_off_by_arg[OP(nxt1)])
3126 ARG_SET(nxt1, nxt2 - nxt1);
3127 else if (nxt2 - nxt1 < U16_MAX)
3128 NEXT_OFF(nxt1) = nxt2 - nxt1;
3129 else
3130 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3131 }
8aa23a47 3132 nxt1 = nnxt;
c277df42 3133 }
5d1c421c 3134#endif
8aa23a47
YO
3135 /* Optimize again: */
3136 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3137 NULL, stopparen, recursed, NULL, 0,depth+1);
3138 }
3139 else
3140 oscan->flags = 0;
3141 }
3142 else if ((OP(oscan) == CURLYX)
3143 && (flags & SCF_WHILEM_VISITED_POS)
3144 /* See the comment on a similar expression above.
3145 However, this time it not a subexpression
3146 we care about, but the expression itself. */
3147 && (maxcount == REG_INFTY)
3148 && data && ++data->whilem_c < 16) {
3149 /* This stays as CURLYX, we can put the count/of pair. */
3150 /* Find WHILEM (as in regexec.c) */
3151 regnode *nxt = oscan + NEXT_OFF(oscan);
3152
3153 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3154 nxt += ARG(nxt);
3155 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3156 | (RExC_whilem_seen << 4)); /* On WHILEM */
3157 }
3158 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3159 pars++;
3160 if (flags & SCF_DO_SUBSTR) {
3161 SV *last_str = NULL;
3162 int counted = mincount != 0;
a0ed51b3 3163
8aa23a47
YO
3164 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3165#if defined(SPARC64_GCC_WORKAROUND)
3166 I32 b = 0;
3167 STRLEN l = 0;
3168 const char *s = NULL;
3169 I32 old = 0;
b515a41d 3170
8aa23a47
YO
3171 if (pos_before >= data->last_start_min)
3172 b = pos_before;
3173 else
3174 b = data->last_start_min;
b515a41d 3175
8aa23a47
YO
3176 l = 0;
3177 s = SvPV_const(data->last_found, l);
3178 old = b - data->last_start_min;
3179
3180#else
3181 I32 b = pos_before >= data->last_start_min
3182 ? pos_before : data->last_start_min;
3183 STRLEN l;
3184 const char * const s = SvPV_const(data->last_found, l);
3185 I32 old = b - data->last_start_min;
3186#endif
3187
3188 if (UTF)
3189 old = utf8_hop((U8*)s, old) - (U8*)s;
3190
3191 l -= old;
3192 /* Get the added string: */
3193 last_str = newSVpvn(s + old, l);
3194 if (UTF)
3195 SvUTF8_on(last_str);
3196 if (deltanext == 0 && pos_before == b) {
3197 /* What was added is a constant string */
3198 if (mincount > 1) {
3199 SvGROW(last_str, (mincount * l) + 1);
3200 repeatcpy(SvPVX(last_str) + l,
3201 SvPVX_const(last_str), l, mincount - 1);
3202 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3203 /* Add additional parts. */
3204 SvCUR_set(data->last_found,
3205 SvCUR(data->last_found) - l);
3206 sv_catsv(data->last_found, last_str);
3207 {
3208 SV * sv = data->last_found;
3209 MAGIC *mg =
3210 SvUTF8(sv) && SvMAGICAL(sv) ?
3211 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3212 if (mg && mg->mg_len >= 0)
3213 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3214 }
8aa23a47 3215 data->last_end += l * (mincount - 1);
b515a41d 3216 }
8aa23a47
YO
3217 } else {
3218 /* start offset must point into the last copy */
3219 data->last_start_min += minnext * (mincount - 1);
3220 data->last_start_max += is_inf ? I32_MAX
3221 : (maxcount - 1) * (minnext + data->pos_delta);
3222 }
c277df42 3223 }
8aa23a47
YO
3224 /* It is counted once already... */
3225 data->pos_min += minnext * (mincount - counted);
3226 data->pos_delta += - counted * deltanext +
3227 (minnext + deltanext) * maxcount - minnext * mincount;
3228 if (mincount != maxcount) {
3229 /* Cannot extend fixed substrings found inside
3230 the group. */
3231 scan_commit(pRExC_state,data,minlenp);
3232 if (mincount && last_str) {
3233 SV * const sv = data->last_found;
3234 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3235 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3236
3237 if (mg)
3238 mg->mg_len = -1;
3239 sv_setsv(sv, last_str);
3240 data->last_end = data->pos_min;
3241 data->last_start_min =
3242 data->pos_min - CHR_SVLEN(last_str);
3243 data->last_start_max = is_inf
3244 ? I32_MAX
3245 : data->pos_min + data->pos_delta
3246 - CHR_SVLEN(last_str);
3247 }
3248 data->longest = &(data->longest_float);
3249 }
3250 SvREFCNT_dec(last_str);
c277df42 3251 }
8aa23a47
YO
3252 if (data && (fl & SF_HAS_EVAL))
3253 data->flags |= SF_HAS_EVAL;
3254 optimize_curly_tail:
3255 if (OP(oscan) != CURLYX) {
3256 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3257 && NEXT_OFF(next))
3258 NEXT_OFF(oscan) += NEXT_OFF(next);
3259 }
3260 continue;
3261 default: /* REF and CLUMP only? */
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3264 data->longest = &(data->longest_float);
3265 }
3266 is_inf = is_inf_internal = 1;
3267 if (flags & SCF_DO_STCLASS_OR)
3268 cl_anything(pRExC_state, data->start_class);
3269 flags &= ~SCF_DO_STCLASS;
3270 break;
c277df42 3271 }
8aa23a47
YO
3272 }
3273 else if (strchr((const char*)PL_simple,OP(scan))) {
3274 int value = 0;
653099ff 3275
8aa23a47
YO
3276 if (flags & SCF_DO_SUBSTR) {
3277 scan_commit(pRExC_state,data,minlenp);
3278 data->pos_min++;
3279 }
3280 min++;
3281 if (flags & SCF_DO_STCLASS) {
3282 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3283
8aa23a47
YO
3284 /* Some of the logic below assumes that switching
3285 locale on will only add false positives. */
3286 switch (PL_regkind[OP(scan)]) {
3287 case SANY:
3288 default:
3289 do_default:
3290 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3291 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3292 cl_anything(pRExC_state, data->start_class);
3293 break;
3294 case REG_ANY:
3295 if (OP(scan) == SANY)
3296 goto do_default;
3297 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3298 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3299 || (data->start_class->flags & ANYOF_CLASS));
3300 cl_anything(pRExC_state, data->start_class);
653099ff 3301 }
8aa23a47
YO
3302 if (flags & SCF_DO_STCLASS_AND || !value)
3303 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3304 break;
3305 case ANYOF:
3306 if (flags & SCF_DO_STCLASS_AND)
3307 cl_and(data->start_class,
3308 (struct regnode_charclass_class*)scan);
653099ff 3309 else
8aa23a47
YO
3310 cl_or(pRExC_state, data->start_class,
3311 (struct regnode_charclass_class*)scan);
3312 break;
3313 case ALNUM:
3314 if (flags & SCF_DO_STCLASS_AND) {
3315 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3316 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3317 for (value = 0; value < 256; value++)
3318 if (!isALNUM(value))
3319 ANYOF_BITMAP_CLEAR(data->start_class, value);
3320 }
653099ff 3321 }
8aa23a47
YO
3322 else {
3323 if (data->start_class->flags & ANYOF_LOCALE)
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 else {
3326 for (value = 0; value < 256; value++)
3327 if (isALNUM(value))
3328 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3329 }
8aa23a47
YO
3330 }
3331 break;
3332 case ALNUML:
3333 if (flags & SCF_DO_STCLASS_AND) {
3334 if (data->start_class->flags & ANYOF_LOCALE)
3335 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3336 }
3337 else {
3338 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3339 data->start_class->flags |= ANYOF_LOCALE;
3340 }
3341 break;
3342 case NALNUM:
3343 if (flags & SCF_DO_STCLASS_AND) {
3344 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3345 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3346 for (value = 0; value < 256; value++)
3347 if (isALNUM(value))
3348 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3349 }
3350 }
8aa23a47
YO
3351 else {
3352 if (data->start_class->flags & ANYOF_LOCALE)
3353 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3354 else {
3355 for (value = 0; value < 256; value++)
3356 if (!isALNUM(value))
3357 ANYOF_BITMAP_SET(data->start_class, value);
3358 }
653099ff 3359 }
8aa23a47
YO
3360 break;
3361 case NALNUML:
3362 if (flags & SCF_DO_STCLASS_AND) {
3363 if (data->start_class->flags & ANYOF_LOCALE)
3364 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3365 }
8aa23a47
YO
3366 else {
3367 data->start_class->flags |= ANYOF_LOCALE;
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3369 }
3370 break;
3371 case SPACE:
3372 if (flags & SCF_DO_STCLASS_AND) {
3373 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3375 for (value = 0; value < 256; value++)
3376 if (!isSPACE(value))
3377 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3378 }
3379 }
8aa23a47
YO
3380 else {
3381 if (data->start_class->flags & ANYOF_LOCALE)
3382 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3383 else {
3384 for (value = 0; value < 256; value++)
3385 if (isSPACE(value))
3386 ANYOF_BITMAP_SET(data->start_class, value);
3387 }
653099ff 3388 }
8aa23a47
YO
3389 break;
3390 case SPACEL:
3391 if (flags & SCF_DO_STCLASS_AND) {
3392 if (data->start_class->flags & ANYOF_LOCALE)
3393 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3394 }
3395 else {
3396 data->start_class->flags |= ANYOF_LOCALE;
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3398 }
3399 break;
3400 case NSPACE:
3401 if (flags & SCF_DO_STCLASS_AND) {
3402 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3403 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3404 for (value = 0; value < 256; value++)
3405 if (isSPACE(value))
3406 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3407 }
8aa23a47
YO
3408 }
3409 else {
3410 if (data->start_class->flags & ANYOF_LOCALE)
3411 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3412 else {
3413 for (value = 0; value < 256; value++)
3414 if (!isSPACE(value))
3415 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3416 }
3417 }
8aa23a47
YO
3418 break;
3419 case NSPACEL:
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 if (data->start_class->flags & ANYOF_LOCALE) {
3422 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3423 for (value = 0; value < 256; value++)
3424 if (!isSPACE(value))
3425 ANYOF_BITMAP_CLEAR(data->start_class, value);
3426 }
653099ff 3427 }
8aa23a47
YO
3428 else {
3429 data->start_class->flags |= ANYOF_LOCALE;
3430 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3431 }
3432 break;
3433 case DIGIT:
3434 if (flags & SCF_DO_STCLASS_AND) {
3435 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3436 for (value = 0; value < 256; value++)
3437 if (!isDIGIT(value))
3438 ANYOF_BITMAP_CLEAR(data->start_class, value);
3439 }
3440 else {
3441 if (data->start_class->flags & ANYOF_LOCALE)
3442 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3443 else {
3444 for (value = 0; value < 256; value++)
3445 if (isDIGIT(value))
3446 ANYOF_BITMAP_SET(data->start_class, value);
3447 }
3448 }
3449 break;
3450 case NDIGIT:
3451 if (flags & SCF_DO_STCLASS_AND) {
3452 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3453 for (value = 0; value < 256; value++)
3454 if (isDIGIT(value))
3455 ANYOF_BITMAP_CLEAR(data->start_class, value);
3456 }
3457 else {
3458 if (data->start_class->flags & ANYOF_LOCALE)
3459 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3460 else {
3461 for (value = 0; value < 256; value++)
3462 if (!isDIGIT(value))
3463 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3464 }
3465 }
8aa23a47
YO
3466 break;
3467 }
3468 if (flags & SCF_DO_STCLASS_OR)
3469 cl_and(data->start_class, and_withp);
3470 flags &= ~SCF_DO_STCLASS;
3471 }
3472 }
3473 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3474 data->flags |= (OP(scan) == MEOL
3475 ? SF_BEFORE_MEOL
3476 : SF_BEFORE_SEOL);
3477 }
3478 else if ( PL_regkind[OP(scan)] == BRANCHJ
3479 /* Lookbehind, or need to calculate parens/evals/stclass: */
3480 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3481 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3482 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3483 || OP(scan) == UNLESSM )
3484 {
3485 /* Negative Lookahead/lookbehind
3486 In this case we can't do fixed string optimisation.
3487 */
1de06328 3488
8aa23a47
YO
3489 I32 deltanext, minnext, fake = 0;
3490 regnode *nscan;
3491 struct regnode_charclass_class intrnl;
3492 int f = 0;
1de06328 3493
8aa23a47
YO
3494 data_fake.flags = 0;
3495 if (data) {
3496 data_fake.whilem_c = data->whilem_c;
3497 data_fake.last_closep = data->last_closep;
c277df42 3498 }
8aa23a47
YO
3499 else
3500 data_fake.last_closep = &fake;
58e23c8d 3501 data_fake.pos_delta = delta;
8aa23a47
YO
3502 if ( flags & SCF_DO_STCLASS && !scan->flags
3503 && OP(scan) == IFMATCH ) { /* Lookahead */
3504 cl_init(pRExC_state, &intrnl);
3505 data_fake.start_class = &intrnl;
3506 f |= SCF_DO_STCLASS_AND;
3507 }
3508 if (flags & SCF_WHILEM_VISITED_POS)
3509 f |= SCF_WHILEM_VISITED_POS;
3510 next = regnext(scan);
3511 nscan = NEXTOPER(NEXTOPER(scan));
3512 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3513 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3514 if (scan->flags) {
3515 if (deltanext) {
58e23c8d 3516 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3517 }
3518 else if (minnext > (I32)U8_MAX) {
58e23c8d 3519 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3520 }
3521 scan->flags = (U8)minnext;
3522 }
3523 if (data) {
3524 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3525 pars++;
3526 if (data_fake.flags & SF_HAS_EVAL)
3527 data->flags |= SF_HAS_EVAL;
3528 data->whilem_c = data_fake.whilem_c;
3529 }
3530 if (f & SCF_DO_STCLASS_AND) {
3531 const int was = (data->start_class->flags & ANYOF_EOS);
3532
3533 cl_and(data->start_class, &intrnl);
3534 if (was)
3535 data->start_class->flags |= ANYOF_EOS;
3536 }
cb434fcc 3537 }
8aa23a47
YO
3538#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3539 else {
3540 /* Positive Lookahead/lookbehind
3541 In this case we can do fixed string optimisation,
3542 but we must be careful about it. Note in the case of
3543 lookbehind the positions will be offset by the minimum
3544 length of the pattern, something we won't know about
3545 until after the recurse.
3546 */
3547 I32 deltanext, fake = 0;
3548 regnode *nscan;
3549 struct regnode_charclass_class intrnl;
3550 int f = 0;
3551 /* We use SAVEFREEPV so that when the full compile
3552 is finished perl will clean up the allocated
3553 minlens when its all done. This was we don't
3554 have to worry about freeing them when we know
3555 they wont be used, which would be a pain.
3556 */
3557 I32 *minnextp;
3558 Newx( minnextp, 1, I32 );
3559 SAVEFREEPV(minnextp);
3560
3561 if (data) {
3562 StructCopy(data, &data_fake, scan_data_t);
3563 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3564 f |= SCF_DO_SUBSTR;
3565 if (scan->flags)
3566 scan_commit(pRExC_state, &data_fake,minlenp);
3567 data_fake.last_found=newSVsv(data->last_found);
3568 }
3569 }
3570 else
3571 data_fake.last_closep = &fake;
3572 data_fake.flags = 0;
58e23c8d 3573 data_fake.pos_delta = delta;
8aa23a47
YO
3574 if (is_inf)
3575 data_fake.flags |= SF_IS_INF;
3576 if ( flags & SCF_DO_STCLASS && !scan->flags
3577 && OP(scan) == IFMATCH ) { /* Lookahead */
3578 cl_init(pRExC_state, &intrnl);
3579 data_fake.start_class = &intrnl;
3580 f |= SCF_DO_STCLASS_AND;
3581 }
3582 if (flags & SCF_WHILEM_VISITED_POS)
3583 f |= SCF_WHILEM_VISITED_POS;
3584 next = regnext(scan);
3585 nscan = NEXTOPER(NEXTOPER(scan));
3586
3587 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3588 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3589 if (scan->flags) {
3590 if (deltanext) {
58e23c8d 3591 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3592 }
3593 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3594 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3595 }
3596 scan->flags = (U8)*minnextp;
3597 }
3598
3599 *minnextp += min;
3600
3601 if (f & SCF_DO_STCLASS_AND) {
3602 const int was = (data->start_class->flags & ANYOF_EOS);
3603
3604 cl_and(data->start_class, &intrnl);
3605 if (was)
3606 data->start_class->flags |= ANYOF_EOS;
3607 }
3608 if (data) {
3609 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3610 pars++;
3611 if (data_fake.flags & SF_HAS_EVAL)
3612 data->flags |= SF_HAS_EVAL;
3613 data->whilem_c = data_fake.whilem_c;
3614 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3615 if (RExC_rx->minlen<*minnextp)
3616 RExC_rx->minlen=*minnextp;
3617 scan_commit(pRExC_state, &data_fake, minnextp);
3618 SvREFCNT_dec(data_fake.last_found);
3619
3620 if ( data_fake.minlen_fixed != minlenp )
3621 {
3622 data->offset_fixed= data_fake.offset_fixed;
3623 data->minlen_fixed= data_fake.minlen_fixed;
3624 data->lookbehind_fixed+= scan->flags;
3625 }
3626 if ( data_fake.minlen_float != minlenp )
3627 {
3628 data->minlen_float= data_fake.minlen_float;
3629 data->offset_float_min=data_fake.offset_float_min;
3630 data->offset_float_max=data_fake.offset_float_max;
3631 data->lookbehind_float+= scan->flags;
3632 }
3633 }
3634 }
3635
3636
40d049e4 3637 }
8aa23a47
YO
3638#endif
3639 }
3640 else if (OP(scan) == OPEN) {
3641 if (stopparen != (I32)ARG(scan))
3642 pars++;
3643 }
3644 else if (OP(scan) == CLOSE) {
3645 if (stopparen == (I32)ARG(scan)) {
3646 break;
3647 }
3648 if ((I32)ARG(scan) == is_par) {
3649 next = regnext(scan);
b515a41d 3650
8aa23a47
YO
3651 if ( next && (OP(next) != WHILEM) && next < last)
3652 is_par = 0; /* Disable optimization */
40d049e4 3653 }
8aa23a47
YO
3654 if (data)
3655 *(data->last_closep) = ARG(scan);
3656 }
3657 else if (OP(scan) == EVAL) {
c277df42
IZ
3658 if (data)
3659 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3660 }
3661 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3662 if (flags & SCF_DO_SUBSTR) {
3663 scan_commit(pRExC_state,data,minlenp);
3664 flags &= ~SCF_DO_SUBSTR;
40d049e4 3665 }
8aa23a47
YO
3666 if (data && OP(scan)==ACCEPT) {
3667 data->flags |= SCF_SEEN_ACCEPT;
3668 if (stopmin > min)
3669 stopmin = min;
e2e6a0f1 3670 }
8aa23a47
YO
3671 }
3672 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3673 {
0f5d15d6 3674 if (flags & SCF_DO_SUBSTR) {
1de06328 3675 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3676 data->longest = &(data->longest_float);
3677 }
3678 is_inf = is_inf_internal = 1;
653099ff 3679 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3680 cl_anything(pRExC_state, data->start_class);
96776eda 3681 flags &= ~SCF_DO_STCLASS;
8aa23a47 3682 }
58e23c8d 3683 else if (OP(scan) == GPOS) {
bbe252da 3684 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3685 !(delta || is_inf || (data && data->pos_delta)))
3686 {
bbe252da
YO
3687 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3688 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3689 if (RExC_rx->gofs < (U32)min)
3690 RExC_rx->gofs = min;
3691 } else {
bbe252da 3692 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3693 RExC_rx->gofs = 0;
3694 }
3695 }
786e8c11 3696#ifdef TRIE_STUDY_OPT
40d049e4 3697#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3698 else if (PL_regkind[OP(scan)] == TRIE) {
3699 /* NOTE - There is similar code to this block above for handling
3700 BRANCH nodes on the initial study. If you change stuff here
3701 check there too. */
3702 regnode *trie_node= scan;
3703 regnode *tail= regnext(scan);
f8fc2ecf 3704 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3705 I32 max1 = 0, min1 = I32_MAX;
3706 struct regnode_charclass_class accum;
3707
3708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3709 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3710 if (flags & SCF_DO_STCLASS)
3711 cl_init_zero(pRExC_state, &accum);
3712
3713 if (!trie->jump) {
3714 min1= trie->minlen;
3715 max1= trie->maxlen;
3716 } else {
3717 const regnode *nextbranch= NULL;
3718 U32 word;
3719
3720 for ( word=1 ; word <= trie->wordcount ; word++)
3721 {
3722 I32 deltanext=0, minnext=0, f = 0, fake;
3723 struct regnode_charclass_class this_class;
3724
3725 data_fake.flags = 0;
3726 if (data) {
3727 data_fake.whilem_c = data->whilem_c;
3728 data_fake.last_closep = data->last_closep;
3729 }
3730 else
3731 data_fake.last_closep = &fake;
58e23c8d 3732 data_fake.pos_delta = delta;
8aa23a47
YO
3733 if (flags & SCF_DO_STCLASS) {
3734 cl_init(pRExC_state, &this_class);
3735 data_fake.start_class = &this_class;
3736 f = SCF_DO_STCLASS_AND;
3737 }
3738 if (flags & SCF_WHILEM_VISITED_POS)
3739 f |= SCF_WHILEM_VISITED_POS;
3740
3741 if (trie->jump[word]) {
3742 if (!nextbranch)
3743 nextbranch = trie_node + trie->jump[0];
3744 scan= trie_node + trie->jump[word];
3745 /* We go from the jump point to the branch that follows
3746 it. Note this means we need the vestigal unused branches
3747 even though they arent otherwise used.
3748 */
3749 minnext = study_chunk(pRExC_state, &scan, minlenp,
3750 &deltanext, (regnode *)nextbranch, &data_fake,
3751 stopparen, recursed, NULL, f,depth+1);
3752 }
3753 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3754 nextbranch= regnext((regnode*)nextbranch);
3755
3756 if (min1 > (I32)(minnext + trie->minlen))
3757 min1 = minnext + trie->minlen;
3758 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3759 max1 = minnext + deltanext + trie->maxlen;
3760 if (deltanext == I32_MAX)
3761 is_inf = is_inf_internal = 1;
3762
3763 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3764 pars++;
3765 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3766 if ( stopmin > min + min1)
3767 stopmin = min + min1;
3768 flags &= ~SCF_DO_SUBSTR;
3769 if (data)
3770 data->flags |= SCF_SEEN_ACCEPT;
3771 }
3772 if (data) {
3773 if (data_fake.flags & SF_HAS_EVAL)
3774 data->flags |= SF_HAS_EVAL;
3775 data->whilem_c = data_fake.whilem_c;
3776 }
3777 if (flags & SCF_DO_STCLASS)
3778 cl_or(pRExC_state, &accum, &this_class);
3779 }
3780 }
3781 if (flags & SCF_DO_SUBSTR) {
3782 data->pos_min += min1;
3783 data->pos_delta += max1 - min1;
3784 if (max1 != min1 || is_inf)
3785 data->longest = &(data->longest_float);
3786 }
3787 min += min1;
3788 delta += max1 - min1;
3789 if (flags & SCF_DO_STCLASS_OR) {
3790 cl_or(pRExC_state, data->start_class, &accum);
3791 if (min1) {
3792 cl_and(data->start_class, and_withp);
3793 flags &= ~SCF_DO_STCLASS;
3794 }
3795 }
3796 else if (flags & SCF_DO_STCLASS_AND) {
3797 if (min1) {
3798 cl_and(data->start_class, &accum);
3799 flags &= ~SCF_DO_STCLASS;
3800 }
3801 else {
3802 /* Switch to OR mode: cache the old value of
3803 * data->start_class */
3804 INIT_AND_WITHP;
3805 StructCopy(data->start_class, and_withp,
3806 struct regnode_charclass_class);
3807 flags &= ~SCF_DO_STCLASS_AND;
3808 StructCopy(&accum, data->start_class,
3809 struct regnode_charclass_class);
3810 flags |= SCF_DO_STCLASS_OR;
3811 data->start_class->flags |= ANYOF_EOS;
3812 }
3813 }
3814 scan= tail;
3815 continue;
3816 }
786e8c11 3817#else
8aa23a47 3818 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3819 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3820 U8*bang=NULL;
3821
3822 min += trie->minlen;
3823 delta += (trie->maxlen - trie->minlen);
3824 flags &= ~SCF_DO_STCLASS; /* xxx */
3825 if (flags & SCF_DO_SUBSTR) {
3826 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3827 data->pos_min += trie->minlen;
3828 data->pos_delta += (trie->maxlen - trie->minlen);
3829 if (trie->maxlen != trie->minlen)
3830 data->longest = &(data->longest_float);
3831 }
3832 if (trie->jump) /* no more substrings -- for now /grr*/
3833 flags &= ~SCF_DO_SUBSTR;
b515a41d 3834 }
8aa23a47
YO
3835#endif /* old or new */
3836#endif /* TRIE_STUDY_OPT */
3837 /* Else: zero-length, ignore. */
3838 scan = regnext(scan);
3839 }
3840 if (frame) {
3841 last = frame->last;
3842 scan = frame->next;
3843 stopparen = frame->stop;
3844 frame = frame->prev;
3845 goto fake_study_recurse;
c277df42
IZ
3846 }
3847
3848 finish:
8aa23a47
YO
3849 assert(!frame);
3850
c277df42 3851 *scanp = scan;
aca2d497 3852 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3853 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3854 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3855 if (is_par > (I32)U8_MAX)
c277df42
IZ
3856 is_par = 0;
3857 if (is_par && pars==1 && data) {
3858 data->flags |= SF_IN_PAR;
3859 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3860 }
3861 else if (pars && data) {
c277df42
IZ
3862 data->flags |= SF_HAS_PAR;
3863 data->flags &= ~SF_IN_PAR;
3864 }
653099ff 3865 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3866 cl_and(data->start_class, and_withp);
786e8c11
YO
3867 if (flags & SCF_TRIE_RESTUDY)
3868 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3869
3870 DEBUG_STUDYDATA(data,depth);
3871
e2e6a0f1 3872 return min < stopmin ? min : stopmin;
c277df42
IZ
3873}
3874
2eccd3b2
NC
3875STATIC U32
3876S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3877{
4a4e7719
NC
3878 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3879
3880 Renewc(RExC_rxi->data,
3881 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3882 char, struct reg_data);
3883 if(count)
f8fc2ecf 3884 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3885 else
f8fc2ecf 3886 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3887 RExC_rxi->data->count = count + n;
3888 Copy(s, RExC_rxi->data->what + count, n, U8);
3889 return count;
c277df42
IZ
3890}
3891
76234dfb 3892#ifndef PERL_IN_XSUB_RE
d88dccdf 3893void
864dbfa3 3894Perl_reginitcolors(pTHX)
d88dccdf 3895{
97aff369 3896 dVAR;
1df70142 3897 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3898 if (s) {
1df70142
AL
3899 char *t = savepv(s);
3900 int i = 0;
3901 PL_colors[0] = t;
d88dccdf 3902 while (++i < 6) {
1df70142
AL
3903 t = strchr(t, '\t');
3904 if (t) {
3905 *t = '\0';
3906 PL_colors[i] = ++t;
d88dccdf
IZ
3907 }
3908 else
1df70142 3909 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3910 }
3911 } else {
1df70142 3912 int i = 0;
b81d288d 3913 while (i < 6)
06b5626a 3914 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3915 }
3916 PL_colorset = 1;
3917}
76234dfb 3918#endif
8615cb43 3919
07be1b83 3920
786e8c11
YO
3921#ifdef TRIE_STUDY_OPT
3922#define CHECK_RESTUDY_GOTO \
3923 if ( \
3924 (data.flags & SCF_TRIE_RESTUDY) \
3925 && ! restudied++ \
3926 ) goto reStudy
3927#else
3928#define CHECK_RESTUDY_GOTO
3929#endif
f9f4320a 3930
a687059c 3931/*
e50aee73 3932 - pregcomp - compile a regular expression into internal code
a687059c
LW
3933 *
3934 * We can't allocate space until we know how big the compiled form will be,
3935 * but we can't compile it (and thus know how big it is) until we've got a
3936 * place to put the code. So we cheat: we compile it twice, once with code
3937 * generation turned off and size counting turned on, and once "for real".
3938 * This also means that we don't allocate space until we are sure that the
3939 * thing really will compile successfully, and we never have to move the
3940 * code and thus invalidate pointers into it. (Note that it has to be in
3941 * one piece because free() must be able to free it all.) [NB: not true in perl]
3942 *
3943 * Beware that the optimization-preparation code in here knows about some
3944 * of the structure of the compiled regexp. [I'll say.]
3945 */
b9b4dddf
YO
3946
3947
3948
f9f4320a 3949#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3950#define RE_ENGINE_PTR &PL_core_reg_engine
3951#else
f9f4320a
YO
3952extern const struct regexp_engine my_reg_engine;
3953#define RE_ENGINE_PTR &my_reg_engine
3954#endif
b9b4dddf
YO
3955/* these make a few things look better, to avoid indentation */
3956#define BEGIN_BLOCK {
f9f4320a
YO
3957#define END_BLOCK }
3958
a687059c 3959regexp *
864dbfa3 3960Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3961{
97aff369 3962 dVAR;
f9f4320a
YO
3963 GET_RE_DEBUG_FLAGS_DECL;
3964 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3965#ifndef PERL_IN_XSUB_RE
3966 BEGIN_BLOCK
f9f4320a
YO
3967 /* Dispatch a request to compile a regexp to correct
3968 regexp engine. */
3969 HV * const table = GvHV(PL_hintgv);
3970 if (table) {
3971 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3972 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3973 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3974 DEBUG_COMPILE_r({
8d8756e7 3975 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3976 SvIV(*ptr));
3977 });
f2f78491 3978 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3979 }
b9b4dddf
YO
3980 }
3981 END_BLOCK
3982#endif
3983 BEGIN_BLOCK
a0d0e21e 3984 register regexp *r;
f8fc2ecf 3985 register regexp_internal *ri;
c277df42 3986 regnode *scan;
c277df42 3987 regnode *first;
a0d0e21e 3988 I32 flags;
a0d0e21e
LW
3989 I32 minlen = 0;
3990 I32 sawplus = 0;
3991 I32 sawopen = 0;
2c2d71f5 3992 scan_data_t data;
830247a4 3993 RExC_state_t RExC_state;
be8e71aa 3994 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3995#ifdef TRIE_STUDY_OPT
3996 int restudied= 0;
3997 RExC_state_t copyRExC_state;
3998#endif
a0d0e21e 3999 if (exp == NULL)
c277df42 4000 FAIL("NULL regexp argument");
a0d0e21e 4001
a5961de5 4002 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4003
5cfc7842 4004 RExC_precomp = exp;
a3621e74 4005 DEBUG_COMPILE_r({
ab3bbdeb
YO
4006 SV *dsv= sv_newmortal();
4007 RE_PV_QUOTED_DECL(s, RExC_utf8,
4008 dsv, RExC_precomp, (xend - exp), 60);
4009 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4010 PL_colors[4],PL_colors[5],s);
a5961de5 4011 });
e2509266 4012 RExC_flags = pm->op_pmflags;
830247a4 4013 RExC_sawback = 0;
bbce6d69 4014
830247a4
IZ
4015 RExC_seen = 0;
4016 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4017 RExC_seen_evals = 0;
4018 RExC_extralen = 0;
c277df42 4019
bbce6d69 4020 /* First pass: determine size, legality. */
830247a4 4021 RExC_parse = exp;
fac92740 4022 RExC_start = exp;
830247a4
IZ
4023 RExC_end = xend;
4024 RExC_naughty = 0;
4025 RExC_npar = 1;
c74340f9 4026 RExC_cpar = 1;
e2e6a0f1 4027 RExC_nestroot = 0;
830247a4
IZ
4028 RExC_size = 0L;
4029 RExC_emit = &PL_regdummy;
4030 RExC_whilem_seen = 0;
fc8cd66c 4031 RExC_charnames = NULL;
40d049e4
YO
4032 RExC_open_parens = NULL;
4033 RExC_close_parens = NULL;
4034 RExC_opend = NULL;
81714fb9 4035 RExC_paren_names = NULL;
40d049e4
YO
4036 RExC_recurse = NULL;
4037 RExC_recurse_count = 0;
81714fb9 4038
85ddcde9
JH
4039#if 0 /* REGC() is (currently) a NOP at the first pass.
4040 * Clever compilers notice this and complain. --jhi */
830247a4 4041 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4042#endif
3dab1dad
YO
4043 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4044 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4045 RExC_precomp = NULL;
a0d0e21e
LW
4046 return(NULL);
4047 }
07be1b83 4048 DEBUG_PARSE_r({
81714fb9
YO
4049 PerlIO_printf(Perl_debug_log,
4050 "Required size %"IVdf" nodes\n"
4051 "Starting second pass (creation)\n",
4052 (IV)RExC_size);
07be1b83
YO
4053 RExC_lastnum=0;
4054 RExC_lastparse=NULL;
4055 });
c277df42
IZ
4056 /* Small enough for pointer-storage convention?
4057 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4058 if (RExC_size >= 0x10000L && RExC_extralen)
4059 RExC_size += RExC_extralen;
c277df42 4060 else
830247a4
IZ
4061 RExC_extralen = 0;
4062 if (RExC_whilem_seen > 15)
4063 RExC_whilem_seen = 15;
a0d0e21e 4064
e2e6a0f1
YO
4065#ifdef DEBUGGING
4066 /* Make room for a sentinel value at the end of the program */
4067 RExC_size++;
4068#endif
4069
f9f4320a
YO
4070 /* Allocate space and zero-initialize. Note, the two step process
4071 of zeroing when in debug mode, thus anything assigned has to
4072 happen after that */
f8fc2ecf
YO
4073 Newxz(r, 1, regexp);
4074 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4075 char, regexp_internal);
4076 if ( r == NULL || ri == NULL )
b45f050a 4077 FAIL("Regexp out of space");
0f79a09d
GS
4078#ifdef DEBUGGING
4079 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4080 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4081#else
f8fc2ecf
YO
4082 /* bulk initialize base fields with 0. */
4083 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4084#endif
58e23c8d
YO
4085
4086 /* non-zero initialization begins here */
f8fc2ecf 4087 RXi_SET( r, ri );
f9f4320a 4088 r->engine= RE_ENGINE_PTR;
c277df42 4089 r->refcnt = 1;
bbce6d69 4090 r->prelen = xend - exp;
5cfc7842 4091 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4092 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4093 r->intflags = 0;
830247a4 4094 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4095
6bda09f9 4096 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4097 Newxz(RExC_open_parens, RExC_npar,regnode *);
4098 SAVEFREEPV(RExC_open_parens);
4099 Newxz(RExC_close_parens,RExC_npar,regnode *);
4100 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4101 }
4102
4103 /* Useful during FAIL. */
f8fc2ecf
YO
4104 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4105 if (ri->offsets) {
4106 ri->offsets[0] = RExC_size;
fac92740 4107 }
a3621e74 4108 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4109 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4110 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4111 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4112
830247a4 4113 RExC_rx = r;
f8fc2ecf 4114 RExC_rxi = ri;
bbce6d69 4115
4116 /* Second pass: emit code. */
e2509266 4117 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4118 RExC_parse = exp;
4119 RExC_end = xend;
4120 RExC_naughty = 0;
4121 RExC_npar = 1;
c74340f9 4122 RExC_cpar = 1;
f8fc2ecf
YO
4123 RExC_emit_start = ri->program;
4124 RExC_emit = ri->program;
e2e6a0f1
YO
4125#ifdef DEBUGGING
4126 /* put a sentinal on the end of the program so we can check for
4127 overwrites */
f8fc2ecf 4128 ri->program[RExC_size].type = 255;
e2e6a0f1 4129#endif
2cd61cdb 4130 /* Store the count of eval-groups for security checks: */
786e8c11 4131 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4132 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4133 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4134 return(NULL);
6bda09f9 4135
07be1b83
YO
4136 /* XXXX To minimize changes to RE engine we always allocate
4137 3-units-long substrs field. */
4138 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4139 if (RExC_recurse_count) {
4140 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4141 SAVEFREEPV(RExC_recurse);
4142 }
a0d0e21e 4143
07be1b83 4144reStudy:
1de06328 4145 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4146 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4147
07be1b83
YO
4148#ifdef TRIE_STUDY_OPT
4149 if ( restudied ) {
5d458dd8 4150 U32 seen=RExC_seen;
07be1b83 4151 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4152
4153 RExC_state = copyRExC_state;
4154 if (seen & REG_TOP_LEVEL_BRANCHES)
4155 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4156 else
4157 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4158 if (data.last_found) {
07be1b83 4159 SvREFCNT_dec(data.longest_fixed);
07be1b83 4160 SvREFCNT_dec(data.longest_float);
07be1b83 4161 SvREFCNT_dec(data.last_found);
1de06328 4162 }
40d049e4 4163 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4164 } else {
40d049e4 4165 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4166 copyRExC_state = RExC_state;
07be1b83 4167 }
40d049e4
YO
4168#else
4169 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4170#endif
fc8cd66c 4171
a0d0e21e 4172 /* Dig out information for optimizations. */
bbe252da 4173 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4174 pm->op_pmflags = RExC_flags;
a0ed51b3 4175 if (UTF)
bbe252da 4176 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4177 ri->regstclass = NULL;
830247a4 4178 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4179 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4180 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4181
1de06328
YO
4182 /* testing for BRANCH here tells us whether there is "must appear"
4183 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4184 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4185 I32 fake;
c5254dd6 4186 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4187 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4188 int stclass_flag;
07be1b83 4189 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4190
4191 first = scan;
c277df42 4192 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4193 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4194 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4195 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4196 /* for now we can't handle lookbehind IFMATCH*/
4197 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4198 (OP(first) == PLUS) ||
4199 (OP(first) == MINMOD) ||
653099ff 4200 /* An {n,m} with n>0 */
07be1b83
YO
4201 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4202 {
786e8c11 4203
a0d0e21e
LW
4204 if (OP(first) == PLUS)
4205 sawplus = 1;
4206 else
3dab1dad 4207 first += regarglen[OP(first)];
07be1b83
YO
4208 if (OP(first) == IFMATCH) {
4209 first = NEXTOPER(first);
4210 first += EXTRA_STEP_2ARGS;
7c167cea 4211 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4212 first = NEXTOPER(first);
a687059c
LW
4213 }
4214
a0d0e21e
LW
4215 /* Starting-point info. */
4216 again:
786e8c11 4217 DEBUG_PEEP("first:",first,0);
07be1b83 4218 /* Ignore EXACT as we deal with it later. */
3dab1dad 4219 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4220 if (OP(first) == EXACT)
6f207bd3 4221 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4222 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4223 ri->regstclass = first;
b3c9acc1 4224 }
07be1b83 4225#ifdef TRIE_STCLASS
786e8c11 4226 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4227 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4228 {
786e8c11 4229 regnode *trie_op;
07be1b83 4230 /* this can happen only on restudy */
786e8c11 4231 if ( OP(first) == TRIE ) {
446bd890
NC
4232 struct regnode_1 *trieop =
4233 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4234 StructCopy(first,trieop,struct regnode_1);
4235 trie_op=(regnode *)trieop;
4236 } else {
446bd890
NC
4237 struct regnode_charclass *trieop =
4238 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4239 StructCopy(first,trieop,struct regnode_charclass);
4240 trie_op=(regnode *)trieop;
4241 }
1de06328 4242 OP(trie_op)+=2;
786e8c11 4243 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4244 ri->regstclass = trie_op;
07be1b83
YO
4245 }
4246#endif
bfed75c6 4247 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4248 ri->regstclass = first;
3dab1dad
YO
4249 else if (PL_regkind[OP(first)] == BOUND ||
4250 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4251 ri->regstclass = first;
3dab1dad 4252 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4253 r->extflags |= (OP(first) == MBOL
4254 ? RXf_ANCH_MBOL
cad2e5aa 4255 : (OP(first) == SBOL
bbe252da
YO
4256 ? RXf_ANCH_SBOL
4257 : RXf_ANCH_BOL));
a0d0e21e 4258 first = NEXTOPER(first);
774d564b 4259 goto again;
4260 }
4261 else if (OP(first) == GPOS) {
bbe252da 4262 r->extflags |= RXf_ANCH_GPOS;
774d564b 4263 first = NEXTOPER(first);
4264 goto again;
a0d0e21e 4265 }
cf2a2b69
YO
4266 else if ((!sawopen || !RExC_sawback) &&
4267 (OP(first) == STAR &&
3dab1dad 4268 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4269 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4270 {
4271 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4272 const int type =
4273 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4274 ? RXf_ANCH_MBOL
4275 : RXf_ANCH_SBOL;
4276 r->extflags |= type;
4277 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4278 first = NEXTOPER(first);
774d564b 4279 goto again;
a0d0e21e 4280 }
b81d288d 4281 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4282 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4283 /* x+ must match at the 1st pos of run of x's */
bbe252da 4284 r->intflags |= PREGf_SKIP;
a0d0e21e 4285
c277df42 4286 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4287#ifdef TRIE_STUDY_OPT
81714fb9 4288 DEBUG_PARSE_r(
be8e71aa
YO
4289 if (!restudied)
4290 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4291 (IV)(first - scan + 1))
4292 );
4293#else
81714fb9 4294 DEBUG_PARSE_r(
be8e71aa
YO
4295 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4296 (IV)(first - scan + 1))
4297 );
4298#endif
4299
4300
a0d0e21e
LW
4301 /*
4302 * If there's something expensive in the r.e., find the
4303 * longest literal string that must appear and make it the
4304 * regmust. Resolve ties in favor of later strings, since
4305 * the regstart check works with the beginning of the r.e.
4306 * and avoiding duplication strengthens checking. Not a
4307 * strong reason, but sufficient in the absence of others.
4308 * [Now we resolve ties in favor of the earlier string if
c277df42 4309 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4310 * earlier string may buy us something the later one won't.]
4311 */
de8c5301 4312
396482e1
GA
4313 data.longest_fixed = newSVpvs("");
4314 data.longest_float = newSVpvs("");
4315 data.last_found = newSVpvs("");
c277df42
IZ
4316 data.longest = &(data.longest_fixed);
4317 first = scan;
f8fc2ecf 4318 if (!ri->regstclass) {
830247a4 4319 cl_init(pRExC_state, &ch_class);
653099ff
GS
4320 data.start_class = &ch_class;
4321 stclass_flag = SCF_DO_STCLASS_AND;
4322 } else /* XXXX Check for BOUND? */
4323 stclass_flag = 0;
cb434fcc 4324 data.last_closep = &last_close;
de8c5301 4325
1de06328 4326 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4327 &data, -1, NULL, NULL,
4328 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4329
07be1b83 4330
786e8c11
YO
4331 CHECK_RESTUDY_GOTO;
4332
4333
830247a4 4334 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4335 && data.last_start_min == 0 && data.last_end > 0
830247a4 4336 && !RExC_seen_zerolen
bbe252da
YO
4337 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4338 r->extflags |= RXf_CHECK_ALL;
1de06328 4339 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4340 SvREFCNT_dec(data.last_found);
4341
1de06328
YO
4342 /* Note that code very similar to this but for anchored string
4343 follows immediately below, changes may need to be made to both.
4344 Be careful.
4345 */
a0ed51b3 4346 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4347 if (longest_float_length
c277df42
IZ
4348 || (data.flags & SF_FL_BEFORE_EOL
4349 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4350 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4351 {
1182767e 4352 I32 t,ml;
cf93c79d 4353
1de06328 4354 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4355 && data.offset_fixed == data.offset_float_min
4356 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4357 goto remove_float; /* As in (a)+. */
4358
1de06328
YO
4359 /* copy the information about the longest float from the reg_scan_data
4360 over to the program. */
33b8afdf
JH
4361 if (SvUTF8(data.longest_float)) {
4362 r->float_utf8 = data.longest_float;
c445ea15 4363 r->float_substr = NULL;
33b8afdf
JH
4364 } else {
4365 r->float_substr = data.longest_float;
c445ea15 4366 r->float_utf8 = NULL;
33b8afdf 4367 }
1de06328
YO
4368 /* float_end_shift is how many chars that must be matched that
4369 follow this item. We calculate it ahead of time as once the
4370 lookbehind offset is added in we lose the ability to correctly
4371 calculate it.*/
4372 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4373 : (I32)longest_float_length;
1de06328
YO
4374 r->float_end_shift = ml - data.offset_float_min
4375 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4376 + data.lookbehind_float;
4377 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4378 r->float_max_offset = data.offset_float_max;
1182767e 4379 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4380 r->float_max_offset -= data.lookbehind_float;
4381
cf93c79d
IZ
4382 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4383 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4384 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4385 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4386 }
4387 else {
aca2d497 4388 remove_float:
c445ea15 4389 r->float_substr = r->float_utf8 = NULL;
c277df42 4390 SvREFCNT_dec(data.longest_float);
c5254dd6 4391 longest_float_length = 0;
a0d0e21e 4392 }
c277df42 4393
1de06328
YO
4394 /* Note that code very similar to this but for floating string
4395 is immediately above, changes may need to be made to both.
4396 Be careful.
4397 */
a0ed51b3 4398 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4399 if (longest_fixed_length
c277df42
IZ
4400 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4401 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4402 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4403 {
1182767e 4404 I32 t,ml;
cf93c79d 4405
1de06328
YO
4406 /* copy the information about the longest fixed
4407 from the reg_scan_data over to the program. */
33b8afdf
JH
4408 if (SvUTF8(data.longest_fixed)) {
4409 r->anchored_utf8 = data.longest_fixed;
c445ea15 4410 r->anchored_substr = NULL;
33b8afdf
JH
4411 } else {
4412 r->anchored_substr = data.longest_fixed;
c445ea15 4413 r->anchored_utf8 = NULL;
33b8afdf 4414 }
1de06328
YO
4415 /* fixed_end_shift is how many chars that must be matched that
4416 follow this item. We calculate it ahead of time as once the
4417 lookbehind offset is added in we lose the ability to correctly
4418 calculate it.*/
4419 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4420 : (I32)longest_fixed_length;
1de06328
YO
4421 r->anchored_end_shift = ml - data.offset_fixed
4422 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4423 + data.lookbehind_fixed;
4424 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4425
cf93c79d
IZ
4426 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4427 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4428 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4429 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4430 }
4431 else {
c445ea15 4432 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4433 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4434 longest_fixed_length = 0;
a0d0e21e 4435 }
f8fc2ecf
YO
4436 if (ri->regstclass
4437 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4438 ri->regstclass = NULL;
33b8afdf
JH
4439 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4440 && stclass_flag
653099ff 4441 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4442 && !cl_is_anything(data.start_class))
4443 {
2eccd3b2 4444 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4445
f8fc2ecf 4446 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4447 struct regnode_charclass_class);
4448 StructCopy(data.start_class,
f8fc2ecf 4449 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4450 struct regnode_charclass_class);
f8fc2ecf 4451 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4452 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4453 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4454 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4455 PerlIO_printf(Perl_debug_log,
a0288114 4456 "synthetic stclass \"%s\".\n",
3f7c398e 4457 SvPVX_const(sv));});
653099ff 4458 }
c277df42
IZ
4459
4460 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4461 if (longest_fixed_length > longest_float_length) {
1de06328 4462 r->check_end_shift = r->anchored_end_shift;
c277df42 4463 r->check_substr = r->anchored_substr;
33b8afdf 4464 r->check_utf8 = r->anchored_utf8;
c277df42 4465 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4466 if (r->extflags & RXf_ANCH_SINGLE)
4467 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4468 }
4469 else {
1de06328 4470 r->check_end_shift = r->float_end_shift;
c277df42 4471 r->check_substr = r->float_substr;
33b8afdf 4472 r->check_utf8 = r->float_utf8;
1de06328
YO
4473 r->check_offset_min = r->float_min_offset;
4474 r->check_offset_max = r->float_max_offset;
a0d0e21e 4475 }
30382c73
IZ
4476 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4477 This should be changed ASAP! */
bbe252da
YO
4478 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4479 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4480 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4481 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4482 }
1de06328
YO
4483 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4484 if ( (STRLEN)minlen < longest_float_length )
4485 minlen= longest_float_length;
4486 if ( (STRLEN)minlen < longest_fixed_length )
4487 minlen= longest_fixed_length;
4488 */
a0ed51b3
LW
4489 }
4490 else {
c277df42
IZ
4491 /* Several toplevels. Best we can is to set minlen. */
4492 I32 fake;
653099ff 4493 struct regnode_charclass_class ch_class;
cb434fcc 4494 I32 last_close = 0;
c277df42 4495
5d458dd8 4496 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4497
f8fc2ecf 4498 scan = ri->program + 1;
830247a4 4499 cl_init(pRExC_state, &ch_class);
653099ff 4500 data.start_class = &ch_class;
cb434fcc 4501 data.last_closep = &last_close;
07be1b83 4502
de8c5301 4503
1de06328 4504 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4505 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4506
786e8c11 4507 CHECK_RESTUDY_GOTO;
07be1b83 4508
33b8afdf 4509 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4510 = r->float_substr = r->float_utf8 = NULL;
653099ff 4511 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4512 && !cl_is_anything(data.start_class))
4513 {
2eccd3b2 4514 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4515
f8fc2ecf 4516 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4517 struct regnode_charclass_class);
4518 StructCopy(data.start_class,
f8fc2ecf 4519 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4520 struct regnode_charclass_class);
f8fc2ecf 4521 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4522 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4523 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4524 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4525 PerlIO_printf(Perl_debug_log,
a0288114 4526 "synthetic stclass \"%s\".\n",
3f7c398e 4527 SvPVX_const(sv));});
653099ff 4528 }
a0d0e21e
LW
4529 }
4530
1de06328
YO
4531 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4532 the "real" pattern. */
cf9788e3
RGS
4533 DEBUG_OPTIMISE_r({
4534 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4535 (IV)minlen, (IV)r->minlen);
cf9788e3 4536 });
de8c5301 4537 r->minlenret = minlen;
1de06328
YO
4538 if (r->minlen < minlen)
4539 r->minlen = minlen;
4540
b81d288d 4541 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4542 r->extflags |= RXf_GPOS_SEEN;
830247a4 4543 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4544 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4545 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4546 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4547 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4548 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4549 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4550 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4551 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4552 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4553 if (RExC_paren_names)
4554 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4555 else
4556 r->paren_names = NULL;
4557
40d049e4
YO
4558 if (RExC_recurse_count) {
4559 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4560 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4561 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4562 }
4563 }
a02a5408
JC
4564 Newxz(r->startp, RExC_npar, I32);
4565 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4566 /* assume we don't need to swap parens around before we match */
4567
be8e71aa
YO
4568 DEBUG_DUMP_r({
4569 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4570 regdump(r);
4571 });
f8fc2ecf
YO
4572 DEBUG_OFFSETS_r(if (ri->offsets) {
4573 const U32 len = ri->offsets[0];
8e9a8a48
YO
4574 U32 i;
4575 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4576 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4577 for (i = 1; i <= len; i++) {
f8fc2ecf 4578 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4579 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4580 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4581 }
4582 PerlIO_printf(Perl_debug_log, "\n");
4583 });
a0d0e21e 4584 return(r);
f9f4320a 4585 END_BLOCK
a687059c
LW
4586}
4587
f9f4320a
YO
4588#undef CORE_ONLY_BLOCK
4589#undef END_BLOCK
4590#undef RE_ENGINE_PTR
3dab1dad 4591
9af228c6 4592#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4593SV*
4594Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4595{
4596 I32 parno = 0; /* no match */
4597 if (PL_curpm) {
4598 const REGEXP * const rx = PM_GETRE(PL_curpm);
4599 if (rx && rx->paren_names) {
4600 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4601 if (he_str) {
4602 IV i;
4603 SV* sv_dat=HeVAL(he_str);
4604 I32 *nums=(I32*)SvPVX(sv_dat);
4605 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4606 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4607 rx->endp[nums[i]] != -1)
4608 {
4609 parno = nums[i];
4610 break;
4611 }
4612 }
4613 }
4614 }
4615 }
4616 if ( !parno ) {
4617 return 0;
4618 } else {
4619 GV *gv_paren;
4620 SV *sv= sv_newmortal();
4621 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4622 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4623 return GvSVn(gv_paren);
4624 }
4625}
9af228c6 4626#endif
0a4db386 4627
894be9b7 4628/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4629 * If flags is REG_RSN_RETURN_NULL returns null.
4630 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4631 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4632 * to the parsed name as looked up in the RExC_paren_names hash.
4633 * If there is an error throws a vFAIL().. type exception.
894be9b7 4634 */
0a4db386
YO
4635
4636#define REG_RSN_RETURN_NULL 0
4637#define REG_RSN_RETURN_NAME 1
4638#define REG_RSN_RETURN_DATA 2
4639
894be9b7
YO
4640STATIC SV*
4641S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4642 char *name_start = RExC_parse;
0a4db386 4643 if ( UTF ) {
894be9b7 4644 STRLEN numlen;
0a4db386
YO
4645 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4646 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4647 {
4648 RExC_parse += numlen;
4649 }
4650 } else {
4651 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4652 RExC_parse++;
4653 }
0a4db386
YO
4654 if ( flags ) {
4655 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4656 (int)(RExC_parse - name_start)));
894be9b7 4657 if (UTF)
0a4db386
YO
4658 SvUTF8_on(sv_name);
4659 if ( flags == REG_RSN_RETURN_NAME)
4660 return sv_name;
4661 else if (flags==REG_RSN_RETURN_DATA) {
4662 HE *he_str = NULL;
4663 SV *sv_dat = NULL;
4664 if ( ! sv_name ) /* should not happen*/
4665 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4666 if (RExC_paren_names)
4667 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4668 if ( he_str )
4669 sv_dat = HeVAL(he_str);
4670 if ( ! sv_dat )
4671 vFAIL("Reference to nonexistent named group");
4672 return sv_dat;
4673 }
4674 else {
4675 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4676 }
4677 /* NOT REACHED */
894be9b7 4678 }
0a4db386 4679 return NULL;
894be9b7
YO
4680}
4681
3dab1dad
YO
4682#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4683 int rem=(int)(RExC_end - RExC_parse); \
4684 int cut; \
4685 int num; \
4686 int iscut=0; \
4687 if (rem>10) { \
4688 rem=10; \
4689 iscut=1; \
4690 } \
4691 cut=10-rem; \
4692 if (RExC_lastparse!=RExC_parse) \
4693 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4694 rem, RExC_parse, \
4695 cut + 4, \
4696 iscut ? "..." : "<" \
4697 ); \
4698 else \
4699 PerlIO_printf(Perl_debug_log,"%16s",""); \
4700 \
4701 if (SIZE_ONLY) \
4702 num=RExC_size; \
4703 else \
4704 num=REG_NODE_NUM(RExC_emit); \
4705 if (RExC_lastnum!=num) \
0a4db386 4706 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4707 else \
0a4db386 4708 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4709 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4710 (int)((depth*2)), "", \
3dab1dad
YO
4711 (funcname) \
4712 ); \
4713 RExC_lastnum=num; \
4714 RExC_lastparse=RExC_parse; \
4715})
4716
07be1b83
YO
4717
4718
3dab1dad
YO
4719#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4720 DEBUG_PARSE_MSG((funcname)); \
4721 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4722})
6bda09f9
YO
4723#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4724 DEBUG_PARSE_MSG((funcname)); \
4725 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4726})
a687059c
LW
4727/*
4728 - reg - regular expression, i.e. main body or parenthesized thing
4729 *
4730 * Caller must absorb opening parenthesis.
4731 *
4732 * Combining parenthesis handling with the base level of regular expression
4733 * is a trifle forced, but the need to tie the tails of the branches to what
4734 * follows makes it hard to avoid.
4735 */
07be1b83
YO
4736#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4737#ifdef DEBUGGING
4738#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4739#else
4740#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4741#endif
3dab1dad 4742
e2e6a0f1
YO
4743/* this idea is borrowed from STR_WITH_LEN in handy.h */
4744#define CHECK_WORD(s,v,l) \
4745 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4746
76e3520e 4747STATIC regnode *
3dab1dad 4748S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4749 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4750{
27da23d5 4751 dVAR;
c277df42
IZ
4752 register regnode *ret; /* Will be the head of the group. */
4753 register regnode *br;
4754 register regnode *lastbr;
cbbf8932 4755 register regnode *ender = NULL;
a0d0e21e 4756 register I32 parno = 0;
cbbf8932
AL
4757 I32 flags;
4758 const I32 oregflags = RExC_flags;
6136c704
AL
4759 bool have_branch = 0;
4760 bool is_open = 0;
9d1d55b5
JP
4761
4762 /* for (?g), (?gc), and (?o) warnings; warning
4763 about (?c) will warn about (?g) -- japhy */
4764
6136c704
AL
4765#define WASTED_O 0x01
4766#define WASTED_G 0x02
4767#define WASTED_C 0x04
4768#define WASTED_GC (0x02|0x04)
cbbf8932 4769 I32 wastedflags = 0x00;
9d1d55b5 4770
fac92740 4771 char * parse_start = RExC_parse; /* MJD */
a28509cc 4772 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4773
3dab1dad
YO
4774 GET_RE_DEBUG_FLAGS_DECL;
4775 DEBUG_PARSE("reg ");
4776
4777
821b33a5 4778 *flagp = 0; /* Tentatively. */
a0d0e21e 4779
9d1d55b5 4780
a0d0e21e
LW
4781 /* Make an OPEN node, if parenthesized. */
4782 if (paren) {
e2e6a0f1
YO
4783 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4784 char *start_verb = RExC_parse;
4785 STRLEN verb_len = 0;
4786 char *start_arg = NULL;
4787 unsigned char op = 0;
4788 int argok = 1;
4789 int internal_argval = 0; /* internal_argval is only useful if !argok */
4790 while ( *RExC_parse && *RExC_parse != ')' ) {
4791 if ( *RExC_parse == ':' ) {
4792 start_arg = RExC_parse + 1;
4793 break;
4794 }
4795 RExC_parse++;
4796 }
4797 ++start_verb;
4798 verb_len = RExC_parse - start_verb;
4799 if ( start_arg ) {
4800 RExC_parse++;
4801 while ( *RExC_parse && *RExC_parse != ')' )
4802 RExC_parse++;
4803 if ( *RExC_parse != ')' )
4804 vFAIL("Unterminated verb pattern argument");
4805 if ( RExC_parse == start_arg )
4806 start_arg = NULL;
4807 } else {
4808 if ( *RExC_parse != ')' )
4809 vFAIL("Unterminated verb pattern");
4810 }
5d458dd8 4811
e2e6a0f1
YO
4812 switch ( *start_verb ) {
4813 case 'A': /* (*ACCEPT) */
4814 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4815 op = ACCEPT;
4816 internal_argval = RExC_nestroot;
4817 }
4818 break;
4819 case 'C': /* (*COMMIT) */
4820 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4821 op = COMMIT;
e2e6a0f1
YO
4822 break;
4823 case 'F': /* (*FAIL) */
4824 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4825 op = OPFAIL;
4826 argok = 0;
4827 }
4828 break;
5d458dd8
YO
4829 case ':': /* (*:NAME) */
4830 case 'M': /* (*MARK:NAME) */
4831 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4832 op = MARKPOINT;
5d458dd8
YO
4833 argok = -1;
4834 }
4835 break;
4836 case 'P': /* (*PRUNE) */
4837 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4838 op = PRUNE;
e2e6a0f1 4839 break;
5d458dd8
YO
4840 case 'S': /* (*SKIP) */
4841 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4842 op = SKIP;
4843 break;
4844 case 'T': /* (*THEN) */
4845 /* [19:06] <TimToady> :: is then */
4846 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4847 op = CUTGROUP;
4848 RExC_seen |= REG_SEEN_CUTGROUP;
4849 }
e2e6a0f1
YO
4850 break;
4851 }
4852 if ( ! op ) {
4853 RExC_parse++;
4854 vFAIL3("Unknown verb pattern '%.*s'",
4855 verb_len, start_verb);
4856 }
4857 if ( argok ) {
4858 if ( start_arg && internal_argval ) {
4859 vFAIL3("Verb pattern '%.*s' may not have an argument",
4860 verb_len, start_verb);
4861 } else if ( argok < 0 && !start_arg ) {
4862 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4863 verb_len, start_verb);
4864 } else {
4865 ret = reganode(pRExC_state, op, internal_argval);
4866 if ( ! internal_argval && ! SIZE_ONLY ) {
4867 if (start_arg) {
4868 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4869 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4870 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4871 ret->flags = 0;
4872 } else {
4873 ret->flags = 1;
4874 }
4875 }
4876 }
4877 if (!internal_argval)
4878 RExC_seen |= REG_SEEN_VERBARG;
4879 } else if ( start_arg ) {
4880 vFAIL3("Verb pattern '%.*s' may not have an argument",
4881 verb_len, start_verb);
4882 } else {
4883 ret = reg_node(pRExC_state, op);
4884 }
4885 nextchar(pRExC_state);
4886 return ret;
4887 } else
fac92740 4888 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4889 U32 posflags = 0, negflags = 0;
4890 U32 *flagsp = &posflags;
6136c704 4891 bool is_logical = 0;
a28509cc 4892 const char * const seqstart = RExC_parse;
ca9dfc88 4893
830247a4
IZ
4894 RExC_parse++;
4895 paren = *RExC_parse++;
c277df42 4896 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4897 switch (paren) {
894be9b7 4898
fac92740 4899 case '<': /* (?<...) */
b81d288d 4900 if (*RExC_parse == '!')
c277df42 4901 paren = ',';
0a4db386
YO
4902 else if (*RExC_parse != '=')
4903 { /* (?<...>) */
81714fb9 4904 char *name_start;
894be9b7 4905 SV *svname;
81714fb9
YO
4906 paren= '>';
4907 case '\'': /* (?'...') */
4908 name_start= RExC_parse;
0a4db386
YO
4909 svname = reg_scan_name(pRExC_state,
4910 SIZE_ONLY ? /* reverse test from the others */
4911 REG_RSN_RETURN_NAME :
4912 REG_RSN_RETURN_NULL);
81714fb9
YO
4913 if (RExC_parse == name_start)
4914 goto unknown;
4915 if (*RExC_parse != paren)
4916 vFAIL2("Sequence (?%c... not terminated",
4917 paren=='>' ? '<' : paren);
4918 if (SIZE_ONLY) {
e62cc96a
YO
4919 HE *he_str;
4920 SV *sv_dat = NULL;
894be9b7
YO
4921 if (!svname) /* shouldnt happen */
4922 Perl_croak(aTHX_
4923 "panic: reg_scan_name returned NULL");
81714fb9
YO
4924 if (!RExC_paren_names) {
4925 RExC_paren_names= newHV();
4926 sv_2mortal((SV*)RExC_paren_names);
4927 }
4928 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4929 if ( he_str )
81714fb9 4930 sv_dat = HeVAL(he_str);
e62cc96a 4931 if ( ! sv_dat ) {
81714fb9 4932 /* croak baby croak */
e62cc96a
YO
4933 Perl_croak(aTHX_
4934 "panic: paren_name hash element allocation failed");
4935 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4936 IV count=SvIV(sv_dat);
4937 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4938 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4939 pv[count]=RExC_npar;
4940 SvIVX(sv_dat)++;
4941 } else {
4942 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4943 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4944 SvIOK_on(sv_dat);
4945 SvIVX(sv_dat)= 1;
e62cc96a
YO
4946 }
4947
81714fb9
YO
4948 /*sv_dump(sv_dat);*/
4949 }
4950 nextchar(pRExC_state);
4951 paren = 1;
4952 goto capturing_parens;
4953 }
4954 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4955 RExC_parse++;
fac92740
MJD
4956 case '=': /* (?=...) */
4957 case '!': /* (?!...) */
830247a4 4958 RExC_seen_zerolen++;
e2e6a0f1
YO
4959 if (*RExC_parse == ')') {
4960 ret=reg_node(pRExC_state, OPFAIL);
4961 nextchar(pRExC_state);
4962 return ret;
4963 }
fac92740
MJD
4964 case ':': /* (?:...) */
4965 case '>': /* (?>...) */
a0d0e21e 4966 break;
fac92740
MJD
4967 case '$': /* (?$...) */
4968 case '@': /* (?@...) */
8615cb43 4969 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4970 break;
fac92740 4971 case '#': /* (?#...) */
830247a4
IZ
4972 while (*RExC_parse && *RExC_parse != ')')
4973 RExC_parse++;
4974 if (*RExC_parse != ')')
c277df42 4975 FAIL("Sequence (?#... not terminated");
830247a4 4976 nextchar(pRExC_state);
a0d0e21e
LW
4977 *flagp = TRYAGAIN;
4978 return NULL;
894be9b7
YO
4979 case '0' : /* (?0) */
4980 case 'R' : /* (?R) */
4981 if (*RExC_parse != ')')
6bda09f9 4982 FAIL("Sequence (?R) not terminated");
1a147d38 4983 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4984 nextchar(pRExC_state);
4985 return ret;
4986 /*notreached*/
894be9b7
YO
4987 { /* named and numeric backreferences */
4988 I32 num;
4989 char * parse_start;
4990 case '&': /* (?&NAME) */
4991 parse_start = RExC_parse - 1;
4992 {
0a4db386
YO
4993 SV *sv_dat = reg_scan_name(pRExC_state,
4994 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4995 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4996 }
4997 goto gen_recurse_regop;
4998 /* NOT REACHED */
542fa716
YO
4999 case '+':
5000 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5001 RExC_parse++;
5002 vFAIL("Illegal pattern");
5003 }
5004 goto parse_recursion;
5005 /* NOT REACHED*/
5006 case '-': /* (?-1) */
5007 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5008 RExC_parse--; /* rewind to let it be handled later */
5009 goto parse_flags;
5010 }
5011 /*FALLTHROUGH */
6bda09f9
YO
5012 case '1': case '2': case '3': case '4': /* (?1) */
5013 case '5': case '6': case '7': case '8': case '9':
5014 RExC_parse--;
542fa716 5015 parse_recursion:
894be9b7
YO
5016 num = atoi(RExC_parse);
5017 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5018 if (*RExC_parse == '-')
5019 RExC_parse++;
6bda09f9
YO
5020 while (isDIGIT(*RExC_parse))
5021 RExC_parse++;
5022 if (*RExC_parse!=')')
5023 vFAIL("Expecting close bracket");
894be9b7
YO
5024
5025 gen_recurse_regop:
542fa716
YO
5026 if ( paren == '-' ) {
5027 /*
5028 Diagram of capture buffer numbering.
5029 Top line is the normal capture buffer numbers
5030 Botton line is the negative indexing as from
5031 the X (the (?-2))
5032
5033 + 1 2 3 4 5 X 6 7
5034 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5035 - 5 4 3 2 1 X x x
5036
5037 */
5038 num = RExC_npar + num;
5039 if (num < 1) {
5040 RExC_parse++;
5041 vFAIL("Reference to nonexistent group");
5042 }
5043 } else if ( paren == '+' ) {
5044 num = RExC_npar + num - 1;
5045 }
5046
1a147d38 5047 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5048 if (!SIZE_ONLY) {
5049 if (num > (I32)RExC_rx->nparens) {
5050 RExC_parse++;
5051 vFAIL("Reference to nonexistent group");
5052 }
40d049e4 5053 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5054 RExC_emit++;
226de585 5055 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5056 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5057 } else {
6bda09f9 5058 RExC_size++;
6bda09f9 5059 }
0a4db386 5060 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5061 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5062 Set_Node_Offset(ret, parse_start); /* MJD */
5063
6bda09f9
YO
5064 nextchar(pRExC_state);
5065 return ret;
894be9b7
YO
5066 } /* named and numeric backreferences */
5067 /* NOT REACHED */
5068
fac92740 5069 case 'p': /* (?p...) */
9014280d 5070 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5071 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5072 /* FALL THROUGH*/
fac92740 5073 case '?': /* (??...) */
6136c704 5074 is_logical = 1;
438a3801
YST
5075 if (*RExC_parse != '{')
5076 goto unknown;
830247a4 5077 paren = *RExC_parse++;
0f5d15d6 5078 /* FALL THROUGH */
fac92740 5079 case '{': /* (?{...}) */
c277df42 5080 {
2eccd3b2
NC
5081 I32 count = 1;
5082 U32 n = 0;
c277df42 5083 char c;
830247a4 5084 char *s = RExC_parse;
c277df42 5085
830247a4
IZ
5086 RExC_seen_zerolen++;
5087 RExC_seen |= REG_SEEN_EVAL;
5088 while (count && (c = *RExC_parse)) {
6136c704
AL
5089 if (c == '\\') {
5090 if (RExC_parse[1])
5091 RExC_parse++;
5092 }
b81d288d 5093 else if (c == '{')
c277df42 5094 count++;
b81d288d 5095 else if (c == '}')
c277df42 5096 count--;
830247a4 5097 RExC_parse++;
c277df42 5098 }
6136c704 5099 if (*RExC_parse != ')') {
b81d288d 5100 RExC_parse = s;
b45f050a
JF
5101 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5102 }
c277df42 5103 if (!SIZE_ONLY) {
f3548bdc 5104 PAD *pad;
6136c704
AL
5105 OP_4tree *sop, *rop;
5106 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5107
569233ed
SB
5108 ENTER;
5109 Perl_save_re_context(aTHX);
f3548bdc 5110 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5111 sop->op_private |= OPpREFCOUNTED;
5112 /* re_dup will OpREFCNT_inc */
5113 OpREFCNT_set(sop, 1);
569233ed 5114 LEAVE;
c277df42 5115
830247a4 5116 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5117 RExC_rxi->data->data[n] = (void*)rop;
5118 RExC_rxi->data->data[n+1] = (void*)sop;
5119 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5120 SvREFCNT_dec(sv);
a0ed51b3 5121 }
e24b16f9 5122 else { /* First pass */
830247a4 5123 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5124 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5125 /* No compiled RE interpolated, has runtime
5126 components ===> unsafe. */
5127 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5128 if (PL_tainting && PL_tainted)
cc6b7395 5129 FAIL("Eval-group in insecure regular expression");
54df2634 5130#if PERL_VERSION > 8
923e4eb5 5131 if (IN_PERL_COMPILETIME)
b5c19bd7 5132 PL_cv_has_eval = 1;
54df2634 5133#endif
c277df42 5134 }
b5c19bd7 5135
830247a4 5136 nextchar(pRExC_state);
6136c704 5137 if (is_logical) {
830247a4 5138 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5139 if (!SIZE_ONLY)
5140 ret->flags = 2;
3dab1dad 5141 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5142 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5143 return ret;
5144 }
ccb2c380
MP
5145 ret = reganode(pRExC_state, EVAL, n);
5146 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5147 Set_Node_Offset(ret, parse_start);
5148 return ret;
c277df42 5149 }
fac92740 5150 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5151 {
0a4db386 5152 int is_define= 0;
fac92740 5153 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5154 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5155 || RExC_parse[1] == '<'
830247a4 5156 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5157 I32 flag;
5158
830247a4 5159 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5160 if (!SIZE_ONLY)
5161 ret->flags = 1;
3dab1dad 5162 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5163 goto insert_if;
b81d288d 5164 }
a0ed51b3 5165 }
0a4db386
YO
5166 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5167 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5168 {
5169 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5170 char *name_start= RExC_parse++;
2eccd3b2 5171 U32 num = 0;
0a4db386
YO
5172 SV *sv_dat=reg_scan_name(pRExC_state,
5173 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5174 if (RExC_parse == name_start || *RExC_parse != ch)
5175 vFAIL2("Sequence (?(%c... not terminated",
5176 (ch == '>' ? '<' : ch));
5177 RExC_parse++;
5178 if (!SIZE_ONLY) {
5179 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5180 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5181 SvREFCNT_inc(sv_dat);
5182 }
5183 ret = reganode(pRExC_state,NGROUPP,num);
5184 goto insert_if_check_paren;
5185 }
5186 else if (RExC_parse[0] == 'D' &&
5187 RExC_parse[1] == 'E' &&
5188 RExC_parse[2] == 'F' &&
5189 RExC_parse[3] == 'I' &&
5190 RExC_parse[4] == 'N' &&
5191 RExC_parse[5] == 'E')
5192 {
5193 ret = reganode(pRExC_state,DEFINEP,0);
5194 RExC_parse +=6 ;
5195 is_define = 1;
5196 goto insert_if_check_paren;
5197 }
5198 else if (RExC_parse[0] == 'R') {
5199 RExC_parse++;
5200 parno = 0;
5201 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5202 parno = atoi(RExC_parse++);
5203 while (isDIGIT(*RExC_parse))
5204 RExC_parse++;
5205 } else if (RExC_parse[0] == '&') {
5206 SV *sv_dat;
5207 RExC_parse++;
5208 sv_dat = reg_scan_name(pRExC_state,
5209 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5210 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5211 }
1a147d38 5212 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5213 goto insert_if_check_paren;
5214 }
830247a4 5215 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5216 /* (?(1)...) */
6136c704 5217 char c;
830247a4 5218 parno = atoi(RExC_parse++);
c277df42 5219
830247a4
IZ
5220 while (isDIGIT(*RExC_parse))
5221 RExC_parse++;
fac92740 5222 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5223
0a4db386 5224 insert_if_check_paren:
830247a4 5225 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5226 vFAIL("Switch condition not recognized");
c277df42 5227 insert_if:
3dab1dad
YO
5228 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5229 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5230 if (br == NULL)
830247a4 5231 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5232 else
3dab1dad 5233 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5234 c = *nextchar(pRExC_state);
d1b80229
IZ
5235 if (flags&HASWIDTH)
5236 *flagp |= HASWIDTH;
c277df42 5237 if (c == '|') {
0a4db386
YO
5238 if (is_define)
5239 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5240 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5241 regbranch(pRExC_state, &flags, 1,depth+1);
5242 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5243 if (flags&HASWIDTH)
5244 *flagp |= HASWIDTH;
830247a4 5245 c = *nextchar(pRExC_state);
a0ed51b3
LW
5246 }
5247 else
c277df42
IZ
5248 lastbr = NULL;
5249 if (c != ')')
8615cb43 5250 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5251 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5252 REGTAIL(pRExC_state, br, ender);
c277df42 5253 if (lastbr) {
3dab1dad
YO
5254 REGTAIL(pRExC_state, lastbr, ender);
5255 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5256 }
5257 else
3dab1dad 5258 REGTAIL(pRExC_state, ret, ender);
c277df42 5259 return ret;
a0ed51b3
LW
5260 }
5261 else {
830247a4 5262 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5263 }
5264 }
1b1626e4 5265 case 0:
830247a4 5266 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5267 vFAIL("Sequence (? incomplete");
1b1626e4 5268 break;
a0d0e21e 5269 default:
830247a4 5270 --RExC_parse;
fac92740 5271 parse_flags: /* (?i) */
830247a4 5272 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5273 /* (?g), (?gc) and (?o) are useless here
5274 and must be globally applied -- japhy */
5275
5276 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5277 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5278 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5279 if (! (wastedflags & wflagbit) ) {
5280 wastedflags |= wflagbit;
5281 vWARN5(
5282 RExC_parse + 1,
5283 "Useless (%s%c) - %suse /%c modifier",
5284 flagsp == &negflags ? "?-" : "?",
5285 *RExC_parse,
5286 flagsp == &negflags ? "don't " : "",
5287 *RExC_parse
5288 );
5289 }
5290 }
5291 }
5292 else if (*RExC_parse == 'c') {
5293 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5294 if (! (wastedflags & WASTED_C) ) {
5295 wastedflags |= WASTED_GC;
9d1d55b5
JP
5296 vWARN3(
5297 RExC_parse + 1,
5298 "Useless (%sc) - %suse /gc modifier",
5299 flagsp == &negflags ? "?-" : "?",
5300 flagsp == &negflags ? "don't " : ""
5301 );
5302 }
5303 }
5304 }
5305 else { pmflag(flagsp, *RExC_parse); }
5306
830247a4 5307 ++RExC_parse;
ca9dfc88 5308 }
830247a4 5309 if (*RExC_parse == '-') {
ca9dfc88 5310 flagsp = &negflags;
9d1d55b5 5311 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5312 ++RExC_parse;
ca9dfc88 5313 goto parse_flags;
48c036b1 5314 }
e2509266
JH
5315 RExC_flags |= posflags;
5316 RExC_flags &= ~negflags;
830247a4
IZ
5317 if (*RExC_parse == ':') {
5318 RExC_parse++;
ca9dfc88
IZ
5319 paren = ':';
5320 break;
5321 }
c277df42 5322 unknown:
830247a4
IZ
5323 if (*RExC_parse != ')') {
5324 RExC_parse++;
5325 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5326 }
830247a4 5327 nextchar(pRExC_state);
a0d0e21e
LW
5328 *flagp = TRYAGAIN;
5329 return NULL;
5330 }
5331 }
fac92740 5332 else { /* (...) */
81714fb9 5333 capturing_parens:
830247a4
IZ
5334 parno = RExC_npar;
5335 RExC_npar++;
e2e6a0f1 5336
830247a4 5337 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5338 if (!SIZE_ONLY ){
5339 if (!RExC_nestroot)
5340 RExC_nestroot = parno;
5341 if (RExC_seen & REG_SEEN_RECURSE) {
5342 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5343 "Setting open paren #%"IVdf" to %d\n",
5344 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5345 RExC_open_parens[parno-1]= ret;
5346 }
6bda09f9 5347 }
fac92740
MJD
5348 Set_Node_Length(ret, 1); /* MJD */
5349 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5350 is_open = 1;
a0d0e21e 5351 }
a0ed51b3 5352 }
fac92740 5353 else /* ! paren */
a0d0e21e
LW
5354 ret = NULL;
5355
5356 /* Pick up the branches, linking them together. */
fac92740 5357 parse_start = RExC_parse; /* MJD */
3dab1dad 5358 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5359 /* branch_len = (paren != 0); */
2af232bd 5360
a0d0e21e
LW
5361 if (br == NULL)
5362 return(NULL);
830247a4
IZ
5363 if (*RExC_parse == '|') {
5364 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5365 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5366 }
fac92740 5367 else { /* MJD */
6bda09f9 5368 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5369 Set_Node_Length(br, paren != 0);
5370 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5371 }
c277df42
IZ
5372 have_branch = 1;
5373 if (SIZE_ONLY)
830247a4 5374 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5375 }
5376 else if (paren == ':') {
c277df42
IZ
5377 *flagp |= flags&SIMPLE;
5378 }
6136c704 5379 if (is_open) { /* Starts with OPEN. */
3dab1dad 5380 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5381 }
5382 else if (paren != '?') /* Not Conditional */
a0d0e21e 5383 ret = br;
32a0ca98 5384 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5385 lastbr = br;
830247a4
IZ
5386 while (*RExC_parse == '|') {
5387 if (!SIZE_ONLY && RExC_extralen) {
5388 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5389 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5390 }
5391 if (SIZE_ONLY)
830247a4
IZ
5392 RExC_extralen += 2; /* Account for LONGJMP. */
5393 nextchar(pRExC_state);
3dab1dad 5394 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5395
a687059c 5396 if (br == NULL)
a0d0e21e 5397 return(NULL);
3dab1dad 5398 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5399 lastbr = br;
821b33a5
IZ
5400 if (flags&HASWIDTH)
5401 *flagp |= HASWIDTH;
a687059c 5402 *flagp |= flags&SPSTART;
a0d0e21e
LW
5403 }
5404
c277df42
IZ
5405 if (have_branch || paren != ':') {
5406 /* Make a closing node, and hook it on the end. */
5407 switch (paren) {
5408 case ':':
830247a4 5409 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5410 break;
5411 case 1:
c74340f9 5412 RExC_cpar++;
830247a4 5413 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5414 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5415 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5416 "Setting close paren #%"IVdf" to %d\n",
5417 (IV)parno, REG_NODE_NUM(ender)));
5418 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5419 if (RExC_nestroot == parno)
5420 RExC_nestroot = 0;
40d049e4 5421 }
fac92740
MJD
5422 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5423 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5424 break;
5425 case '<':
c277df42
IZ
5426 case ',':
5427 case '=':
5428 case '!':
c277df42 5429 *flagp &= ~HASWIDTH;
821b33a5
IZ
5430 /* FALL THROUGH */
5431 case '>':
830247a4 5432 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5433 break;
5434 case 0:
830247a4 5435 ender = reg_node(pRExC_state, END);
40d049e4
YO
5436 if (!SIZE_ONLY) {
5437 assert(!RExC_opend); /* there can only be one! */
5438 RExC_opend = ender;
5439 }
c277df42
IZ
5440 break;
5441 }
eaf3ca90 5442 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5443
9674d46a 5444 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5445 if (depth==1)
5446 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5447
c277df42 5448 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5449 for (br = ret; br; br = regnext(br)) {
5450 const U8 op = PL_regkind[OP(br)];
5451 if (op == BRANCH) {
07be1b83 5452 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5453 }
5454 else if (op == BRANCHJ) {
07be1b83 5455 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5456 }
c277df42
IZ
5457 }
5458 }
a0d0e21e 5459 }
c277df42
IZ
5460
5461 {
e1ec3a88
AL
5462 const char *p;
5463 static const char parens[] = "=!<,>";
c277df42
IZ
5464
5465 if (paren && (p = strchr(parens, paren))) {
eb160463 5466 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5467 int flag = (p - parens) > 1;
5468
5469 if (paren == '>')
5470 node = SUSPEND, flag = 0;
6bda09f9 5471 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5472 Set_Node_Cur_Length(ret);
5473 Set_Node_Offset(ret, parse_start + 1);
c277df42 5474 ret->flags = flag;
07be1b83 5475 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5476 }
a0d0e21e
LW
5477 }
5478
5479 /* Check for proper termination. */
ce3e6498 5480 if (paren) {
e2509266 5481 RExC_flags = oregflags;
830247a4
IZ
5482 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5483 RExC_parse = oregcomp_parse;
380a0633 5484 vFAIL("Unmatched (");
ce3e6498 5485 }
a0ed51b3 5486 }
830247a4
IZ
5487 else if (!paren && RExC_parse < RExC_end) {
5488 if (*RExC_parse == ')') {
5489 RExC_parse++;
380a0633 5490 vFAIL("Unmatched )");
a0ed51b3
LW
5491 }
5492 else
b45f050a 5493 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5494 /* NOTREACHED */
5495 }
a687059c 5496
a0d0e21e 5497 return(ret);
a687059c
LW
5498}
5499
5500/*
5501 - regbranch - one alternative of an | operator
5502 *
5503 * Implements the concatenation operator.
5504 */
76e3520e 5505STATIC regnode *
3dab1dad 5506S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5507{
97aff369 5508 dVAR;
c277df42
IZ
5509 register regnode *ret;
5510 register regnode *chain = NULL;
5511 register regnode *latest;
5512 I32 flags = 0, c = 0;
3dab1dad
YO
5513 GET_RE_DEBUG_FLAGS_DECL;
5514 DEBUG_PARSE("brnc");
b81d288d 5515 if (first)
c277df42
IZ
5516 ret = NULL;
5517 else {
b81d288d 5518 if (!SIZE_ONLY && RExC_extralen)
830247a4 5519 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5520 else {
830247a4 5521 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5522 Set_Node_Length(ret, 1);
5523 }
c277df42
IZ
5524 }
5525
b81d288d 5526 if (!first && SIZE_ONLY)
830247a4 5527 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5528
c277df42 5529 *flagp = WORST; /* Tentatively. */
a0d0e21e 5530
830247a4
IZ
5531 RExC_parse--;
5532 nextchar(pRExC_state);
5533 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5534 flags &= ~TRYAGAIN;
3dab1dad 5535 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5536 if (latest == NULL) {
5537 if (flags & TRYAGAIN)
5538 continue;
5539 return(NULL);
a0ed51b3
LW
5540 }
5541 else if (ret == NULL)
c277df42 5542 ret = latest;
a0d0e21e 5543 *flagp |= flags&HASWIDTH;
c277df42 5544 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5545 *flagp |= flags&SPSTART;
5546 else {
830247a4 5547 RExC_naughty++;
3dab1dad 5548 REGTAIL(pRExC_state, chain, latest);
a687059c 5549 }
a0d0e21e 5550 chain = latest;
c277df42
IZ
5551 c++;
5552 }
5553 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5554 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5555 if (ret == NULL)
5556 ret = chain;
5557 }
5558 if (c == 1) {
5559 *flagp |= flags&SIMPLE;
a0d0e21e 5560 }
a687059c 5561
d4c19fe8 5562 return ret;
a687059c
LW
5563}
5564
5565/*
5566 - regpiece - something followed by possible [*+?]
5567 *
5568 * Note that the branching code sequences used for ? and the general cases
5569 * of * and + are somewhat optimized: they use the same NOTHING node as
5570 * both the endmarker for their branch list and the body of the last branch.
5571 * It might seem that this node could be dispensed with entirely, but the
5572 * endmarker role is not redundant.
5573 */
76e3520e 5574STATIC regnode *
3dab1dad 5575S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5576{
97aff369 5577 dVAR;
c277df42 5578 register regnode *ret;
a0d0e21e
LW
5579 register char op;
5580 register char *next;
5581 I32 flags;
1df70142 5582 const char * const origparse = RExC_parse;
a0d0e21e 5583 I32 min;
c277df42 5584 I32 max = REG_INFTY;
fac92740 5585 char *parse_start;
10edeb5d 5586 const char *maxpos = NULL;
3dab1dad
YO
5587 GET_RE_DEBUG_FLAGS_DECL;
5588 DEBUG_PARSE("piec");
a0d0e21e 5589
3dab1dad 5590 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5591 if (ret == NULL) {
5592 if (flags & TRYAGAIN)
5593 *flagp |= TRYAGAIN;
5594 return(NULL);
5595 }
5596
830247a4 5597 op = *RExC_parse;
a0d0e21e 5598
830247a4 5599 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5600 maxpos = NULL;
fac92740 5601 parse_start = RExC_parse; /* MJD */
830247a4 5602 next = RExC_parse + 1;
a0d0e21e
LW
5603 while (isDIGIT(*next) || *next == ',') {
5604 if (*next == ',') {
5605 if (maxpos)
5606 break;
5607 else
5608 maxpos = next;
a687059c 5609 }
a0d0e21e
LW
5610 next++;
5611 }
5612 if (*next == '}') { /* got one */
5613 if (!maxpos)
5614 maxpos = next;
830247a4
IZ
5615 RExC_parse++;
5616 min = atoi(RExC_parse);
a0d0e21e
LW
5617 if (*maxpos == ',')
5618 maxpos++;
5619 else
830247a4 5620 maxpos = RExC_parse;
a0d0e21e
LW
5621 max = atoi(maxpos);
5622 if (!max && *maxpos != '0')
c277df42
IZ
5623 max = REG_INFTY; /* meaning "infinity" */
5624 else if (max >= REG_INFTY)
8615cb43 5625 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5626 RExC_parse = next;
5627 nextchar(pRExC_state);
a0d0e21e
LW
5628
5629 do_curly:
5630 if ((flags&SIMPLE)) {
830247a4 5631 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5632 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5633 Set_Node_Offset(ret, parse_start+1); /* MJD */
5634 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5635 }
5636 else {
3dab1dad 5637 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5638
5639 w->flags = 0;
3dab1dad 5640 REGTAIL(pRExC_state, ret, w);
830247a4 5641 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5642 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5643 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5644 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5645 }
6bda09f9 5646 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5647 /* MJD hk */
5648 Set_Node_Offset(ret, parse_start+1);
2af232bd 5649 Set_Node_Length(ret,
fac92740 5650 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5651
830247a4 5652 if (!SIZE_ONLY && RExC_extralen)
c277df42 5653 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5654 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5655 if (SIZE_ONLY)
830247a4
IZ
5656 RExC_whilem_seen++, RExC_extralen += 3;
5657 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5658 }
c277df42 5659 ret->flags = 0;
a0d0e21e
LW
5660
5661 if (min > 0)
821b33a5
IZ
5662 *flagp = WORST;
5663 if (max > 0)
5664 *flagp |= HASWIDTH;
a0d0e21e 5665 if (max && max < min)
8615cb43 5666 vFAIL("Can't do {n,m} with n > m");
c277df42 5667 if (!SIZE_ONLY) {
eb160463
GS
5668 ARG1_SET(ret, (U16)min);
5669 ARG2_SET(ret, (U16)max);
a687059c 5670 }
a687059c 5671
a0d0e21e 5672 goto nest_check;
a687059c 5673 }
a0d0e21e 5674 }
a687059c 5675
a0d0e21e
LW
5676 if (!ISMULT1(op)) {
5677 *flagp = flags;
a687059c 5678 return(ret);
a0d0e21e 5679 }
bb20fd44 5680
c277df42 5681#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5682
5683 /* if this is reinstated, don't forget to put this back into perldiag:
5684
5685 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5686
5687 (F) The part of the regexp subject to either the * or + quantifier
5688 could match an empty string. The {#} shows in the regular
5689 expression about where the problem was discovered.
5690
5691 */
5692
bb20fd44 5693 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5694 vFAIL("Regexp *+ operand could be empty");
b81d288d 5695#endif
bb20fd44 5696
fac92740 5697 parse_start = RExC_parse;
830247a4 5698 nextchar(pRExC_state);
a0d0e21e 5699
821b33a5 5700 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5701
5702 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5703 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5704 ret->flags = 0;
830247a4 5705 RExC_naughty += 4;
a0d0e21e
LW
5706 }
5707 else if (op == '*') {
5708 min = 0;
5709 goto do_curly;
a0ed51b3
LW
5710 }
5711 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5712 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5713 ret->flags = 0;
830247a4 5714 RExC_naughty += 3;
a0d0e21e
LW
5715 }
5716 else if (op == '+') {
5717 min = 1;
5718 goto do_curly;
a0ed51b3
LW
5719 }
5720 else if (op == '?') {
a0d0e21e
LW
5721 min = 0; max = 1;
5722 goto do_curly;
5723 }
5724 nest_check:
041457d9 5725 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5726 vWARN3(RExC_parse,
b45f050a 5727 "%.*s matches null string many times",
afd78fd5 5728 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5729 origparse);
a0d0e21e
LW
5730 }
5731
b9b4dddf 5732 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5733 nextchar(pRExC_state);
6bda09f9 5734 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5735 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5736 }
b9b4dddf
YO
5737#ifndef REG_ALLOW_MINMOD_SUSPEND
5738 else
5739#endif
5740 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5741 regnode *ender;
5742 nextchar(pRExC_state);
5743 ender = reg_node(pRExC_state, SUCCEED);
5744 REGTAIL(pRExC_state, ret, ender);
5745 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5746 ret->flags = 0;
5747 ender = reg_node(pRExC_state, TAIL);
5748 REGTAIL(pRExC_state, ret, ender);
5749 /*ret= ender;*/
5750 }
5751
5752 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5753 RExC_parse++;
b45f050a
JF
5754 vFAIL("Nested quantifiers");
5755 }
a0d0e21e
LW
5756
5757 return(ret);
a687059c
LW
5758}
5759
fc8cd66c
YO
5760
5761/* reg_namedseq(pRExC_state,UVp)
5762
5763 This is expected to be called by a parser routine that has
5764 recognized'\N' and needs to handle the rest. RExC_parse is
5765 expected to point at the first char following the N at the time
5766 of the call.
5767
5768 If valuep is non-null then it is assumed that we are parsing inside
5769 of a charclass definition and the first codepoint in the resolved
5770 string is returned via *valuep and the routine will return NULL.
5771 In this mode if a multichar string is returned from the charnames
5772 handler a warning will be issued, and only the first char in the
5773 sequence will be examined. If the string returned is zero length
5774 then the value of *valuep is undefined and NON-NULL will
5775 be returned to indicate failure. (This will NOT be a valid pointer
5776 to a regnode.)
5777
5778 If value is null then it is assumed that we are parsing normal text
5779 and inserts a new EXACT node into the program containing the resolved
5780 string and returns a pointer to the new node. If the string is
5781 zerolength a NOTHING node is emitted.
5782
5783 On success RExC_parse is set to the char following the endbrace.
5784 Parsing failures will generate a fatal errorvia vFAIL(...)
5785
5786 NOTE: We cache all results from the charnames handler locally in
5787 the RExC_charnames hash (created on first use) to prevent a charnames
5788 handler from playing silly-buggers and returning a short string and
5789 then a long string for a given pattern. Since the regexp program
5790 size is calculated during an initial parse this would result
5791 in a buffer overrun so we cache to prevent the charname result from
5792 changing during the course of the parse.
5793
5794 */
5795STATIC regnode *
5796S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5797{
5798 char * name; /* start of the content of the name */
5799 char * endbrace; /* endbrace following the name */
5800 SV *sv_str = NULL;
5801 SV *sv_name = NULL;
5802 STRLEN len; /* this has various purposes throughout the code */
5803 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5804 regnode *ret = NULL;
5805
5806 if (*RExC_parse != '{') {
5807 vFAIL("Missing braces on \\N{}");
5808 }
5809 name = RExC_parse+1;
5810 endbrace = strchr(RExC_parse, '}');
5811 if ( ! endbrace ) {
5812 RExC_parse++;
5813 vFAIL("Missing right brace on \\N{}");
5814 }
5815 RExC_parse = endbrace + 1;
5816
5817
5818 /* RExC_parse points at the beginning brace,
5819 endbrace points at the last */
5820 if ( name[0]=='U' && name[1]=='+' ) {
5821 /* its a "unicode hex" notation {U+89AB} */
5822 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5823 | PERL_SCAN_DISALLOW_PREFIX
5824 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5825 UV cp;
196f1508 5826 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5827 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5828 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5829 cp = 0xFFFD;
5830 }
5831 if (cp > 0xff)
5832 RExC_utf8 = 1;
5833 if ( valuep ) {
5834 *valuep = cp;
5835 return NULL;
5836 }
5837 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5838 } else {
5839 /* fetch the charnames handler for this scope */
5840 HV * const table = GvHV(PL_hintgv);
5841 SV **cvp= table ?
5842 hv_fetchs(table, "charnames", FALSE) :
5843 NULL;
5844 SV *cv= cvp ? *cvp : NULL;
5845 HE *he_str;
5846 int count;
5847 /* create an SV with the name as argument */
5848 sv_name = newSVpvn(name, endbrace - name);
5849
5850 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5851 vFAIL2("Constant(\\N{%s}) unknown: "
5852 "(possibly a missing \"use charnames ...\")",
5853 SvPVX(sv_name));
5854 }
5855 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5856 vFAIL2("Constant(\\N{%s}): "
5857 "$^H{charnames} is not defined",SvPVX(sv_name));
5858 }
5859
5860
5861
5862 if (!RExC_charnames) {
5863 /* make sure our cache is allocated */
5864 RExC_charnames = newHV();
6bda09f9 5865 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5866 }
5867 /* see if we have looked this one up before */
5868 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5869 if ( he_str ) {
5870 sv_str = HeVAL(he_str);
5871 cached = 1;
5872 } else {
5873 dSP ;
5874
5875 ENTER ;
5876 SAVETMPS ;
5877 PUSHMARK(SP) ;
5878
5879 XPUSHs(sv_name);
5880
5881 PUTBACK ;
5882
5883 count= call_sv(cv, G_SCALAR);
5884
5885 if (count == 1) { /* XXXX is this right? dmq */
5886 sv_str = POPs;
5887 SvREFCNT_inc_simple_void(sv_str);
5888 }
5889
5890 SPAGAIN ;
5891 PUTBACK ;
5892 FREETMPS ;
5893 LEAVE ;
5894
5895 if ( !sv_str || !SvOK(sv_str) ) {
5896 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5897 "did not return a defined value",SvPVX(sv_name));
5898 }
5899 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5900 cached = 1;
5901 }
5902 }
5903 if (valuep) {
5904 char *p = SvPV(sv_str, len);
5905 if (len) {
5906 STRLEN numlen = 1;
5907 if ( SvUTF8(sv_str) ) {
196f1508 5908 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5909 if (*valuep > 0x7F)
5910 RExC_utf8 = 1;
5911 /* XXXX
5912 We have to turn on utf8 for high bit chars otherwise
5913 we get failures with
5914
5915 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5916 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5917
5918 This is different from what \x{} would do with the same
5919 codepoint, where the condition is > 0xFF.
5920 - dmq
5921 */
5922
5923
5924 } else {
5925 *valuep = (UV)*p;
5926 /* warn if we havent used the whole string? */
5927 }
5928 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5929 vWARN2(RExC_parse,
5930 "Ignoring excess chars from \\N{%s} in character class",
5931 SvPVX(sv_name)
5932 );
5933 }
5934 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5935 vWARN2(RExC_parse,
5936 "Ignoring zero length \\N{%s} in character class",
5937 SvPVX(sv_name)
5938 );
5939 }
5940 if (sv_name)
5941 SvREFCNT_dec(sv_name);
5942 if (!cached)
5943 SvREFCNT_dec(sv_str);
5944 return len ? NULL : (regnode *)&len;
5945 } else if(SvCUR(sv_str)) {
5946
5947 char *s;
5948 char *p, *pend;
5949 STRLEN charlen = 1;
5950 char * parse_start = name-3; /* needed for the offsets */
5951 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5952
5953 ret = reg_node(pRExC_state,
5954 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5955 s= STRING(ret);
5956
5957 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5958 sv_utf8_upgrade(sv_str);
5959 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5960 RExC_utf8= 1;
5961 }
5962
5963 p = SvPV(sv_str, len);
5964 pend = p + len;
5965 /* len is the length written, charlen is the size the char read */
5966 for ( len = 0; p < pend; p += charlen ) {
5967 if (UTF) {
196f1508 5968 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5969 if (FOLD) {
5970 STRLEN foldlen,numlen;
5971 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5972 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5973 /* Emit all the Unicode characters. */
5974
5975 for (foldbuf = tmpbuf;
5976 foldlen;
5977 foldlen -= numlen)
5978 {
5979 uvc = utf8_to_uvchr(foldbuf, &numlen);
5980 if (numlen > 0) {
5981 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5982 s += unilen;
5983 len += unilen;
5984 /* In EBCDIC the numlen
5985 * and unilen can differ. */
5986 foldbuf += numlen;
5987 if (numlen >= foldlen)
5988 break;
5989 }
5990 else
5991 break; /* "Can't happen." */
5992 }
5993 } else {
5994 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5995 if (unilen > 0) {
5996 s += unilen;
5997 len += unilen;
5998 }
5999 }
6000 } else {
6001 len++;
6002 REGC(*p, s++);
6003 }
6004 }
6005 if (SIZE_ONLY) {
6006 RExC_size += STR_SZ(len);
6007 } else {
6008 STR_LEN(ret) = len;
6009 RExC_emit += STR_SZ(len);
6010 }
6011 Set_Node_Cur_Length(ret); /* MJD */
6012 RExC_parse--;
6013 nextchar(pRExC_state);
6014 } else {
6015 ret = reg_node(pRExC_state,NOTHING);
6016 }
6017 if (!cached) {
6018 SvREFCNT_dec(sv_str);
6019 }
6020 if (sv_name) {
6021 SvREFCNT_dec(sv_name);
6022 }
6023 return ret;
6024
6025}
6026
6027
9e08bc66
TS
6028/*
6029 * reg_recode
6030 *
6031 * It returns the code point in utf8 for the value in *encp.
6032 * value: a code value in the source encoding
6033 * encp: a pointer to an Encode object
6034 *
6035 * If the result from Encode is not a single character,
6036 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6037 */
6038STATIC UV
6039S_reg_recode(pTHX_ const char value, SV **encp)
6040{
6041 STRLEN numlen = 1;
6042 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6043 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6044 : SvPVX(sv);
6045 const STRLEN newlen = SvCUR(sv);
6046 UV uv = UNICODE_REPLACEMENT;
6047
6048 if (newlen)
6049 uv = SvUTF8(sv)
6050 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6051 : *(U8*)s;
6052
6053 if (!newlen || numlen != newlen) {
6054 uv = UNICODE_REPLACEMENT;
6055 if (encp)
6056 *encp = NULL;
6057 }
6058 return uv;
6059}
6060
fc8cd66c 6061
a687059c
LW
6062/*
6063 - regatom - the lowest level
6064 *
6065 * Optimization: gobbles an entire sequence of ordinary characters so that
6066 * it can turn them into a single node, which is smaller to store and
6067 * faster to run. Backslashed characters are exceptions, each becoming a
6068 * separate node; the code is simpler that way and it's not worth fixing.
6069 *
7f6f358c
YO
6070 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6071 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6072 */
76e3520e 6073STATIC regnode *
3dab1dad 6074S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6075{
97aff369 6076 dVAR;
cbbf8932 6077 register regnode *ret = NULL;
a0d0e21e 6078 I32 flags;
45948336 6079 char *parse_start = RExC_parse;
3dab1dad
YO
6080 GET_RE_DEBUG_FLAGS_DECL;
6081 DEBUG_PARSE("atom");
a0d0e21e
LW
6082 *flagp = WORST; /* Tentatively. */
6083
6084tryagain:
830247a4 6085 switch (*RExC_parse) {
a0d0e21e 6086 case '^':
830247a4
IZ
6087 RExC_seen_zerolen++;
6088 nextchar(pRExC_state);
bbe252da 6089 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6090 ret = reg_node(pRExC_state, MBOL);
bbe252da 6091 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6092 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6093 else
830247a4 6094 ret = reg_node(pRExC_state, BOL);
fac92740 6095 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6096 break;
6097 case '$':
830247a4 6098 nextchar(pRExC_state);
b81d288d 6099 if (*RExC_parse)
830247a4 6100 RExC_seen_zerolen++;
bbe252da 6101 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6102 ret = reg_node(pRExC_state, MEOL);
bbe252da 6103 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6104 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6105 else
830247a4 6106 ret = reg_node(pRExC_state, EOL);
fac92740 6107 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6108 break;
6109 case '.':
830247a4 6110 nextchar(pRExC_state);
bbe252da 6111 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6112 ret = reg_node(pRExC_state, SANY);
6113 else
6114 ret = reg_node(pRExC_state, REG_ANY);
6115 *flagp |= HASWIDTH|SIMPLE;
830247a4 6116 RExC_naughty++;
fac92740 6117 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6118 break;
6119 case '[':
b45f050a 6120 {
3dab1dad
YO
6121 char * const oregcomp_parse = ++RExC_parse;
6122 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6123 if (*RExC_parse != ']') {
6124 RExC_parse = oregcomp_parse;
b45f050a
JF
6125 vFAIL("Unmatched [");
6126 }
830247a4 6127 nextchar(pRExC_state);
a0d0e21e 6128 *flagp |= HASWIDTH|SIMPLE;
fac92740 6129 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6130 break;
b45f050a 6131 }
a0d0e21e 6132 case '(':
830247a4 6133 nextchar(pRExC_state);
3dab1dad 6134 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6135 if (ret == NULL) {
bf93d4cc 6136 if (flags & TRYAGAIN) {
830247a4 6137 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6138 /* Make parent create an empty node if needed. */
6139 *flagp |= TRYAGAIN;
6140 return(NULL);
6141 }
a0d0e21e 6142 goto tryagain;
bf93d4cc 6143 }
a0d0e21e
LW
6144 return(NULL);
6145 }
c277df42 6146 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6147 break;
6148 case '|':
6149 case ')':
6150 if (flags & TRYAGAIN) {
6151 *flagp |= TRYAGAIN;
6152 return NULL;
6153 }
b45f050a 6154 vFAIL("Internal urp");
a0d0e21e
LW
6155 /* Supposed to be caught earlier. */
6156 break;
85afd4ae 6157 case '{':
830247a4
IZ
6158 if (!regcurly(RExC_parse)) {
6159 RExC_parse++;
85afd4ae
CS
6160 goto defchar;
6161 }
6162 /* FALL THROUGH */
a0d0e21e
LW
6163 case '?':
6164 case '+':
6165 case '*':
830247a4 6166 RExC_parse++;
b45f050a 6167 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6168 break;
6169 case '\\':
830247a4 6170 switch (*++RExC_parse) {
a0d0e21e 6171 case 'A':
830247a4
IZ
6172 RExC_seen_zerolen++;
6173 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6174 *flagp |= SIMPLE;
830247a4 6175 nextchar(pRExC_state);
fac92740 6176 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6177 break;
6178 case 'G':
830247a4
IZ
6179 ret = reg_node(pRExC_state, GPOS);
6180 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6181 *flagp |= SIMPLE;
830247a4 6182 nextchar(pRExC_state);
fac92740 6183 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6184 break;
6185 case 'Z':
830247a4 6186 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6187 *flagp |= SIMPLE;
a1917ab9 6188 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6189 nextchar(pRExC_state);
a0d0e21e 6190 break;
b85d18e9 6191 case 'z':
830247a4 6192 ret = reg_node(pRExC_state, EOS);
b85d18e9 6193 *flagp |= SIMPLE;
830247a4
IZ
6194 RExC_seen_zerolen++; /* Do not optimize RE away */
6195 nextchar(pRExC_state);
fac92740 6196 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6197 break;
4a2d328f 6198 case 'C':
f33976b4
DB
6199 ret = reg_node(pRExC_state, CANY);
6200 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6201 *flagp |= HASWIDTH|SIMPLE;
830247a4 6202 nextchar(pRExC_state);
fac92740 6203 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6204 break;
6205 case 'X':
830247a4 6206 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6207 *flagp |= HASWIDTH;
830247a4 6208 nextchar(pRExC_state);
fac92740 6209 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6210 break;
a0d0e21e 6211 case 'w':
eb160463 6212 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6213 *flagp |= HASWIDTH|SIMPLE;
830247a4 6214 nextchar(pRExC_state);
fac92740 6215 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6216 break;
6217 case 'W':
eb160463 6218 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6219 *flagp |= HASWIDTH|SIMPLE;
830247a4 6220 nextchar(pRExC_state);
fac92740 6221 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6222 break;
6223 case 'b':
830247a4
IZ
6224 RExC_seen_zerolen++;
6225 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6226 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6227 *flagp |= SIMPLE;
830247a4 6228 nextchar(pRExC_state);
fac92740 6229 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6230 break;
6231 case 'B':
830247a4
IZ
6232 RExC_seen_zerolen++;
6233 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6234 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6235 *flagp |= SIMPLE;
830247a4 6236 nextchar(pRExC_state);
fac92740 6237 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6238 break;
6239 case 's':
eb160463 6240 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6241 *flagp |= HASWIDTH|SIMPLE;
830247a4 6242 nextchar(pRExC_state);
fac92740 6243 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6244 break;
6245 case 'S':
eb160463 6246 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6247 *flagp |= HASWIDTH|SIMPLE;
830247a4 6248 nextchar(pRExC_state);
fac92740 6249 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6250 break;
6251 case 'd':
ffc61ed2 6252 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6253 *flagp |= HASWIDTH|SIMPLE;
830247a4 6254 nextchar(pRExC_state);
fac92740 6255 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6256 break;
6257 case 'D':
ffc61ed2 6258 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6259 *flagp |= HASWIDTH|SIMPLE;
830247a4 6260 nextchar(pRExC_state);
fac92740 6261 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6262 break;
a14b48bc
LW
6263 case 'p':
6264 case 'P':
3568d838 6265 {
3dab1dad 6266 char* const oldregxend = RExC_end;
ccb2c380 6267 char* parse_start = RExC_parse - 2;
a14b48bc 6268
830247a4 6269 if (RExC_parse[1] == '{') {
3568d838 6270 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6271 RExC_end = strchr(RExC_parse, '}');
6272 if (!RExC_end) {
3dab1dad 6273 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6274 RExC_parse += 2;
6275 RExC_end = oldregxend;
0da60cf5 6276 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6277 }
830247a4 6278 RExC_end++;
a14b48bc 6279 }
af6f566e 6280 else {
830247a4 6281 RExC_end = RExC_parse + 2;
af6f566e
HS
6282 if (RExC_end > oldregxend)
6283 RExC_end = oldregxend;
6284 }
830247a4 6285 RExC_parse--;
a14b48bc 6286
3dab1dad 6287 ret = regclass(pRExC_state,depth+1);
a14b48bc 6288
830247a4
IZ
6289 RExC_end = oldregxend;
6290 RExC_parse--;
ccb2c380
MP
6291
6292 Set_Node_Offset(ret, parse_start + 2);
6293 Set_Node_Cur_Length(ret);
830247a4 6294 nextchar(pRExC_state);
a14b48bc
LW
6295 *flagp |= HASWIDTH|SIMPLE;
6296 }
6297 break;
fc8cd66c
YO
6298 case 'N':
6299 /* Handle \N{NAME} here and not below because it can be
6300 multicharacter. join_exact() will join them up later on.
6301 Also this makes sure that things like /\N{BLAH}+/ and
6302 \N{BLAH} being multi char Just Happen. dmq*/
6303 ++RExC_parse;
6304 ret= reg_namedseq(pRExC_state, NULL);
6305 break;
0a4db386 6306 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6307 {
6308 char ch= RExC_parse[1];
6309 if (ch != '<' && ch != '\'') {
6310 if (SIZE_ONLY)
6311 vWARN( RExC_parse + 1,
6312 "Possible broken named back reference treated as literal k");
6313 parse_start--;
6314 goto defchar;
6315 } else {
6316 char* name_start = (RExC_parse += 2);
2eccd3b2 6317 U32 num = 0;
0a4db386
YO
6318 SV *sv_dat = reg_scan_name(pRExC_state,
6319 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6320 ch= (ch == '<') ? '>' : '\'';
6321
81714fb9
YO
6322 if (RExC_parse == name_start || *RExC_parse != ch)
6323 vFAIL2("Sequence \\k%c... not terminated",
6324 (ch == '>' ? '<' : ch));
6325
6326 RExC_sawback = 1;
6327 ret = reganode(pRExC_state,
6328 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6329 num);
6330 *flagp |= HASWIDTH;
6331
6332
6333 if (!SIZE_ONLY) {
81714fb9
YO
6334 num = add_data( pRExC_state, 1, "S" );
6335 ARG_SET(ret,num);
f8fc2ecf 6336 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6337 SvREFCNT_inc(sv_dat);
6338 }
6339 /* override incorrect value set in reganode MJD */
6340 Set_Node_Offset(ret, parse_start+1);
6341 Set_Node_Cur_Length(ret); /* MJD */
6342 nextchar(pRExC_state);
6343
6344 }
6345 break;
6346 }
a0d0e21e
LW
6347 case 'n':
6348 case 'r':
6349 case 't':
6350 case 'f':
6351 case 'e':
6352 case 'a':
6353 case 'x':
6354 case 'c':
6355 case '0':
6356 goto defchar;
c74340f9 6357 case 'R':
a0d0e21e
LW
6358 case '1': case '2': case '3': case '4':
6359 case '5': case '6': case '7': case '8': case '9':
6360 {
c74340f9
YO
6361 I32 num;
6362 bool isrel=(*RExC_parse=='R');
6363 if (isrel)
6364 RExC_parse++;
6365 num = atoi(RExC_parse);
6366 if (isrel) {
6367 num = RExC_cpar - num;
6368 if (num < 1)
6369 vFAIL("Reference to nonexistent or unclosed group");
6370 }
830247a4 6371 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6372 goto defchar;
6373 else {
3dab1dad 6374 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6375 while (isDIGIT(*RExC_parse))
6376 RExC_parse++;
b45f050a 6377
c74340f9
YO
6378 if (!SIZE_ONLY) {
6379 if (num > (I32)RExC_rx->nparens)
6380 vFAIL("Reference to nonexistent group");
6381 /* People make this error all the time apparently.
6382 So we cant fail on it, even though we should
6383
6384 else if (num >= RExC_cpar)
6385 vFAIL("Reference to unclosed group will always match");
6386 */
6387 }
830247a4 6388 RExC_sawback = 1;
eb160463
GS
6389 ret = reganode(pRExC_state,
6390 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6391 num);
a0d0e21e 6392 *flagp |= HASWIDTH;
2af232bd 6393
fac92740 6394 /* override incorrect value set in reganode MJD */
2af232bd 6395 Set_Node_Offset(ret, parse_start+1);
fac92740 6396 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6397 RExC_parse--;
6398 nextchar(pRExC_state);
a0d0e21e
LW
6399 }
6400 }
6401 break;
6402 case '\0':
830247a4 6403 if (RExC_parse >= RExC_end)
b45f050a 6404 FAIL("Trailing \\");
a0d0e21e
LW
6405 /* FALL THROUGH */
6406 default:
a0288114 6407 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6408 back into the quick-grab loop below */
45948336 6409 parse_start--;
a0d0e21e
LW
6410 goto defchar;
6411 }
6412 break;
4633a7c4
LW
6413
6414 case '#':
bbe252da 6415 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6416 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6417 RExC_parse++;
830247a4 6418 if (RExC_parse < RExC_end)
4633a7c4
LW
6419 goto tryagain;
6420 }
6421 /* FALL THROUGH */
6422
a0d0e21e 6423 default: {
ba210ebe 6424 register STRLEN len;
58ae7d3f 6425 register UV ender;
a0d0e21e 6426 register char *p;
3dab1dad 6427 char *s;
80aecb99 6428 STRLEN foldlen;
89ebb4a3 6429 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6430
6431 parse_start = RExC_parse - 1;
a0d0e21e 6432
830247a4 6433 RExC_parse++;
a0d0e21e
LW
6434
6435 defchar:
58ae7d3f 6436 ender = 0;
eb160463
GS
6437 ret = reg_node(pRExC_state,
6438 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6439 s = STRING(ret);
830247a4
IZ
6440 for (len = 0, p = RExC_parse - 1;
6441 len < 127 && p < RExC_end;
a0d0e21e
LW
6442 len++)
6443 {
3dab1dad 6444 char * const oldp = p;
5b5a24f7 6445
bbe252da 6446 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6447 p = regwhite(p, RExC_end);
a0d0e21e
LW
6448 switch (*p) {
6449 case '^':
6450 case '$':
6451 case '.':
6452 case '[':
6453 case '(':
6454 case ')':
6455 case '|':
6456 goto loopdone;
6457 case '\\':
6458 switch (*++p) {
6459 case 'A':
1ed8eac0
JF
6460 case 'C':
6461 case 'X':
a0d0e21e
LW
6462 case 'G':
6463 case 'Z':
b85d18e9 6464 case 'z':
a0d0e21e
LW
6465 case 'w':
6466 case 'W':
6467 case 'b':
6468 case 'B':
6469 case 's':
6470 case 'S':
6471 case 'd':
6472 case 'D':
a14b48bc
LW
6473 case 'p':
6474 case 'P':
fc8cd66c 6475 case 'N':
c74340f9 6476 case 'R':
a0d0e21e
LW
6477 --p;
6478 goto loopdone;
6479 case 'n':
6480 ender = '\n';
6481 p++;
a687059c 6482 break;
a0d0e21e
LW
6483 case 'r':
6484 ender = '\r';
6485 p++;
a687059c 6486 break;
a0d0e21e
LW
6487 case 't':
6488 ender = '\t';
6489 p++;
a687059c 6490 break;
a0d0e21e
LW
6491 case 'f':
6492 ender = '\f';
6493 p++;
a687059c 6494 break;
a0d0e21e 6495 case 'e':
c7f1f016 6496 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6497 p++;
a687059c 6498 break;
a0d0e21e 6499 case 'a':
c7f1f016 6500 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6501 p++;
a687059c 6502 break;
a0d0e21e 6503 case 'x':
a0ed51b3 6504 if (*++p == '{') {
1df70142 6505 char* const e = strchr(p, '}');
b81d288d 6506
b45f050a 6507 if (!e) {
830247a4 6508 RExC_parse = p + 1;
b45f050a
JF
6509 vFAIL("Missing right brace on \\x{}");
6510 }
de5f0749 6511 else {
a4c04bdc
NC
6512 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6513 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6514 STRLEN numlen = e - p - 1;
53305cf1 6515 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6516 if (ender > 0xff)
6517 RExC_utf8 = 1;
a0ed51b3
LW
6518 p = e + 1;
6519 }
a0ed51b3
LW
6520 }
6521 else {
a4c04bdc 6522 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6523 STRLEN numlen = 2;
53305cf1 6524 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6525 p += numlen;
6526 }
9e08bc66
TS
6527 if (PL_encoding && ender < 0x100)
6528 goto recode_encoding;
a687059c 6529 break;
a0d0e21e
LW
6530 case 'c':
6531 p++;
bbce6d69 6532 ender = UCHARAT(p++);
6533 ender = toCTRL(ender);
a687059c 6534 break;
a0d0e21e
LW
6535 case '0': case '1': case '2': case '3':case '4':
6536 case '5': case '6': case '7': case '8':case '9':
6537 if (*p == '0' ||
830247a4 6538 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6539 I32 flags = 0;
1df70142 6540 STRLEN numlen = 3;
53305cf1 6541 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6542 p += numlen;
6543 }
6544 else {
6545 --p;
6546 goto loopdone;
a687059c 6547 }
9e08bc66
TS
6548 if (PL_encoding && ender < 0x100)
6549 goto recode_encoding;
6550 break;
6551 recode_encoding:
6552 {
6553 SV* enc = PL_encoding;
6554 ender = reg_recode((const char)(U8)ender, &enc);
6555 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6556 vWARN(p, "Invalid escape in the specified encoding");
6557 RExC_utf8 = 1;
6558 }
a687059c 6559 break;
a0d0e21e 6560 case '\0':
830247a4 6561 if (p >= RExC_end)
b45f050a 6562 FAIL("Trailing \\");
a687059c 6563 /* FALL THROUGH */
a0d0e21e 6564 default:
041457d9 6565 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6566 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6567 goto normal_default;
a0d0e21e
LW
6568 }
6569 break;
a687059c 6570 default:
a0ed51b3 6571 normal_default:
fd400ab9 6572 if (UTF8_IS_START(*p) && UTF) {
1df70142 6573 STRLEN numlen;
5e12f4fb 6574 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6575 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6576 p += numlen;
6577 }
6578 else
6579 ender = *p++;
a0d0e21e 6580 break;
a687059c 6581 }
bbe252da 6582 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6583 p = regwhite(p, RExC_end);
60a8b682
JH
6584 if (UTF && FOLD) {
6585 /* Prime the casefolded buffer. */
ac7e0132 6586 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6587 }
a0d0e21e
LW
6588 if (ISMULT2(p)) { /* Back off on ?+*. */
6589 if (len)
6590 p = oldp;
16ea2a2e 6591 else if (UTF) {
80aecb99 6592 if (FOLD) {
60a8b682 6593 /* Emit all the Unicode characters. */
1df70142 6594 STRLEN numlen;
80aecb99
JH
6595 for (foldbuf = tmpbuf;
6596 foldlen;
6597 foldlen -= numlen) {
6598 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6599 if (numlen > 0) {
71207a34 6600 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6601 s += unilen;
6602 len += unilen;
6603 /* In EBCDIC the numlen
6604 * and unilen can differ. */
9dc45d57 6605 foldbuf += numlen;
47654450
JH
6606 if (numlen >= foldlen)
6607 break;
9dc45d57
JH
6608 }
6609 else
6610 break; /* "Can't happen." */
80aecb99
JH
6611 }
6612 }
6613 else {
71207a34 6614 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6615 if (unilen > 0) {
0ebc6274
JH
6616 s += unilen;
6617 len += unilen;
9dc45d57 6618 }
80aecb99 6619 }
a0ed51b3 6620 }
a0d0e21e
LW
6621 else {
6622 len++;
eb160463 6623 REGC((char)ender, s++);
a0d0e21e
LW
6624 }
6625 break;
a687059c 6626 }
16ea2a2e 6627 if (UTF) {
80aecb99 6628 if (FOLD) {
60a8b682 6629 /* Emit all the Unicode characters. */
1df70142 6630 STRLEN numlen;
80aecb99
JH
6631 for (foldbuf = tmpbuf;
6632 foldlen;
6633 foldlen -= numlen) {
6634 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6635 if (numlen > 0) {
71207a34 6636 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6637 len += unilen;
6638 s += unilen;
6639 /* In EBCDIC the numlen
6640 * and unilen can differ. */
9dc45d57 6641 foldbuf += numlen;
47654450
JH
6642 if (numlen >= foldlen)
6643 break;
9dc45d57
JH
6644 }
6645 else
6646 break;
80aecb99
JH
6647 }
6648 }
6649 else {
71207a34 6650 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6651 if (unilen > 0) {
0ebc6274
JH
6652 s += unilen;
6653 len += unilen;
9dc45d57 6654 }
80aecb99
JH
6655 }
6656 len--;
a0ed51b3
LW
6657 }
6658 else
eb160463 6659 REGC((char)ender, s++);
a0d0e21e
LW
6660 }
6661 loopdone:
830247a4 6662 RExC_parse = p - 1;
fac92740 6663 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6664 nextchar(pRExC_state);
793db0cb
JH
6665 {
6666 /* len is STRLEN which is unsigned, need to copy to signed */
6667 IV iv = len;
6668 if (iv < 0)
6669 vFAIL("Internal disaster");
6670 }
a0d0e21e
LW
6671 if (len > 0)
6672 *flagp |= HASWIDTH;
090f7165 6673 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6674 *flagp |= SIMPLE;
3dab1dad 6675
cd439c50 6676 if (SIZE_ONLY)
830247a4 6677 RExC_size += STR_SZ(len);
3dab1dad
YO
6678 else {
6679 STR_LEN(ret) = len;
830247a4 6680 RExC_emit += STR_SZ(len);
07be1b83 6681 }
3dab1dad 6682 }
a0d0e21e
LW
6683 break;
6684 }
a687059c 6685
a0d0e21e 6686 return(ret);
a687059c
LW
6687}
6688
873ef191 6689STATIC char *
5f66b61c 6690S_regwhite(char *p, const char *e)
5b5a24f7
CS
6691{
6692 while (p < e) {
6693 if (isSPACE(*p))
6694 ++p;
6695 else if (*p == '#') {
6696 do {
6697 p++;
6698 } while (p < e && *p != '\n');
6699 }
6700 else
6701 break;
6702 }
6703 return p;
6704}
6705
b8c5462f
JH
6706/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6707 Character classes ([:foo:]) can also be negated ([:^foo:]).
6708 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6709 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6710 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6711
6712#define POSIXCC_DONE(c) ((c) == ':')
6713#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6714#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6715
b8c5462f 6716STATIC I32
830247a4 6717S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6718{
97aff369 6719 dVAR;
936ed897 6720 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6721
830247a4 6722 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6723 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6724 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6725 const char c = UCHARAT(RExC_parse);
097eb12c 6726 char* const s = RExC_parse++;
b81d288d 6727
9a86a77b 6728 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6729 RExC_parse++;
6730 if (RExC_parse == RExC_end)
620e46c5 6731 /* Grandfather lone [:, [=, [. */
830247a4 6732 RExC_parse = s;
620e46c5 6733 else {
3dab1dad 6734 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6735 assert(*t == c);
6736
9a86a77b 6737 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6738 const char *posixcc = s + 1;
830247a4 6739 RExC_parse++; /* skip over the ending ] */
3dab1dad 6740
b8c5462f 6741 if (*s == ':') {
1df70142
AL
6742 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6743 const I32 skip = t - posixcc;
80916619
NC
6744
6745 /* Initially switch on the length of the name. */
6746 switch (skip) {
6747 case 4:
3dab1dad
YO
6748 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6749 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6750 break;
80916619
NC
6751 case 5:
6752 /* Names all of length 5. */
6753 /* alnum alpha ascii blank cntrl digit graph lower
6754 print punct space upper */
6755 /* Offset 4 gives the best switch position. */
6756 switch (posixcc[4]) {
6757 case 'a':
3dab1dad
YO
6758 if (memEQ(posixcc, "alph", 4)) /* alpha */
6759 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6760 break;
6761 case 'e':
3dab1dad
YO
6762 if (memEQ(posixcc, "spac", 4)) /* space */
6763 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6764 break;
6765 case 'h':
3dab1dad
YO
6766 if (memEQ(posixcc, "grap", 4)) /* graph */
6767 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6768 break;
6769 case 'i':
3dab1dad
YO
6770 if (memEQ(posixcc, "asci", 4)) /* ascii */
6771 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6772 break;
6773 case 'k':
3dab1dad
YO
6774 if (memEQ(posixcc, "blan", 4)) /* blank */
6775 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6776 break;
6777 case 'l':
3dab1dad
YO
6778 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6779 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6780 break;
6781 case 'm':
3dab1dad
YO
6782 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6783 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6784 break;
6785 case 'r':
3dab1dad
YO
6786 if (memEQ(posixcc, "lowe", 4)) /* lower */
6787 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6788 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6789 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6790 break;
6791 case 't':
3dab1dad
YO
6792 if (memEQ(posixcc, "digi", 4)) /* digit */
6793 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6794 else if (memEQ(posixcc, "prin", 4)) /* print */
6795 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6796 else if (memEQ(posixcc, "punc", 4)) /* punct */
6797 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6798 break;
b8c5462f
JH
6799 }
6800 break;
80916619 6801 case 6:
3dab1dad
YO
6802 if (memEQ(posixcc, "xdigit", 6))
6803 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6804 break;
6805 }
80916619
NC
6806
6807 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6808 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6809 t - s - 1, s + 1);
80916619
NC
6810 assert (posixcc[skip] == ':');
6811 assert (posixcc[skip+1] == ']');
b45f050a 6812 } else if (!SIZE_ONLY) {
b8c5462f 6813 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6814
830247a4 6815 /* adjust RExC_parse so the warning shows after
b45f050a 6816 the class closes */
9a86a77b 6817 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6818 RExC_parse++;
b45f050a
JF
6819 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6820 }
b8c5462f
JH
6821 } else {
6822 /* Maternal grandfather:
6823 * "[:" ending in ":" but not in ":]" */
830247a4 6824 RExC_parse = s;
767d463e 6825 }
620e46c5
JH
6826 }
6827 }
6828
b8c5462f
JH
6829 return namedclass;
6830}
6831
6832STATIC void
830247a4 6833S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6834{
97aff369 6835 dVAR;
3dab1dad 6836 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6837 const char *s = RExC_parse;
6838 const char c = *s++;
b8c5462f 6839
3dab1dad 6840 while (isALNUM(*s))
b8c5462f
JH
6841 s++;
6842 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6843 if (ckWARN(WARN_REGEXP))
6844 vWARN3(s+2,
6845 "POSIX syntax [%c %c] belongs inside character classes",
6846 c, c);
b45f050a
JF
6847
6848 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6849 if (POSIXCC_NOTYET(c)) {
830247a4 6850 /* adjust RExC_parse so the error shows after
b45f050a 6851 the class closes */
9a86a77b 6852 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6853 NOOP;
b45f050a
JF
6854 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6855 }
b8c5462f
JH
6856 }
6857 }
620e46c5
JH
6858}
6859
7f6f358c
YO
6860
6861/*
6862 parse a class specification and produce either an ANYOF node that
6863 matches the pattern. If the pattern matches a single char only and
6864 that char is < 256 then we produce an EXACT node instead.
6865*/
76e3520e 6866STATIC regnode *
3dab1dad 6867S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6868{
97aff369 6869 dVAR;
9ef43ace 6870 register UV value = 0;
9a86a77b 6871 register UV nextvalue;
3568d838 6872 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6873 register IV range = 0;
c277df42 6874 register regnode *ret;
ba210ebe 6875 STRLEN numlen;
ffc61ed2 6876 IV namedclass;
cbbf8932 6877 char *rangebegin = NULL;
936ed897 6878 bool need_class = 0;
c445ea15 6879 SV *listsv = NULL;
ffc61ed2 6880 UV n;
9e55ce06 6881 bool optimize_invert = TRUE;
cbbf8932 6882 AV* unicode_alternate = NULL;
1b2d223b
JH
6883#ifdef EBCDIC
6884 UV literal_endpoint = 0;
6885#endif
7f6f358c 6886 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6887
3dab1dad 6888 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6889 case we need to change the emitted regop to an EXACT. */
07be1b83 6890 const char * orig_parse = RExC_parse;
72f13be8 6891 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6892#ifndef DEBUGGING
6893 PERL_UNUSED_ARG(depth);
6894#endif
72f13be8 6895
3dab1dad 6896 DEBUG_PARSE("clas");
7f6f358c
YO
6897
6898 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6899 ret = reganode(pRExC_state, ANYOF, 0);
6900
6901 if (!SIZE_ONLY)
6902 ANYOF_FLAGS(ret) = 0;
6903
9a86a77b 6904 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6905 RExC_naughty++;
6906 RExC_parse++;
6907 if (!SIZE_ONLY)
6908 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6909 }
a0d0e21e 6910
73060fc4 6911 if (SIZE_ONLY) {
830247a4 6912 RExC_size += ANYOF_SKIP;
73060fc4
JH
6913 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6914 }
936ed897 6915 else {
830247a4 6916 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6917 if (FOLD)
6918 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6919 if (LOC)
6920 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6921 ANYOF_BITMAP_ZERO(ret);
396482e1 6922 listsv = newSVpvs("# comment\n");
a0d0e21e 6923 }
b8c5462f 6924
9a86a77b
JH
6925 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6926
b938889d 6927 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6928 checkposixcc(pRExC_state);
b8c5462f 6929
f064b6ad
HS
6930 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6931 if (UCHARAT(RExC_parse) == ']')
6932 goto charclassloop;
ffc61ed2 6933
fc8cd66c 6934parseit:
9a86a77b 6935 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6936
6937 charclassloop:
6938
6939 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6940
73b437c8 6941 if (!range)
830247a4 6942 rangebegin = RExC_parse;
ffc61ed2 6943 if (UTF) {
5e12f4fb 6944 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6945 RExC_end - RExC_parse,
9f7f3913 6946 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6947 RExC_parse += numlen;
6948 }
6949 else
6950 value = UCHARAT(RExC_parse++);
7f6f358c 6951
9a86a77b
JH
6952 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6953 if (value == '[' && POSIXCC(nextvalue))
830247a4 6954 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6955 else if (value == '\\') {
ffc61ed2 6956 if (UTF) {
5e12f4fb 6957 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6958 RExC_end - RExC_parse,
9f7f3913 6959 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6960 RExC_parse += numlen;
6961 }
6962 else
6963 value = UCHARAT(RExC_parse++);
470c3474 6964 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6965 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6966 * be a problem later if we want switch on Unicode.
6967 * A similar issue a little bit later when switching on
6968 * namedclass. --jhi */
ffc61ed2 6969 switch ((I32)value) {
b8c5462f
JH
6970 case 'w': namedclass = ANYOF_ALNUM; break;
6971 case 'W': namedclass = ANYOF_NALNUM; break;
6972 case 's': namedclass = ANYOF_SPACE; break;
6973 case 'S': namedclass = ANYOF_NSPACE; break;
6974 case 'd': namedclass = ANYOF_DIGIT; break;
6975 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6976 case 'N': /* Handle \N{NAME} in class */
6977 {
6978 /* We only pay attention to the first char of
6979 multichar strings being returned. I kinda wonder
6980 if this makes sense as it does change the behaviour
6981 from earlier versions, OTOH that behaviour was broken
6982 as well. */
6983 UV v; /* value is register so we cant & it /grrr */
6984 if (reg_namedseq(pRExC_state, &v)) {
6985 goto parseit;
6986 }
6987 value= v;
6988 }
6989 break;
ffc61ed2
JH
6990 case 'p':
6991 case 'P':
3dab1dad
YO
6992 {
6993 char *e;
af6f566e 6994 if (RExC_parse >= RExC_end)
2a4859cd 6995 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6996 if (*RExC_parse == '{') {
1df70142 6997 const U8 c = (U8)value;
ffc61ed2
JH
6998 e = strchr(RExC_parse++, '}');
6999 if (!e)
0da60cf5 7000 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7001 while (isSPACE(UCHARAT(RExC_parse)))
7002 RExC_parse++;
7003 if (e == RExC_parse)
0da60cf5 7004 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7005 n = e - RExC_parse;
ab13f0c7
JH
7006 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7007 n--;
ffc61ed2
JH
7008 }
7009 else {
7010 e = RExC_parse;
7011 n = 1;
7012 }
7013 if (!SIZE_ONLY) {
ab13f0c7
JH
7014 if (UCHARAT(RExC_parse) == '^') {
7015 RExC_parse++;
7016 n--;
7017 value = value == 'p' ? 'P' : 'p'; /* toggle */
7018 while (isSPACE(UCHARAT(RExC_parse))) {
7019 RExC_parse++;
7020 n--;
7021 }
7022 }
097eb12c
AL
7023 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7024 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7025 }
7026 RExC_parse = e + 1;
7027 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7028 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7029 }
f81125e2 7030 break;
b8c5462f
JH
7031 case 'n': value = '\n'; break;
7032 case 'r': value = '\r'; break;
7033 case 't': value = '\t'; break;
7034 case 'f': value = '\f'; break;
7035 case 'b': value = '\b'; break;
c7f1f016
NIS
7036 case 'e': value = ASCII_TO_NATIVE('\033');break;
7037 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7038 case 'x':
ffc61ed2 7039 if (*RExC_parse == '{') {
a4c04bdc
NC
7040 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7041 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7042 char * const e = strchr(RExC_parse++, '}');
b81d288d 7043 if (!e)
ffc61ed2 7044 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7045
7046 numlen = e - RExC_parse;
7047 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7048 RExC_parse = e + 1;
7049 }
7050 else {
a4c04bdc 7051 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7052 numlen = 2;
7053 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7054 RExC_parse += numlen;
7055 }
9e08bc66
TS
7056 if (PL_encoding && value < 0x100)
7057 goto recode_encoding;
b8c5462f
JH
7058 break;
7059 case 'c':
830247a4 7060 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7061 value = toCTRL(value);
7062 break;
7063 case '0': case '1': case '2': case '3': case '4':
7064 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7065 {
7066 I32 flags = 0;
7067 numlen = 3;
7068 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7069 RExC_parse += numlen;
7070 if (PL_encoding && value < 0x100)
7071 goto recode_encoding;
7072 break;
7073 }
7074 recode_encoding:
7075 {
7076 SV* enc = PL_encoding;
7077 value = reg_recode((const char)(U8)value, &enc);
7078 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7079 vWARN(RExC_parse,
7080 "Invalid escape in the specified encoding");
7081 break;
7082 }
1028017a 7083 default:
041457d9 7084 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7085 vWARN2(RExC_parse,
7086 "Unrecognized escape \\%c in character class passed through",
7087 (int)value);
1028017a 7088 break;
b8c5462f 7089 }
ffc61ed2 7090 } /* end of \blah */
1b2d223b
JH
7091#ifdef EBCDIC
7092 else
7093 literal_endpoint++;
7094#endif
ffc61ed2
JH
7095
7096 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7097
7098 if (!SIZE_ONLY && !need_class)
936ed897 7099 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7100
936ed897 7101 need_class = 1;
ffc61ed2
JH
7102
7103 /* a bad range like a-\d, a-[:digit:] ? */
7104 if (range) {
73b437c8 7105 if (!SIZE_ONLY) {
afd78fd5 7106 if (ckWARN(WARN_REGEXP)) {
097eb12c 7107 const int w =
afd78fd5
JH
7108 RExC_parse >= rangebegin ?
7109 RExC_parse - rangebegin : 0;
830247a4 7110 vWARN4(RExC_parse,
b45f050a 7111 "False [] range \"%*.*s\"",
097eb12c 7112 w, w, rangebegin);
afd78fd5 7113 }
3568d838
JH
7114 if (prevvalue < 256) {
7115 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7116 ANYOF_BITMAP_SET(ret, '-');
7117 }
7118 else {
7119 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7120 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7121 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7122 }
b8c5462f 7123 }
ffc61ed2
JH
7124
7125 range = 0; /* this was not a true range */
73b437c8 7126 }
ffc61ed2 7127
73b437c8 7128 if (!SIZE_ONLY) {
c49a72a9
NC
7129 const char *what = NULL;
7130 char yesno = 0;
7131
3568d838
JH
7132 if (namedclass > OOB_NAMEDCLASS)
7133 optimize_invert = FALSE;
e2962f66
JH
7134 /* Possible truncation here but in some 64-bit environments
7135 * the compiler gets heartburn about switch on 64-bit values.
7136 * A similar issue a little earlier when switching on value.
98f323fa 7137 * --jhi */
e2962f66 7138 switch ((I32)namedclass) {
73b437c8
JH
7139 case ANYOF_ALNUM:
7140 if (LOC)
936ed897 7141 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7142 else {
7143 for (value = 0; value < 256; value++)
7144 if (isALNUM(value))
936ed897 7145 ANYOF_BITMAP_SET(ret, value);
73b437c8 7146 }
c49a72a9
NC
7147 yesno = '+';
7148 what = "Word";
73b437c8
JH
7149 break;
7150 case ANYOF_NALNUM:
7151 if (LOC)
936ed897 7152 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7153 else {
7154 for (value = 0; value < 256; value++)
7155 if (!isALNUM(value))
936ed897 7156 ANYOF_BITMAP_SET(ret, value);
73b437c8 7157 }
c49a72a9
NC
7158 yesno = '!';
7159 what = "Word";
73b437c8 7160 break;
ffc61ed2 7161 case ANYOF_ALNUMC:
73b437c8 7162 if (LOC)
ffc61ed2 7163 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7164 else {
7165 for (value = 0; value < 256; value++)
ffc61ed2 7166 if (isALNUMC(value))
936ed897 7167 ANYOF_BITMAP_SET(ret, value);
73b437c8 7168 }
c49a72a9
NC
7169 yesno = '+';
7170 what = "Alnum";
73b437c8
JH
7171 break;
7172 case ANYOF_NALNUMC:
7173 if (LOC)
936ed897 7174 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7175 else {
7176 for (value = 0; value < 256; value++)
7177 if (!isALNUMC(value))
936ed897 7178 ANYOF_BITMAP_SET(ret, value);
73b437c8 7179 }
c49a72a9
NC
7180 yesno = '!';
7181 what = "Alnum";
73b437c8
JH
7182 break;
7183 case ANYOF_ALPHA:
7184 if (LOC)
936ed897 7185 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7186 else {
7187 for (value = 0; value < 256; value++)
7188 if (isALPHA(value))
936ed897 7189 ANYOF_BITMAP_SET(ret, value);
73b437c8 7190 }
c49a72a9
NC
7191 yesno = '+';
7192 what = "Alpha";
73b437c8
JH
7193 break;
7194 case ANYOF_NALPHA:
7195 if (LOC)
936ed897 7196 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7197 else {
7198 for (value = 0; value < 256; value++)
7199 if (!isALPHA(value))
936ed897 7200 ANYOF_BITMAP_SET(ret, value);
73b437c8 7201 }
c49a72a9
NC
7202 yesno = '!';
7203 what = "Alpha";
73b437c8
JH
7204 break;
7205 case ANYOF_ASCII:
7206 if (LOC)
936ed897 7207 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7208 else {
c7f1f016 7209#ifndef EBCDIC
1ba5c669
JH
7210 for (value = 0; value < 128; value++)
7211 ANYOF_BITMAP_SET(ret, value);
7212#else /* EBCDIC */
ffbc6a93 7213 for (value = 0; value < 256; value++) {
3a3c4447
JH
7214 if (isASCII(value))
7215 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7216 }
1ba5c669 7217#endif /* EBCDIC */
73b437c8 7218 }
c49a72a9
NC
7219 yesno = '+';
7220 what = "ASCII";
73b437c8
JH
7221 break;
7222 case ANYOF_NASCII:
7223 if (LOC)
936ed897 7224 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7225 else {
c7f1f016 7226#ifndef EBCDIC
1ba5c669
JH
7227 for (value = 128; value < 256; value++)
7228 ANYOF_BITMAP_SET(ret, value);
7229#else /* EBCDIC */
ffbc6a93 7230 for (value = 0; value < 256; value++) {
3a3c4447
JH
7231 if (!isASCII(value))
7232 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7233 }
1ba5c669 7234#endif /* EBCDIC */
73b437c8 7235 }
c49a72a9
NC
7236 yesno = '!';
7237 what = "ASCII";
73b437c8 7238 break;
aaa51d5e
JF
7239 case ANYOF_BLANK:
7240 if (LOC)
7241 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7242 else {
7243 for (value = 0; value < 256; value++)
7244 if (isBLANK(value))
7245 ANYOF_BITMAP_SET(ret, value);
7246 }
c49a72a9
NC
7247 yesno = '+';
7248 what = "Blank";
aaa51d5e
JF
7249 break;
7250 case ANYOF_NBLANK:
7251 if (LOC)
7252 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7253 else {
7254 for (value = 0; value < 256; value++)
7255 if (!isBLANK(value))
7256 ANYOF_BITMAP_SET(ret, value);
7257 }
c49a72a9
NC
7258 yesno = '!';
7259 what = "Blank";
aaa51d5e 7260 break;
73b437c8
JH
7261 case ANYOF_CNTRL:
7262 if (LOC)
936ed897 7263 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7264 else {
7265 for (value = 0; value < 256; value++)
7266 if (isCNTRL(value))
936ed897 7267 ANYOF_BITMAP_SET(ret, value);
73b437c8 7268 }
c49a72a9
NC
7269 yesno = '+';
7270 what = "Cntrl";
73b437c8
JH
7271 break;
7272 case ANYOF_NCNTRL:
7273 if (LOC)
936ed897 7274 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7275 else {
7276 for (value = 0; value < 256; value++)
7277 if (!isCNTRL(value))
936ed897 7278 ANYOF_BITMAP_SET(ret, value);
73b437c8 7279 }
c49a72a9
NC
7280 yesno = '!';
7281 what = "Cntrl";
ffc61ed2
JH
7282 break;
7283 case ANYOF_DIGIT:
7284 if (LOC)
7285 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7286 else {
7287 /* consecutive digits assumed */
7288 for (value = '0'; value <= '9'; value++)
7289 ANYOF_BITMAP_SET(ret, value);
7290 }
c49a72a9
NC
7291 yesno = '+';
7292 what = "Digit";
ffc61ed2
JH
7293 break;
7294 case ANYOF_NDIGIT:
7295 if (LOC)
7296 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7297 else {
7298 /* consecutive digits assumed */
7299 for (value = 0; value < '0'; value++)
7300 ANYOF_BITMAP_SET(ret, value);
7301 for (value = '9' + 1; value < 256; value++)
7302 ANYOF_BITMAP_SET(ret, value);
7303 }
c49a72a9
NC
7304 yesno = '!';
7305 what = "Digit";
73b437c8
JH
7306 break;
7307 case ANYOF_GRAPH:
7308 if (LOC)
936ed897 7309 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7310 else {
7311 for (value = 0; value < 256; value++)
7312 if (isGRAPH(value))
936ed897 7313 ANYOF_BITMAP_SET(ret, value);
73b437c8 7314 }
c49a72a9
NC
7315 yesno = '+';
7316 what = "Graph";
73b437c8
JH
7317 break;
7318 case ANYOF_NGRAPH:
7319 if (LOC)
936ed897 7320 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7321 else {
7322 for (value = 0; value < 256; value++)
7323 if (!isGRAPH(value))
936ed897 7324 ANYOF_BITMAP_SET(ret, value);
73b437c8 7325 }
c49a72a9
NC
7326 yesno = '!';
7327 what = "Graph";
73b437c8
JH
7328 break;
7329 case ANYOF_LOWER:
7330 if (LOC)
936ed897 7331 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7332 else {
7333 for (value = 0; value < 256; value++)
7334 if (isLOWER(value))
936ed897 7335 ANYOF_BITMAP_SET(ret, value);
73b437c8 7336 }
c49a72a9
NC
7337 yesno = '+';
7338 what = "Lower";
73b437c8
JH
7339 break;
7340 case ANYOF_NLOWER:
7341 if (LOC)
936ed897 7342 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7343 else {
7344 for (value = 0; value < 256; value++)
7345 if (!isLOWER(value))
936ed897 7346 ANYOF_BITMAP_SET(ret, value);
73b437c8 7347 }
c49a72a9
NC
7348 yesno = '!';
7349 what = "Lower";
73b437c8
JH
7350 break;
7351 case ANYOF_PRINT:
7352 if (LOC)
936ed897 7353 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7354 else {
7355 for (value = 0; value < 256; value++)
7356 if (isPRINT(value))
936ed897 7357 ANYOF_BITMAP_SET(ret, value);
73b437c8 7358 }
c49a72a9
NC
7359 yesno = '+';
7360 what = "Print";
73b437c8
JH
7361 break;
7362 case ANYOF_NPRINT:
7363 if (LOC)
936ed897 7364 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7365 else {
7366 for (value = 0; value < 256; value++)
7367 if (!isPRINT(value))
936ed897 7368 ANYOF_BITMAP_SET(ret, value);
73b437c8 7369 }
c49a72a9
NC
7370 yesno = '!';
7371 what = "Print";
73b437c8 7372 break;
aaa51d5e
JF
7373 case ANYOF_PSXSPC:
7374 if (LOC)
7375 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7376 else {
7377 for (value = 0; value < 256; value++)
7378 if (isPSXSPC(value))
7379 ANYOF_BITMAP_SET(ret, value);
7380 }
c49a72a9
NC
7381 yesno = '+';
7382 what = "Space";
aaa51d5e
JF
7383 break;
7384 case ANYOF_NPSXSPC:
7385 if (LOC)
7386 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7387 else {
7388 for (value = 0; value < 256; value++)
7389 if (!isPSXSPC(value))
7390 ANYOF_BITMAP_SET(ret, value);
7391 }
c49a72a9
NC
7392 yesno = '!';
7393 what = "Space";
aaa51d5e 7394 break;
73b437c8
JH
7395 case ANYOF_PUNCT:
7396 if (LOC)
936ed897 7397 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7398 else {
7399 for (value = 0; value < 256; value++)
7400 if (isPUNCT(value))
936ed897 7401 ANYOF_BITMAP_SET(ret, value);
73b437c8 7402 }
c49a72a9
NC
7403 yesno = '+';
7404 what = "Punct";
73b437c8
JH
7405 break;
7406 case ANYOF_NPUNCT:
7407 if (LOC)
936ed897 7408 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7409 else {
7410 for (value = 0; value < 256; value++)
7411 if (!isPUNCT(value))
936ed897 7412 ANYOF_BITMAP_SET(ret, value);
73b437c8 7413 }
c49a72a9
NC
7414 yesno = '!';
7415 what = "Punct";
ffc61ed2
JH
7416 break;
7417 case ANYOF_SPACE:
7418 if (LOC)
7419 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7420 else {
7421 for (value = 0; value < 256; value++)
7422 if (isSPACE(value))
7423 ANYOF_BITMAP_SET(ret, value);
7424 }
c49a72a9
NC
7425 yesno = '+';
7426 what = "SpacePerl";
ffc61ed2
JH
7427 break;
7428 case ANYOF_NSPACE:
7429 if (LOC)
7430 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7431 else {
7432 for (value = 0; value < 256; value++)
7433 if (!isSPACE(value))
7434 ANYOF_BITMAP_SET(ret, value);
7435 }
c49a72a9
NC
7436 yesno = '!';
7437 what = "SpacePerl";
73b437c8
JH
7438 break;
7439 case ANYOF_UPPER:
7440 if (LOC)
936ed897 7441 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7442 else {
7443 for (value = 0; value < 256; value++)
7444 if (isUPPER(value))
936ed897 7445 ANYOF_BITMAP_SET(ret, value);
73b437c8 7446 }
c49a72a9
NC
7447 yesno = '+';
7448 what = "Upper";
73b437c8
JH
7449 break;
7450 case ANYOF_NUPPER:
7451 if (LOC)
936ed897 7452 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7453 else {
7454 for (value = 0; value < 256; value++)
7455 if (!isUPPER(value))
936ed897 7456 ANYOF_BITMAP_SET(ret, value);
73b437c8 7457 }
c49a72a9
NC
7458 yesno = '!';
7459 what = "Upper";
73b437c8
JH
7460 break;
7461 case ANYOF_XDIGIT:
7462 if (LOC)
936ed897 7463 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7464 else {
7465 for (value = 0; value < 256; value++)
7466 if (isXDIGIT(value))
936ed897 7467 ANYOF_BITMAP_SET(ret, value);
73b437c8 7468 }
c49a72a9
NC
7469 yesno = '+';
7470 what = "XDigit";
73b437c8
JH
7471 break;
7472 case ANYOF_NXDIGIT:
7473 if (LOC)
936ed897 7474 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7475 else {
7476 for (value = 0; value < 256; value++)
7477 if (!isXDIGIT(value))
936ed897 7478 ANYOF_BITMAP_SET(ret, value);
73b437c8 7479 }
c49a72a9
NC
7480 yesno = '!';
7481 what = "XDigit";
73b437c8 7482 break;
f81125e2
JP
7483 case ANYOF_MAX:
7484 /* this is to handle \p and \P */
7485 break;
73b437c8 7486 default:
b45f050a 7487 vFAIL("Invalid [::] class");
73b437c8 7488 break;
b8c5462f 7489 }
c49a72a9
NC
7490 if (what) {
7491 /* Strings such as "+utf8::isWord\n" */
7492 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7493 }
b8c5462f 7494 if (LOC)
936ed897 7495 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7496 continue;
a0d0e21e 7497 }
ffc61ed2
JH
7498 } /* end of namedclass \blah */
7499
a0d0e21e 7500 if (range) {
eb160463 7501 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7502 const int w = RExC_parse - rangebegin;
7503 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7504 range = 0; /* not a valid range */
73b437c8 7505 }
a0d0e21e
LW
7506 }
7507 else {
3568d838 7508 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7509 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7510 RExC_parse[1] != ']') {
7511 RExC_parse++;
ffc61ed2
JH
7512
7513 /* a bad range like \w-, [:word:]- ? */
7514 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7515 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7516 const int w =
afd78fd5
JH
7517 RExC_parse >= rangebegin ?
7518 RExC_parse - rangebegin : 0;
830247a4 7519 vWARN4(RExC_parse,
b45f050a 7520 "False [] range \"%*.*s\"",
097eb12c 7521 w, w, rangebegin);
afd78fd5 7522 }
73b437c8 7523 if (!SIZE_ONLY)
936ed897 7524 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7525 } else
ffc61ed2
JH
7526 range = 1; /* yeah, it's a range! */
7527 continue; /* but do it the next time */
a0d0e21e 7528 }
a687059c 7529 }
ffc61ed2 7530
93733859 7531 /* now is the next time */
07be1b83 7532 /*stored += (value - prevvalue + 1);*/
ae5c130c 7533 if (!SIZE_ONLY) {
3568d838 7534 if (prevvalue < 256) {
1df70142 7535 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7536 IV i;
3568d838 7537#ifdef EBCDIC
1b2d223b
JH
7538 /* In EBCDIC [\x89-\x91] should include
7539 * the \x8e but [i-j] should not. */
7540 if (literal_endpoint == 2 &&
7541 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7542 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7543 {
3568d838
JH
7544 if (isLOWER(prevvalue)) {
7545 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7546 if (isLOWER(i))
7547 ANYOF_BITMAP_SET(ret, i);
7548 } else {
3568d838 7549 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7550 if (isUPPER(i))
7551 ANYOF_BITMAP_SET(ret, i);
7552 }
8ada0baa 7553 }
ffc61ed2 7554 else
8ada0baa 7555#endif
07be1b83
YO
7556 for (i = prevvalue; i <= ceilvalue; i++) {
7557 if (!ANYOF_BITMAP_TEST(ret,i)) {
7558 stored++;
7559 ANYOF_BITMAP_SET(ret, i);
7560 }
7561 }
3568d838 7562 }
a5961de5 7563 if (value > 255 || UTF) {
1df70142
AL
7564 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7565 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7566 stored+=2; /* can't optimize this class */
ffc61ed2 7567 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7568 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7569 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7570 prevnatvalue, natvalue);
7571 }
7572 else if (prevnatvalue == natvalue) {
7573 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7574 if (FOLD) {
89ebb4a3 7575 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7576 STRLEN foldlen;
1df70142 7577 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7578
e294cc5d
JH
7579#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7580 if (RExC_precomp[0] == ':' &&
7581 RExC_precomp[1] == '[' &&
7582 (f == 0xDF || f == 0x92)) {
7583 f = NATIVE_TO_UNI(f);
7584 }
7585#endif
c840d2a2
JH
7586 /* If folding and foldable and a single
7587 * character, insert also the folded version
7588 * to the charclass. */
9e55ce06 7589 if (f != value) {
e294cc5d
JH
7590#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7591 if ((RExC_precomp[0] == ':' &&
7592 RExC_precomp[1] == '[' &&
7593 (f == 0xA2 &&
7594 (value == 0xFB05 || value == 0xFB06))) ?
7595 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7596 foldlen == (STRLEN)UNISKIP(f) )
7597#else
eb160463 7598 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7599#endif
9e55ce06
JH
7600 Perl_sv_catpvf(aTHX_ listsv,
7601 "%04"UVxf"\n", f);
7602 else {
7603 /* Any multicharacter foldings
7604 * require the following transform:
7605 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7606 * where E folds into "pq" and F folds
7607 * into "rst", all other characters
7608 * fold to single characters. We save
7609 * away these multicharacter foldings,
7610 * to be later saved as part of the
7611 * additional "s" data. */
7612 SV *sv;
7613
7614 if (!unicode_alternate)
7615 unicode_alternate = newAV();
7616 sv = newSVpvn((char*)foldbuf, foldlen);
7617 SvUTF8_on(sv);
7618 av_push(unicode_alternate, sv);
7619 }
7620 }
254ba52a 7621
60a8b682
JH
7622 /* If folding and the value is one of the Greek
7623 * sigmas insert a few more sigmas to make the
7624 * folding rules of the sigmas to work right.
7625 * Note that not all the possible combinations
7626 * are handled here: some of them are handled
9e55ce06
JH
7627 * by the standard folding rules, and some of
7628 * them (literal or EXACTF cases) are handled
7629 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7630 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7631 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7632 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7633 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7634 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7635 }
7636 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7637 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7638 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7639 }
7640 }
ffc61ed2 7641 }
1b2d223b
JH
7642#ifdef EBCDIC
7643 literal_endpoint = 0;
7644#endif
8ada0baa 7645 }
ffc61ed2
JH
7646
7647 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7648 }
ffc61ed2 7649
936ed897 7650 if (need_class) {
4f66b38d 7651 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7652 if (SIZE_ONLY)
830247a4 7653 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7654 else
830247a4 7655 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7656 }
ffc61ed2 7657
7f6f358c
YO
7658
7659 if (SIZE_ONLY)
7660 return ret;
7661 /****** !SIZE_ONLY AFTER HERE *********/
7662
7663 if( stored == 1 && value < 256
7664 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7665 ) {
7666 /* optimize single char class to an EXACT node
7667 but *only* when its not a UTF/high char */
07be1b83
YO
7668 const char * cur_parse= RExC_parse;
7669 RExC_emit = (regnode *)orig_emit;
7670 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7671 ret = reg_node(pRExC_state,
7672 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7673 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7674 *STRING(ret)= (char)value;
7675 STR_LEN(ret)= 1;
7676 RExC_emit += STR_SZ(1);
7677 return ret;
7678 }
ae5c130c 7679 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7680 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7681 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7682 ) {
a0ed51b3 7683 for (value = 0; value < 256; ++value) {
936ed897 7684 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7685 UV fold = PL_fold[value];
ffc61ed2
JH
7686
7687 if (fold != value)
7688 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7689 }
7690 }
936ed897 7691 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7692 }
ffc61ed2 7693
ae5c130c 7694 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7695 if (optimize_invert &&
ffc61ed2
JH
7696 /* If the only flag is inversion. */
7697 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7698 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7699 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7700 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7701 }
7f6f358c 7702 {
097eb12c 7703 AV * const av = newAV();
ffc61ed2 7704 SV *rv;
9e55ce06 7705 /* The 0th element stores the character class description
6a0407ee 7706 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7707 * to initialize the appropriate swash (which gets stored in
7708 * the 1st element), and also useful for dumping the regnode.
7709 * The 2nd element stores the multicharacter foldings,
6a0407ee 7710 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7711 av_store(av, 0, listsv);
7712 av_store(av, 1, NULL);
9e55ce06 7713 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7714 rv = newRV_noinc((SV*)av);
19860706 7715 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7716 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7717 ARG_SET(ret, n);
a0ed51b3 7718 }
a0ed51b3
LW
7719 return ret;
7720}
7721
76e3520e 7722STATIC char*
830247a4 7723S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7724{
097eb12c 7725 char* const retval = RExC_parse++;
a0d0e21e 7726
4633a7c4 7727 for (;;) {
830247a4
IZ
7728 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7729 RExC_parse[2] == '#') {
e994fd66
AE
7730 while (*RExC_parse != ')') {
7731 if (RExC_parse == RExC_end)
7732 FAIL("Sequence (?#... not terminated");
830247a4 7733 RExC_parse++;
e994fd66 7734 }
830247a4 7735 RExC_parse++;
4633a7c4
LW
7736 continue;
7737 }
bbe252da 7738 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7739 if (isSPACE(*RExC_parse)) {
7740 RExC_parse++;
748a9306
LW
7741 continue;
7742 }
830247a4 7743 else if (*RExC_parse == '#') {
e994fd66
AE
7744 while (RExC_parse < RExC_end)
7745 if (*RExC_parse++ == '\n') break;
748a9306
LW
7746 continue;
7747 }
748a9306 7748 }
4633a7c4 7749 return retval;
a0d0e21e 7750 }
a687059c
LW
7751}
7752
7753/*
c277df42 7754- reg_node - emit a node
a0d0e21e 7755*/
76e3520e 7756STATIC regnode * /* Location. */
830247a4 7757S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7758{
97aff369 7759 dVAR;
c277df42 7760 register regnode *ptr;
504618e9 7761 regnode * const ret = RExC_emit;
07be1b83 7762 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7763
c277df42 7764 if (SIZE_ONLY) {
830247a4
IZ
7765 SIZE_ALIGN(RExC_size);
7766 RExC_size += 1;
a0d0e21e
LW
7767 return(ret);
7768 }
e2e6a0f1
YO
7769#ifdef DEBUGGING
7770 if (OP(RExC_emit) == 255)
7771 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7772 reg_name[op], OP(RExC_emit));
7773#endif
c277df42 7774 NODE_ALIGN_FILL(ret);
a0d0e21e 7775 ptr = ret;
c277df42 7776 FILL_ADVANCE_NODE(ptr, op);
fac92740 7777 if (RExC_offsets) { /* MJD */
07be1b83 7778 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7779 "reg_node", __LINE__,
7780 reg_name[op],
07be1b83
YO
7781 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7782 ? "Overwriting end of array!\n" : "OK",
7783 (UV)(RExC_emit - RExC_emit_start),
7784 (UV)(RExC_parse - RExC_start),
7785 (UV)RExC_offsets[0]));
ccb2c380 7786 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7787 }
07be1b83 7788
830247a4 7789 RExC_emit = ptr;
a0d0e21e 7790 return(ret);
a687059c
LW
7791}
7792
7793/*
a0d0e21e
LW
7794- reganode - emit a node with an argument
7795*/
76e3520e 7796STATIC regnode * /* Location. */
830247a4 7797S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7798{
97aff369 7799 dVAR;
c277df42 7800 register regnode *ptr;
504618e9 7801 regnode * const ret = RExC_emit;
07be1b83 7802 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7803
c277df42 7804 if (SIZE_ONLY) {
830247a4
IZ
7805 SIZE_ALIGN(RExC_size);
7806 RExC_size += 2;
6bda09f9
YO
7807 /*
7808 We can't do this:
7809
7810 assert(2==regarglen[op]+1);
7811
7812 Anything larger than this has to allocate the extra amount.
7813 If we changed this to be:
7814
7815 RExC_size += (1 + regarglen[op]);
7816
7817 then it wouldn't matter. Its not clear what side effect
7818 might come from that so its not done so far.
7819 -- dmq
7820 */
a0d0e21e
LW
7821 return(ret);
7822 }
e2e6a0f1
YO
7823#ifdef DEBUGGING
7824 if (OP(RExC_emit) == 255)
7825 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7826#endif
c277df42 7827 NODE_ALIGN_FILL(ret);
a0d0e21e 7828 ptr = ret;
c277df42 7829 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7830 if (RExC_offsets) { /* MJD */
07be1b83 7831 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7832 "reganode",
ccb2c380
MP
7833 __LINE__,
7834 reg_name[op],
07be1b83 7835 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7836 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7837 (UV)(RExC_emit - RExC_emit_start),
7838 (UV)(RExC_parse - RExC_start),
7839 (UV)RExC_offsets[0]));
ccb2c380 7840 Set_Cur_Node_Offset;
fac92740
MJD
7841 }
7842
830247a4 7843 RExC_emit = ptr;
a0d0e21e 7844 return(ret);
fe14fcc3
LW
7845}
7846
7847/*
cd439c50 7848- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7849*/
71207a34
AL
7850STATIC STRLEN
7851S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7852{
97aff369 7853 dVAR;
71207a34 7854 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7855}
7856
7857/*
a0d0e21e
LW
7858- reginsert - insert an operator in front of already-emitted operand
7859*
7860* Means relocating the operand.
7861*/
76e3520e 7862STATIC void
6bda09f9 7863S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7864{
97aff369 7865 dVAR;
c277df42
IZ
7866 register regnode *src;
7867 register regnode *dst;
7868 register regnode *place;
504618e9 7869 const int offset = regarglen[(U8)op];
6bda09f9 7870 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7871 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7872/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7873 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7874 if (SIZE_ONLY) {
6bda09f9 7875 RExC_size += size;
a0d0e21e
LW
7876 return;
7877 }
a687059c 7878
830247a4 7879 src = RExC_emit;
6bda09f9 7880 RExC_emit += size;
830247a4 7881 dst = RExC_emit;
40d049e4 7882 if (RExC_open_parens) {
6bda09f9 7883 int paren;
6d99fb9b 7884 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7885 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7886 if ( RExC_open_parens[paren] >= opnd ) {
7887 DEBUG_PARSE_FMT("open"," - %d",size);
7888 RExC_open_parens[paren] += size;
7889 } else {
7890 DEBUG_PARSE_FMT("open"," - %s","ok");
7891 }
7892 if ( RExC_close_parens[paren] >= opnd ) {
7893 DEBUG_PARSE_FMT("close"," - %d",size);
7894 RExC_close_parens[paren] += size;
7895 } else {
7896 DEBUG_PARSE_FMT("close"," - %s","ok");
7897 }
7898 }
6bda09f9 7899 }
40d049e4 7900
fac92740 7901 while (src > opnd) {
c277df42 7902 StructCopy(--src, --dst, regnode);
fac92740 7903 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7904 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7905 "reg_insert",
ccb2c380
MP
7906 __LINE__,
7907 reg_name[op],
07be1b83
YO
7908 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7909 ? "Overwriting end of array!\n" : "OK",
7910 (UV)(src - RExC_emit_start),
7911 (UV)(dst - RExC_emit_start),
7912 (UV)RExC_offsets[0]));
ccb2c380
MP
7913 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7914 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7915 }
7916 }
7917
a0d0e21e
LW
7918
7919 place = opnd; /* Op node, where operand used to be. */
fac92740 7920 if (RExC_offsets) { /* MJD */
07be1b83 7921 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7922 "reginsert",
ccb2c380
MP
7923 __LINE__,
7924 reg_name[op],
07be1b83 7925 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7926 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7927 (UV)(place - RExC_emit_start),
7928 (UV)(RExC_parse - RExC_start),
786e8c11 7929 (UV)RExC_offsets[0]));
ccb2c380 7930 Set_Node_Offset(place, RExC_parse);
45948336 7931 Set_Node_Length(place, 1);
fac92740 7932 }
c277df42
IZ
7933 src = NEXTOPER(place);
7934 FILL_ADVANCE_NODE(place, op);
7935 Zero(src, offset, regnode);
a687059c
LW
7936}
7937
7938/*
c277df42 7939- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7940- SEE ALSO: regtail_study
a0d0e21e 7941*/
097eb12c 7942/* TODO: All three parms should be const */
76e3520e 7943STATIC void
3dab1dad 7944S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7945{
97aff369 7946 dVAR;
c277df42 7947 register regnode *scan;
72f13be8 7948 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7949#ifndef DEBUGGING
7950 PERL_UNUSED_ARG(depth);
7951#endif
a0d0e21e 7952
c277df42 7953 if (SIZE_ONLY)
a0d0e21e
LW
7954 return;
7955
7956 /* Find last node. */
7957 scan = p;
7958 for (;;) {
504618e9 7959 regnode * const temp = regnext(scan);
3dab1dad
YO
7960 DEBUG_PARSE_r({
7961 SV * const mysv=sv_newmortal();
7962 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7963 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7964 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7965 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7966 (temp == NULL ? "->" : ""),
7967 (temp == NULL ? reg_name[OP(val)] : "")
7968 );
3dab1dad
YO
7969 });
7970 if (temp == NULL)
7971 break;
7972 scan = temp;
7973 }
7974
7975 if (reg_off_by_arg[OP(scan)]) {
7976 ARG_SET(scan, val - scan);
7977 }
7978 else {
7979 NEXT_OFF(scan) = val - scan;
7980 }
7981}
7982
07be1b83 7983#ifdef DEBUGGING
3dab1dad
YO
7984/*
7985- regtail_study - set the next-pointer at the end of a node chain of p to val.
7986- Look for optimizable sequences at the same time.
7987- currently only looks for EXACT chains.
07be1b83
YO
7988
7989This is expermental code. The idea is to use this routine to perform
7990in place optimizations on branches and groups as they are constructed,
7991with the long term intention of removing optimization from study_chunk so
7992that it is purely analytical.
7993
7994Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7995to control which is which.
7996
3dab1dad
YO
7997*/
7998/* TODO: All four parms should be const */
07be1b83 7999
3dab1dad
YO
8000STATIC U8
8001S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8002{
8003 dVAR;
8004 register regnode *scan;
07be1b83
YO
8005 U8 exact = PSEUDO;
8006#ifdef EXPERIMENTAL_INPLACESCAN
8007 I32 min = 0;
8008#endif
8009
3dab1dad
YO
8010 GET_RE_DEBUG_FLAGS_DECL;
8011
07be1b83 8012
3dab1dad
YO
8013 if (SIZE_ONLY)
8014 return exact;
8015
8016 /* Find last node. */
8017
8018 scan = p;
8019 for (;;) {
8020 regnode * const temp = regnext(scan);
07be1b83
YO
8021#ifdef EXPERIMENTAL_INPLACESCAN
8022 if (PL_regkind[OP(scan)] == EXACT)
8023 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8024 return EXACT;
8025#endif
3dab1dad
YO
8026 if ( exact ) {
8027 switch (OP(scan)) {
8028 case EXACT:
8029 case EXACTF:
8030 case EXACTFL:
8031 if( exact == PSEUDO )
8032 exact= OP(scan);
07be1b83
YO
8033 else if ( exact != OP(scan) )
8034 exact= 0;
3dab1dad
YO
8035 case NOTHING:
8036 break;
8037 default:
8038 exact= 0;
8039 }
8040 }
8041 DEBUG_PARSE_r({
8042 SV * const mysv=sv_newmortal();
8043 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8044 regprop(RExC_rx, mysv, scan);
eaf3ca90 8045 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8046 SvPV_nolen_const(mysv),
eaf3ca90
YO
8047 REG_NODE_NUM(scan),
8048 reg_name[exact]);
3dab1dad 8049 });
a0d0e21e
LW
8050 if (temp == NULL)
8051 break;
8052 scan = temp;
8053 }
07be1b83
YO
8054 DEBUG_PARSE_r({
8055 SV * const mysv_val=sv_newmortal();
8056 DEBUG_PARSE_MSG("");
8057 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8058 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8059 SvPV_nolen_const(mysv_val),
8060 (IV)REG_NODE_NUM(val),
8061 (IV)(val - scan)
07be1b83
YO
8062 );
8063 });
c277df42
IZ
8064 if (reg_off_by_arg[OP(scan)]) {
8065 ARG_SET(scan, val - scan);
a0ed51b3
LW
8066 }
8067 else {
c277df42
IZ
8068 NEXT_OFF(scan) = val - scan;
8069 }
3dab1dad
YO
8070
8071 return exact;
a687059c 8072}
07be1b83 8073#endif
a687059c
LW
8074
8075/*
a687059c
LW
8076 - regcurly - a little FSA that accepts {\d+,?\d*}
8077 */
79072805 8078STATIC I32
5f66b61c 8079S_regcurly(register const char *s)
a687059c
LW
8080{
8081 if (*s++ != '{')
8082 return FALSE;
f0fcb552 8083 if (!isDIGIT(*s))
a687059c 8084 return FALSE;
f0fcb552 8085 while (isDIGIT(*s))
a687059c
LW
8086 s++;
8087 if (*s == ',')
8088 s++;
f0fcb552 8089 while (isDIGIT(*s))
a687059c
LW
8090 s++;
8091 if (*s != '}')
8092 return FALSE;
8093 return TRUE;
8094}
8095
a687059c
LW
8096
8097/*
fd181c75 8098 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8099 */
8100void
097eb12c 8101Perl_regdump(pTHX_ const regexp *r)
a687059c 8102{
35ff7856 8103#ifdef DEBUGGING
97aff369 8104 dVAR;
c445ea15 8105 SV * const sv = sv_newmortal();
ab3bbdeb 8106 SV *dsv= sv_newmortal();
f8fc2ecf 8107 RXi_GET_DECL(r,ri);
a687059c 8108
f8fc2ecf 8109 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8110
8111 /* Header fields of interest. */
ab3bbdeb
YO
8112 if (r->anchored_substr) {
8113 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8114 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8115 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8116 "anchored %s%s at %"IVdf" ",
8117 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8118 (IV)r->anchored_offset);
ab3bbdeb
YO
8119 } else if (r->anchored_utf8) {
8120 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8121 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8122 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8123 "anchored utf8 %s%s at %"IVdf" ",
8124 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8125 (IV)r->anchored_offset);
ab3bbdeb
YO
8126 }
8127 if (r->float_substr) {
8128 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8129 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8130 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8131 "floating %s%s at %"IVdf"..%"UVuf" ",
8132 s, RE_SV_TAIL(r->float_substr),
7b0972df 8133 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8134 } else if (r->float_utf8) {
8135 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8136 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8137 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8138 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8139 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8140 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8141 }
33b8afdf 8142 if (r->check_substr || r->check_utf8)
b81d288d 8143 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8144 (const char *)
8145 (r->check_substr == r->float_substr
8146 && r->check_utf8 == r->float_utf8
8147 ? "(checking floating" : "(checking anchored"));
bbe252da 8148 if (r->extflags & RXf_NOSCAN)
c277df42 8149 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8150 if (r->extflags & RXf_CHECK_ALL)
c277df42 8151 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8152 if (r->check_substr || r->check_utf8)
c277df42
IZ
8153 PerlIO_printf(Perl_debug_log, ") ");
8154
f8fc2ecf
YO
8155 if (ri->regstclass) {
8156 regprop(r, sv, ri->regstclass);
1de06328 8157 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8158 }
bbe252da 8159 if (r->extflags & RXf_ANCH) {
774d564b 8160 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8161 if (r->extflags & RXf_ANCH_BOL)
774d564b 8162 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8163 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8164 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8165 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8166 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8167 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8168 PerlIO_printf(Perl_debug_log, "(GPOS)");
8169 PerlIO_putc(Perl_debug_log, ' ');
8170 }
bbe252da 8171 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8172 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8173 if (r->intflags & PREGf_SKIP)
760ac839 8174 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8175 if (r->intflags & PREGf_IMPLICIT)
760ac839 8176 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8177 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8178 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8179 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8180 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8181#else
96a5add6 8182 PERL_UNUSED_CONTEXT;
65e66c80 8183 PERL_UNUSED_ARG(r);
17c3b450 8184#endif /* DEBUGGING */
a687059c
LW
8185}
8186
8187/*
a0d0e21e
LW
8188- regprop - printable representation of opcode
8189*/
46fc3d4c 8190void
32fc9b6a 8191Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8192{
35ff7856 8193#ifdef DEBUGGING
97aff369 8194 dVAR;
9b155405 8195 register int k;
f8fc2ecf 8196 RXi_GET_DECL(prog,progi);
1de06328 8197 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8198
a0d0e21e 8199
54dc92de 8200 sv_setpvn(sv, "", 0);
8aa23a47 8201
03363afd 8202 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8203 /* It would be nice to FAIL() here, but this may be called from
8204 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8205 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8206 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8207
3dab1dad 8208 k = PL_regkind[OP(o)];
9b155405 8209
2a782b5b 8210 if (k == EXACT) {
396482e1 8211 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8212 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8213 * is a crude hack but it may be the best for now since
8214 * we have no flag "this EXACTish node was UTF-8"
8215 * --jhi */
8216 const char * const s =
ddc5bc0f 8217 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8218 PL_colors[0], PL_colors[1],
8219 PERL_PV_ESCAPE_UNI_DETECT |
8220 PERL_PV_PRETTY_ELIPSES |
8221 PERL_PV_PRETTY_LTGT
8222 );
8223 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8224 } else if (k == TRIE) {
3dab1dad 8225 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8226 * progi->data isn't available here */
1de06328
YO
8227 const char op = OP(o);
8228 const I32 n = ARG(o);
8229 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8230 (reg_ac_data *)progi->data->data[n] :
1de06328 8231 NULL;
3251b653
NC
8232 const reg_trie_data * const trie
8233 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8234
8235 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8236 DEBUG_TRIE_COMPILE_r(
8237 Perl_sv_catpvf(aTHX_ sv,
8238 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8239 (UV)trie->startstate,
1e2e3d02 8240 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8241 (UV)trie->wordcount,
8242 (UV)trie->minlen,
8243 (UV)trie->maxlen,
8244 (UV)TRIE_CHARCOUNT(trie),
8245 (UV)trie->uniquecharcount
8246 )
8247 );
8248 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8249 int i;
8250 int rangestart = -1;
f46cb337 8251 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8252 Perl_sv_catpvf(aTHX_ sv, "[");
8253 for (i = 0; i <= 256; i++) {
8254 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8255 if (rangestart == -1)
8256 rangestart = i;
8257 } else if (rangestart != -1) {
8258 if (i <= rangestart + 3)
8259 for (; rangestart < i; rangestart++)
8260 put_byte(sv, rangestart);
8261 else {
8262 put_byte(sv, rangestart);
8263 sv_catpvs(sv, "-");
8264 put_byte(sv, i - 1);
8265 }
8266 rangestart = -1;
8267 }
8268 }
8269 Perl_sv_catpvf(aTHX_ sv, "]");
8270 }
8271
a3621e74 8272 } else if (k == CURLY) {
cb434fcc 8273 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8274 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8275 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8276 }
2c2d71f5
JH
8277 else if (k == WHILEM && o->flags) /* Ordinal/of */
8278 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8279 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8280 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8281 else if (k == GOSUB)
6bda09f9 8282 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8283 else if (k == VERB) {
8284 if (!o->flags)
8285 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8286 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8287 } else if (k == LOGICAL)
04ebc1ab 8288 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8289 else if (k == ANYOF) {
8290 int i, rangestart = -1;
2d03de9c 8291 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8292
8293 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8294 static const char * const anyofs[] = {
653099ff
GS
8295 "\\w",
8296 "\\W",
8297 "\\s",
8298 "\\S",
8299 "\\d",
8300 "\\D",
8301 "[:alnum:]",
8302 "[:^alnum:]",
8303 "[:alpha:]",
8304 "[:^alpha:]",
8305 "[:ascii:]",
8306 "[:^ascii:]",
8307 "[:ctrl:]",
8308 "[:^ctrl:]",
8309 "[:graph:]",
8310 "[:^graph:]",
8311 "[:lower:]",
8312 "[:^lower:]",
8313 "[:print:]",
8314 "[:^print:]",
8315 "[:punct:]",
8316 "[:^punct:]",
8317 "[:upper:]",
aaa51d5e 8318 "[:^upper:]",
653099ff 8319 "[:xdigit:]",
aaa51d5e
JF
8320 "[:^xdigit:]",
8321 "[:space:]",
8322 "[:^space:]",
8323 "[:blank:]",
8324 "[:^blank:]"
653099ff
GS
8325 };
8326
19860706 8327 if (flags & ANYOF_LOCALE)
396482e1 8328 sv_catpvs(sv, "{loc}");
19860706 8329 if (flags & ANYOF_FOLD)
396482e1 8330 sv_catpvs(sv, "{i}");
653099ff 8331 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8332 if (flags & ANYOF_INVERT)
396482e1 8333 sv_catpvs(sv, "^");
ffc61ed2
JH
8334 for (i = 0; i <= 256; i++) {
8335 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8336 if (rangestart == -1)
8337 rangestart = i;
8338 } else if (rangestart != -1) {
8339 if (i <= rangestart + 3)
8340 for (; rangestart < i; rangestart++)
653099ff 8341 put_byte(sv, rangestart);
ffc61ed2
JH
8342 else {
8343 put_byte(sv, rangestart);
396482e1 8344 sv_catpvs(sv, "-");
ffc61ed2 8345 put_byte(sv, i - 1);
653099ff 8346 }
ffc61ed2 8347 rangestart = -1;
653099ff 8348 }
847a199f 8349 }
ffc61ed2
JH
8350
8351 if (o->flags & ANYOF_CLASS)
bb7a0f54 8352 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8353 if (ANYOF_CLASS_TEST(o,i))
8354 sv_catpv(sv, anyofs[i]);
8355
8356 if (flags & ANYOF_UNICODE)
396482e1 8357 sv_catpvs(sv, "{unicode}");
1aa99e6b 8358 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8359 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8360
8361 {
8362 SV *lv;
32fc9b6a 8363 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8364
ffc61ed2
JH
8365 if (lv) {
8366 if (sw) {
89ebb4a3 8367 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8368
ffc61ed2 8369 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8370 uvchr_to_utf8(s, i);
ffc61ed2 8371
3568d838 8372 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8373 if (rangestart == -1)
8374 rangestart = i;
8375 } else if (rangestart != -1) {
ffc61ed2
JH
8376 if (i <= rangestart + 3)
8377 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8378 const U8 * const e = uvchr_to_utf8(s,rangestart);
8379 U8 *p;
8380 for(p = s; p < e; p++)
ffc61ed2
JH
8381 put_byte(sv, *p);
8382 }
8383 else {
2d03de9c
AL
8384 const U8 *e = uvchr_to_utf8(s,rangestart);
8385 U8 *p;
8386 for (p = s; p < e; p++)
ffc61ed2 8387 put_byte(sv, *p);
396482e1 8388 sv_catpvs(sv, "-");
2d03de9c
AL
8389 e = uvchr_to_utf8(s, i-1);
8390 for (p = s; p < e; p++)
1df70142 8391 put_byte(sv, *p);
ffc61ed2
JH
8392 }
8393 rangestart = -1;
8394 }
19860706 8395 }
ffc61ed2 8396
396482e1 8397 sv_catpvs(sv, "..."); /* et cetera */
19860706 8398 }
fde631ed 8399
ffc61ed2 8400 {
2e0de35c 8401 char *s = savesvpv(lv);
c445ea15 8402 char * const origs = s;
b81d288d 8403
3dab1dad
YO
8404 while (*s && *s != '\n')
8405 s++;
b81d288d 8406
ffc61ed2 8407 if (*s == '\n') {
2d03de9c 8408 const char * const t = ++s;
ffc61ed2
JH
8409
8410 while (*s) {
8411 if (*s == '\n')
8412 *s = ' ';
8413 s++;
8414 }
8415 if (s[-1] == ' ')
8416 s[-1] = 0;
8417
8418 sv_catpv(sv, t);
fde631ed 8419 }
b81d288d 8420
ffc61ed2 8421 Safefree(origs);
fde631ed
JH
8422 }
8423 }
653099ff 8424 }
ffc61ed2 8425
653099ff
GS
8426 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8427 }
9b155405 8428 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8429 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8430#else
96a5add6 8431 PERL_UNUSED_CONTEXT;
65e66c80
SP
8432 PERL_UNUSED_ARG(sv);
8433 PERL_UNUSED_ARG(o);
f9049ba1 8434 PERL_UNUSED_ARG(prog);
17c3b450 8435#endif /* DEBUGGING */
35ff7856 8436}
a687059c 8437
cad2e5aa
JH
8438SV *
8439Perl_re_intuit_string(pTHX_ regexp *prog)
8440{ /* Assume that RE_INTUIT is set */
97aff369 8441 dVAR;
a3621e74 8442 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8443 PERL_UNUSED_CONTEXT;
8444
a3621e74 8445 DEBUG_COMPILE_r(
cfd0369c 8446 {
2d03de9c 8447 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8448 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8449
8450 if (!PL_colorset) reginitcolors();
8451 PerlIO_printf(Perl_debug_log,
a0288114 8452 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8453 PL_colors[4],
8454 prog->check_substr ? "" : "utf8 ",
8455 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8456 s,
8457 PL_colors[1],
8458 (strlen(s) > 60 ? "..." : ""));
8459 } );
8460
33b8afdf 8461 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8462}
8463
84da74a7
YO
8464/*
8465 pregfree - free a regexp
8466
8467 See regdupe below if you change anything here.
8468*/
8469
2b69d0c2 8470void
864dbfa3 8471Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8472{
27da23d5 8473 dVAR;
f8fc2ecf 8474 RXi_GET_DECL(r,ri);
fc32ee4a 8475 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8476
7821416a
IZ
8477 if (!r || (--r->refcnt > 0))
8478 return;
ab3bbdeb 8479 DEBUG_COMPILE_r({
0df25f3d
YO
8480 if (!PL_colorset)
8481 reginitcolors();
5c9f2f80
YO
8482 {
8483 SV *dsv= sv_newmortal();
bbe252da 8484 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
ab3bbdeb
YO
8485 dsv, r->precomp, r->prelen, 60);
8486 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8487 PL_colors[4],PL_colors[5],s);
8488 }
9e55ce06 8489 });
cad2e5aa 8490
43c5f42d
NC
8491 /* gcov results gave these as non-null 100% of the time, so there's no
8492 optimisation in checking them before calling Safefree */
8493 Safefree(r->precomp);
f8fc2ecf 8494 Safefree(ri->offsets); /* 20010421 MJD */
ed252734 8495 RX_MATCH_COPY_FREE(r);
f8c7b90f 8496#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8497 if (r->saved_copy)
8498 SvREFCNT_dec(r->saved_copy);
8499#endif
a193d654
GS
8500 if (r->substrs) {
8501 if (r->anchored_substr)
8502 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8503 if (r->anchored_utf8)
8504 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8505 if (r->float_substr)
8506 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8507 if (r->float_utf8)
8508 SvREFCNT_dec(r->float_utf8);
2779dcf1 8509 Safefree(r->substrs);
a193d654 8510 }
81714fb9
YO
8511 if (r->paren_names)
8512 SvREFCNT_dec(r->paren_names);
f8fc2ecf
YO
8513 if (ri->data) {
8514 int n = ri->data->count;
f3548bdc
DM
8515 PAD* new_comppad = NULL;
8516 PAD* old_comppad;
4026c95a 8517 PADOFFSET refcnt;
dfad63ad 8518
c277df42 8519 while (--n >= 0) {
261faec3 8520 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8521 switch (ri->data->what[n]) {
c277df42 8522 case 's':
81714fb9 8523 case 'S':
f8fc2ecf 8524 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8525 break;
653099ff 8526 case 'f':
f8fc2ecf 8527 Safefree(ri->data->data[n]);
653099ff 8528 break;
dfad63ad 8529 case 'p':
f8fc2ecf 8530 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8531 break;
c277df42 8532 case 'o':
dfad63ad 8533 if (new_comppad == NULL)
cea2e8a9 8534 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8535 PAD_SAVE_LOCAL(old_comppad,
8536 /* Watch out for global destruction's random ordering. */
c445ea15 8537 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8538 );
b34c0dd4 8539 OP_REFCNT_LOCK;
f8fc2ecf 8540 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8541 OP_REFCNT_UNLOCK;
8542 if (!refcnt)
f8fc2ecf 8543 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8544
f3548bdc 8545 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8546 SvREFCNT_dec((SV*)new_comppad);
8547 new_comppad = NULL;
c277df42
IZ
8548 break;
8549 case 'n':
9e55ce06 8550 break;
07be1b83 8551 case 'T':
be8e71aa
YO
8552 { /* Aho Corasick add-on structure for a trie node.
8553 Used in stclass optimization only */
07be1b83 8554 U32 refcount;
f8fc2ecf 8555 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8556 OP_REFCNT_LOCK;
8557 refcount = --aho->refcount;
8558 OP_REFCNT_UNLOCK;
8559 if ( !refcount ) {
446bd890
NC
8560 PerlMemShared_free(aho->states);
8561 PerlMemShared_free(aho->fail);
446bd890
NC
8562 /* do this last!!!! */
8563 PerlMemShared_free(ri->data->data[n]);
8564 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8565 }
8566 }
8567 break;
a3621e74 8568 case 't':
07be1b83 8569 {
be8e71aa 8570 /* trie structure. */
07be1b83 8571 U32 refcount;
f8fc2ecf 8572 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8573 OP_REFCNT_LOCK;
8574 refcount = --trie->refcount;
8575 OP_REFCNT_UNLOCK;
8576 if ( !refcount ) {
446bd890 8577 PerlMemShared_free(trie->charmap);
07be1b83
YO
8578 if (trie->widecharmap)
8579 SvREFCNT_dec((SV*)trie->widecharmap);
446bd890
NC
8580 PerlMemShared_free(trie->states);
8581 PerlMemShared_free(trie->trans);
07be1b83 8582 if (trie->bitmap)
446bd890 8583 PerlMemShared_free(trie->bitmap);
07be1b83 8584 if (trie->wordlen)
446bd890 8585 PerlMemShared_free(trie->wordlen);
786e8c11 8586 if (trie->jump)
446bd890 8587 PerlMemShared_free(trie->jump);
786e8c11 8588 if (trie->nextword)
446bd890 8589 PerlMemShared_free(trie->nextword);
a3621e74 8590#ifdef DEBUGGING
5c9f2f80
YO
8591 if (trie->words)
8592 SvREFCNT_dec((SV*)trie->words);
8593 if (trie->revcharmap)
8594 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8595#endif
446bd890
NC
8596 /* do this last!!!! */
8597 PerlMemShared_free(ri->data->data[n]);
a3621e74 8598 }
07be1b83
YO
8599 }
8600 break;
c277df42 8601 default:
f8fc2ecf 8602 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8603 }
8604 }
f8fc2ecf
YO
8605 Safefree(ri->data->what);
8606 Safefree(ri->data);
a0d0e21e
LW
8607 }
8608 Safefree(r->startp);
8609 Safefree(r->endp);
f8fc2ecf
YO
8610 if (ri->swap) {
8611 Safefree(ri->swap->startp);
8612 Safefree(ri->swap->endp);
8613 Safefree(ri->swap);
c74340f9 8614 }
f8fc2ecf 8615 Safefree(ri);
a0d0e21e 8616 Safefree(r);
a687059c 8617}
c277df42 8618
84da74a7
YO
8619#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8620#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8621#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8622#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8623
8624/*
8625 regdupe - duplicate a regexp.
8626
8627 This routine is called by sv.c's re_dup and is expected to clone a
8628 given regexp structure. It is a no-op when not under USE_ITHREADS.
8629 (Originally this *was* re_dup() for change history see sv.c)
8630
8631 See pregfree() above if you change anything here.
8632*/
a3c0e9ca 8633#if defined(USE_ITHREADS)
84da74a7
YO
8634regexp *
8635Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8636{
84da74a7 8637 dVAR;
f8fc2ecf
YO
8638 regexp *ret;
8639 regexp_internal *reti;
84da74a7
YO
8640 int i, len, npar;
8641 struct reg_substr_datum *s;
f8fc2ecf
YO
8642 RXi_GET_DECL(r,ri);
8643
84da74a7
YO
8644 if (!r)
8645 return (REGEXP *)NULL;
8646
8647 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8648 return ret;
8649
f8fc2ecf 8650 len = ri->offsets[0];
84da74a7
YO
8651 npar = r->nparens+1;
8652
f8fc2ecf
YO
8653 Newxz(ret, 1, regexp);
8654 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8655 RXi_SET(ret,reti);
8656 Copy(ri->program, reti->program, len+1, regnode);
84da74a7
YO
8657
8658 Newx(ret->startp, npar, I32);
8659 Copy(r->startp, ret->startp, npar, I32);
8660 Newx(ret->endp, npar, I32);
8661 Copy(r->startp, ret->startp, npar, I32);
f8fc2ecf
YO
8662 if(ri->swap) {
8663 Newx(reti->swap, 1, regexp_paren_ofs);
c74340f9 8664 /* no need to copy these */
f8fc2ecf
YO
8665 Newx(reti->swap->startp, npar, I32);
8666 Newx(reti->swap->endp, npar, I32);
c74340f9 8667 } else {
f8fc2ecf 8668 reti->swap = NULL;
c74340f9 8669 }
84da74a7
YO
8670
8671 Newx(ret->substrs, 1, struct reg_substr_data);
8672 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8673 s->min_offset = r->substrs->data[i].min_offset;
8674 s->max_offset = r->substrs->data[i].max_offset;
8675 s->end_shift = r->substrs->data[i].end_shift;
8676 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8677 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8678 }
8679
f8fc2ecf
YO
8680 reti->regstclass = NULL;
8681 if (ri->data) {
84da74a7 8682 struct reg_data *d;
f8fc2ecf 8683 const int count = ri->data->count;
84da74a7
YO
8684 int i;
8685
8686 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8687 char, struct reg_data);
8688 Newx(d->what, count, U8);
8689
8690 d->count = count;
8691 for (i = 0; i < count; i++) {
f8fc2ecf 8692 d->what[i] = ri->data->what[i];
84da74a7 8693 switch (d->what[i]) {
bbe252da 8694 /* legal options are one of: sSfpontT
84da74a7
YO
8695 see also regcomp.h and pregfree() */
8696 case 's':
81714fb9 8697 case 'S':
0536c0a7 8698 case 'p': /* actually an AV, but the dup function is identical. */
f8fc2ecf 8699 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8700 break;
84da74a7
YO
8701 case 'f':
8702 /* This is cheating. */
8703 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8704 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8705 struct regnode_charclass_class);
f8fc2ecf 8706 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8707 break;
8708 case 'o':
bbe252da
YO
8709 /* Compiled op trees are readonly and in shared memory,
8710 and can thus be shared without duplication. */
84da74a7 8711 OP_REFCNT_LOCK;
f8fc2ecf 8712 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8713 OP_REFCNT_UNLOCK;
8714 break;
23eab42c
NC
8715 case 'T':
8716 /* Trie stclasses are readonly and can thus be shared
8717 * without duplication. We free the stclass in pregfree
8718 * when the corresponding reg_ac_data struct is freed.
8719 */
8720 reti->regstclass= ri->regstclass;
8721 /* Fall through */
84da74a7 8722 case 't':
84da74a7 8723 OP_REFCNT_LOCK;
0536c0a7 8724 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8725 OP_REFCNT_UNLOCK;
0536c0a7
NC
8726 /* Fall through */
8727 case 'n':
8728 d->data[i] = ri->data->data[i];
84da74a7 8729 break;
84da74a7 8730 default:
f8fc2ecf 8731 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8732 }
8733 }
8734
f8fc2ecf 8735 reti->data = d;
84da74a7
YO
8736 }
8737 else
f8fc2ecf 8738 reti->data = NULL;
84da74a7 8739
f8fc2ecf
YO
8740 Newx(reti->offsets, 2*len+1, U32);
8741 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
84da74a7
YO
8742
8743 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8744 ret->refcnt = r->refcnt;
8745 ret->minlen = r->minlen;
de8c5301 8746 ret->minlenret = r->minlenret;
84da74a7
YO
8747 ret->prelen = r->prelen;
8748 ret->nparens = r->nparens;
8749 ret->lastparen = r->lastparen;
8750 ret->lastcloseparen = r->lastcloseparen;
bbe252da
YO
8751 ret->intflags = r->intflags;
8752 ret->extflags = r->extflags;
84da74a7
YO
8753
8754 ret->sublen = r->sublen;
8755
f9f4320a 8756 ret->engine = r->engine;
81714fb9
YO
8757
8758 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8759
84da74a7
YO
8760 if (RX_MATCH_COPIED(ret))
8761 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8762 else
8763 ret->subbeg = NULL;
8764#ifdef PERL_OLD_COPY_ON_WRITE
8765 ret->saved_copy = NULL;
8766#endif
8767
8768 ptr_table_store(PL_ptr_table, r, ret);
8769 return ret;
84da74a7 8770}
a3c0e9ca 8771#endif
84da74a7 8772
de8c5301
YO
8773/*
8774 reg_stringify()
8775
8776 converts a regexp embedded in a MAGIC struct to its stringified form,
8777 caching the converted form in the struct and returns the cached
8778 string.
8779
8780 If lp is nonnull then it is used to return the length of the
8781 resulting string
8782
8783 If flags is nonnull and the returned string contains UTF8 then
8784 (flags & 1) will be true.
8785
8786 If haseval is nonnull then it is used to return whether the pattern
8787 contains evals.
8788
8789 Normally called via macro:
8790
8791 CALLREG_STRINGIFY(mg,0,0);
8792
8793 And internally with
8794
8795 CALLREG_AS_STR(mg,lp,flags,haseval)
8796
8797 See sv_2pv_flags() in sv.c for an example of internal usage.
8798
8799 */
8800
8801char *
8802Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8803 dVAR;
8804 const regexp * const re = (regexp *)mg->mg_obj;
f8fc2ecf
YO
8805 RXi_GET_DECL(re,ri);
8806
de8c5301
YO
8807 if (!mg->mg_ptr) {
8808 const char *fptr = "msix";
8809 char reflags[6];
8810 char ch;
8811 int left = 0;
8812 int right = 4;
8813 bool need_newline = 0;
bbe252da 8814 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8815
8816 while((ch = *fptr++)) {
8817 if(reganch & 1) {
8818 reflags[left++] = ch;
8819 }
8820 else {
8821 reflags[right--] = ch;
8822 }
8823 reganch >>= 1;
8824 }
8825 if(left != 4) {
8826 reflags[left] = '-';
8827 left = 5;
8828 }
8829
8830 mg->mg_len = re->prelen + 4 + left;
8831 /*
8832 * If /x was used, we have to worry about a regex ending with a
8833 * comment later being embedded within another regex. If so, we don't
8834 * want this regex's "commentization" to leak out to the right part of
8835 * the enclosing regex, we must cap it with a newline.
8836 *
8837 * So, if /x was used, we scan backwards from the end of the regex. If
8838 * we find a '#' before we find a newline, we need to add a newline
8839 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8840 * we don't need to add anything. -jfriedl
8841 */
bbe252da 8842 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8843 const char *endptr = re->precomp + re->prelen;
8844 while (endptr >= re->precomp) {
8845 const char c = *(endptr--);
8846 if (c == '\n')
8847 break; /* don't need another */
8848 if (c == '#') {
8849 /* we end while in a comment, so we need a newline */
8850 mg->mg_len++; /* save space for it */
8851 need_newline = 1; /* note to add it */
8852 break;
8853 }
8854 }
8855 }
8856
8857 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8858 mg->mg_ptr[0] = '(';
8859 mg->mg_ptr[1] = '?';
8860 Copy(reflags, mg->mg_ptr+2, left, char);
8861 *(mg->mg_ptr+left+2) = ':';
8862 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8863 if (need_newline)
8864 mg->mg_ptr[mg->mg_len - 2] = '\n';
8865 mg->mg_ptr[mg->mg_len - 1] = ')';
8866 mg->mg_ptr[mg->mg_len] = 0;
8867 }
8868 if (haseval)
f8fc2ecf 8869 *haseval = ri->program[0].next_off;
de8c5301 8870 if (flags)
bbe252da 8871 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8872
8873 if (lp)
8874 *lp = mg->mg_len;
8875 return mg->mg_ptr;
8876}
8877
8878
76234dfb 8879#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8880/*
8881 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8882 */
8883regnode *
864dbfa3 8884Perl_regnext(pTHX_ register regnode *p)
c277df42 8885{
97aff369 8886 dVAR;
c277df42
IZ
8887 register I32 offset;
8888
f8fc2ecf 8889 if (!p)
c277df42
IZ
8890 return(NULL);
8891
8892 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8893 if (offset == 0)
8894 return(NULL);
8895
c277df42 8896 return(p+offset);
c277df42 8897}
76234dfb 8898#endif
c277df42 8899
01f988be 8900STATIC void
cea2e8a9 8901S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8902{
8903 va_list args;
8904 STRLEN l1 = strlen(pat1);
8905 STRLEN l2 = strlen(pat2);
8906 char buf[512];
06bf62c7 8907 SV *msv;
73d840c0 8908 const char *message;
c277df42
IZ
8909
8910 if (l1 > 510)
8911 l1 = 510;
8912 if (l1 + l2 > 510)
8913 l2 = 510 - l1;
8914 Copy(pat1, buf, l1 , char);
8915 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8916 buf[l1 + l2] = '\n';
8917 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8918#ifdef I_STDARG
8919 /* ANSI variant takes additional second argument */
c277df42 8920 va_start(args, pat2);
8736538c
AS
8921#else
8922 va_start(args);
8923#endif
5a844595 8924 msv = vmess(buf, &args);
c277df42 8925 va_end(args);
cfd0369c 8926 message = SvPV_const(msv,l1);
c277df42
IZ
8927 if (l1 > 512)
8928 l1 = 512;
8929 Copy(message, buf, l1 , char);
197cf9b9 8930 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8931 Perl_croak(aTHX_ "%s", buf);
c277df42 8932}
a0ed51b3
LW
8933
8934/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8935
76234dfb 8936#ifndef PERL_IN_XSUB_RE
a0ed51b3 8937void
864dbfa3 8938Perl_save_re_context(pTHX)
b81d288d 8939{
97aff369 8940 dVAR;
1ade1aa1
NC
8941
8942 struct re_save_state *state;
8943
8944 SAVEVPTR(PL_curcop);
8945 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8946
8947 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8948 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8949 SSPUSHINT(SAVEt_RE_STATE);
8950
46ab3289 8951 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8952
a0ed51b3 8953 PL_reg_start_tmp = 0;
a0ed51b3 8954 PL_reg_start_tmpl = 0;
c445ea15 8955 PL_reg_oldsaved = NULL;
a5db57d6 8956 PL_reg_oldsavedlen = 0;
a5db57d6 8957 PL_reg_maxiter = 0;
a5db57d6 8958 PL_reg_leftiter = 0;
c445ea15 8959 PL_reg_poscache = NULL;
a5db57d6 8960 PL_reg_poscache_size = 0;
1ade1aa1
NC
8961#ifdef PERL_OLD_COPY_ON_WRITE
8962 PL_nrs = NULL;
8963#endif
ada6e8a9 8964
c445ea15
AL
8965 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8966 if (PL_curpm) {
8967 const REGEXP * const rx = PM_GETRE(PL_curpm);
8968 if (rx) {
1df70142 8969 U32 i;
ada6e8a9 8970 for (i = 1; i <= rx->nparens; i++) {
1df70142 8971 char digits[TYPE_CHARS(long)];
d9fad198 8972 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8973 GV *const *const gvp
8974 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8975
b37c2d43
AL
8976 if (gvp) {
8977 GV * const gv = *gvp;
8978 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8979 save_scalar(gv);
49f27e4b 8980 }
ada6e8a9
AMS
8981 }
8982 }
8983 }
a0ed51b3 8984}
76234dfb 8985#endif
51371543 8986
51371543 8987static void
acfe0abc 8988clear_re(pTHX_ void *r)
51371543 8989{
97aff369 8990 dVAR;
51371543
GS
8991 ReREFCNT_dec((regexp *)r);
8992}
ffbc6a93 8993
a28509cc
AL
8994#ifdef DEBUGGING
8995
8996STATIC void
8997S_put_byte(pTHX_ SV *sv, int c)
8998{
8999 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9000 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9001 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9002 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9003 else
9004 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9005}
9006
786e8c11 9007
3dab1dad
YO
9008#define CLEAR_OPTSTART \
9009 if (optstart) STMT_START { \
70685ca0 9010 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9011 optstart=NULL; \
9012 } STMT_END
9013
786e8c11 9014#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9015
b5a2f8d8
NC
9016STATIC const regnode *
9017S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9018 const regnode *last, const regnode *plast,
9019 SV* sv, I32 indent, U32 depth)
a28509cc 9020{
97aff369 9021 dVAR;
786e8c11 9022 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9023 register const regnode *next;
3dab1dad 9024 const regnode *optstart= NULL;
f8fc2ecf 9025 RXi_GET_DECL(r,ri);
3dab1dad 9026 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9027
786e8c11
YO
9028#ifdef DEBUG_DUMPUNTIL
9029 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9030 last ? last-start : 0,plast ? plast-start : 0);
9031#endif
9032
9033 if (plast && plast < last)
9034 last= plast;
9035
9036 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9037 /* While that wasn't END last time... */
9038
9039 NODE_ALIGN(node);
9040 op = OP(node);
de734bd5 9041 if (op == CLOSE || op == WHILEM)
786e8c11 9042 indent--;
b5a2f8d8 9043 next = regnext((regnode *)node);
07be1b83 9044
a28509cc 9045 /* Where, what. */
8e11feef 9046 if (OP(node) == OPTIMIZED) {
e68ec53f 9047 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9048 optstart = node;
3dab1dad 9049 else
8e11feef 9050 goto after_print;
3dab1dad
YO
9051 } else
9052 CLEAR_OPTSTART;
07be1b83 9053
32fc9b6a 9054 regprop(r, sv, node);
a28509cc 9055 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9056 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9057
9058 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9059 if (next == NULL) /* Next ptr. */
9060 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9061 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9062 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9063 else
9064 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9065
1de06328 9066 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9067 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9068 }
9069
a28509cc
AL
9070 after_print:
9071 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9072 assert(next);
9073 {
9074 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9075 ? regnext((regnode *)next)
9076 : next);
be8e71aa
YO
9077 if (last && nnode > last)
9078 nnode = last;
786e8c11 9079 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9080 }
a28509cc
AL
9081 }
9082 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9083 assert(next);
786e8c11 9084 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9085 }
9086 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9087 const regnode *this_trie = node;
1de06328 9088 const char op = OP(node);
a28509cc 9089 const I32 n = ARG(node);
1de06328 9090 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9091 (reg_ac_data *)ri->data->data[n] :
1de06328 9092 NULL;
3251b653
NC
9093 const reg_trie_data * const trie =
9094 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
786e8c11 9095 const regnode *nextbranch= NULL;
a28509cc 9096 I32 word_idx;
1de06328 9097 sv_setpvn(sv, "", 0);
786e8c11 9098 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9099 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9100
9101 PerlIO_printf(Perl_debug_log, "%*s%s ",
9102 (int)(2*(indent+3)), "",
9103 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9104 PL_colors[0], PL_colors[1],
9105 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9106 PERL_PV_PRETTY_ELIPSES |
7f69552c 9107 PERL_PV_PRETTY_LTGT
786e8c11
YO
9108 )
9109 : "???"
9110 );
9111 if (trie->jump) {
40d049e4 9112 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9113 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9114 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9115 if (dist) {
9116 if (!nextbranch)
24b23f37 9117 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9118 DUMPUNTIL(this_trie + dist, nextbranch);
9119 }
786e8c11
YO
9120 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9121 nextbranch= regnext((regnode *)nextbranch);
9122 } else {
9123 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9124 }
786e8c11
YO
9125 }
9126 if (last && next > last)
9127 node= last;
9128 else
9129 node= next;
a28509cc 9130 }
786e8c11
YO
9131 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9132 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9133 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9134 }
9135 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9136 assert(next);
786e8c11 9137 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9138 }
9139 else if ( op == PLUS || op == STAR) {
786e8c11 9140 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9141 }
9142 else if (op == ANYOF) {
9143 /* arglen 1 + class block */
9144 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9145 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9146 node = NEXTOPER(node);
9147 }
9148 else if (PL_regkind[(U8)op] == EXACT) {
9149 /* Literal string, where present. */
9150 node += NODE_SZ_STR(node) - 1;
9151 node = NEXTOPER(node);
9152 }
9153 else {
9154 node = NEXTOPER(node);
9155 node += regarglen[(U8)op];
9156 }
9157 if (op == CURLYX || op == OPEN)
786e8c11 9158 indent++;
a28509cc 9159 }
3dab1dad 9160 CLEAR_OPTSTART;
786e8c11 9161#ifdef DEBUG_DUMPUNTIL
70685ca0 9162 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9163#endif
1de06328 9164 return node;
a28509cc
AL
9165}
9166
9167#endif /* DEBUGGING */
9168
241d1a3b
NC
9169/*
9170 * Local variables:
9171 * c-indentation-style: bsd
9172 * c-basic-offset: 4
9173 * indent-tabs-mode: t
9174 * End:
9175 *
37442d52
RGS
9176 * ex: set ts=8 sts=4 sw=4 noet:
9177 */