This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Doc nits to re.pm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
553 PerlIO_printf(Perl_debug_log, \
554 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83
YO
2121#define DEBUG_PEEP(str,scan,depth) \
2122 DEBUG_OPTIMISE_r({ \
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2129 });
2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
76e3520e 2292STATIC I32
40d049e4 2293S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2294 I32 *minlenp, I32 *deltap,
40d049e4
YO
2295 regnode *last,
2296 scan_data_t *data,
2297 I32 stopparen,
2298 U8* recursed,
2299 struct regnode_charclass_class *and_withp,
2300 U32 flags, U32 depth)
c277df42
IZ
2301 /* scanp: Start here (read-write). */
2302 /* deltap: Write maxlen-minlen here. */
2303 /* last: Stop before this one. */
40d049e4
YO
2304 /* data: string data about the pattern */
2305 /* stopparen: treat close N as END */
2306 /* recursed: which subroutines have we recursed into */
2307 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2308{
97aff369 2309 dVAR;
c277df42
IZ
2310 I32 min = 0, pars = 0, code;
2311 regnode *scan = *scanp, *next;
2312 I32 delta = 0;
2313 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2314 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2315 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2316 scan_data_t data_fake;
a3621e74 2317 SV *re_trie_maxbuff = NULL;
786e8c11 2318 regnode *first_non_open = scan;
e2e6a0f1 2319 I32 stopmin = I32_MAX;
a3621e74 2320 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2321#ifdef DEBUGGING
40d049e4 2322 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2323#endif
40d049e4 2324
786e8c11 2325 if ( depth == 0 ) {
40d049e4 2326 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2327 first_non_open=regnext(first_non_open);
2328 }
2329
b81d288d 2330
c277df42
IZ
2331 while (scan && OP(scan) != END && scan < last) {
2332 /* Peephole optimizer: */
1de06328 2333 DEBUG_STUDYDATA(data,depth);
07be1b83 2334 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2335 JOIN_EXACT(scan,&min,0);
a3621e74 2336
653099ff
GS
2337 /* Follow the next-chain of the current node and optimize
2338 away all the NOTHINGs from it. */
c277df42 2339 if (OP(scan) != CURLYX) {
a3b680e6 2340 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2341 ? I32_MAX
2342 /* I32 may be smaller than U16 on CRAYs! */
2343 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2344 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2345 int noff;
2346 regnode *n = scan;
b81d288d 2347
c277df42
IZ
2348 /* Skip NOTHING and LONGJMP. */
2349 while ((n = regnext(n))
3dab1dad 2350 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2351 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2352 && off + noff < max)
2353 off += noff;
2354 if (reg_off_by_arg[OP(scan)])
2355 ARG(scan) = off;
b81d288d 2356 else
c277df42
IZ
2357 NEXT_OFF(scan) = off;
2358 }
a3621e74 2359
07be1b83 2360
3dab1dad 2361
653099ff
GS
2362 /* The principal pseudo-switch. Cannot be a switch, since we
2363 look into several different things. */
b81d288d 2364 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2365 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2366 next = regnext(scan);
2367 code = OP(scan);
a3621e74 2368 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2369
2370 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2371 /* NOTE - There is similar code to this block below for handling
2372 TRIE nodes on a re-study. If you change stuff here check there
2373 too. */
c277df42 2374 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2375 struct regnode_charclass_class accum;
d4c19fe8 2376 regnode * const startbranch=scan;
c277df42 2377
653099ff 2378 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2379 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2380 if (flags & SCF_DO_STCLASS)
830247a4 2381 cl_init_zero(pRExC_state, &accum);
a3621e74 2382
c277df42 2383 while (OP(scan) == code) {
830247a4 2384 I32 deltanext, minnext, f = 0, fake;
653099ff 2385 struct regnode_charclass_class this_class;
c277df42
IZ
2386
2387 num++;
2388 data_fake.flags = 0;
40d049e4 2389 if (data) {
2c2d71f5 2390 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2391 data_fake.last_closep = data->last_closep;
2392 }
2393 else
2394 data_fake.last_closep = &fake;
c277df42
IZ
2395 next = regnext(scan);
2396 scan = NEXTOPER(scan);
2397 if (code != BRANCH)
2398 scan = NEXTOPER(scan);
653099ff 2399 if (flags & SCF_DO_STCLASS) {
830247a4 2400 cl_init(pRExC_state, &this_class);
653099ff
GS
2401 data_fake.start_class = &this_class;
2402 f = SCF_DO_STCLASS_AND;
b81d288d 2403 }
e1901655
IZ
2404 if (flags & SCF_WHILEM_VISITED_POS)
2405 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2406
653099ff 2407 /* we suppose the run is continuous, last=next...*/
1de06328 2408 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2409 next, &data_fake,
2410 stopparen, recursed, NULL, f,depth+1);
b81d288d 2411 if (min1 > minnext)
c277df42
IZ
2412 min1 = minnext;
2413 if (max1 < minnext + deltanext)
2414 max1 = minnext + deltanext;
2415 if (deltanext == I32_MAX)
aca2d497 2416 is_inf = is_inf_internal = 1;
c277df42
IZ
2417 scan = next;
2418 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2419 pars++;
e2e6a0f1
YO
2420 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2421 if ( stopmin > minnext)
2422 stopmin = min + min1;
2423 flags &= ~SCF_DO_SUBSTR;
2424 if (data)
2425 data->flags |= SCF_SEEN_ACCEPT;
2426 }
3dab1dad
YO
2427 if (data) {
2428 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2429 data->flags |= SF_HAS_EVAL;
2c2d71f5 2430 data->whilem_c = data_fake.whilem_c;
3dab1dad 2431 }
653099ff 2432 if (flags & SCF_DO_STCLASS)
830247a4 2433 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2434 if (code == SUSPEND)
c277df42
IZ
2435 break;
2436 }
2437 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2438 min1 = 0;
2439 if (flags & SCF_DO_SUBSTR) {
2440 data->pos_min += min1;
2441 data->pos_delta += max1 - min1;
2442 if (max1 != min1 || is_inf)
2443 data->longest = &(data->longest_float);
2444 }
2445 min += min1;
2446 delta += max1 - min1;
653099ff 2447 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2448 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2449 if (min1) {
40d049e4 2450 cl_and(data->start_class, and_withp);
653099ff
GS
2451 flags &= ~SCF_DO_STCLASS;
2452 }
2453 }
2454 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2455 if (min1) {
2456 cl_and(data->start_class, &accum);
653099ff 2457 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2458 }
2459 else {
b81d288d 2460 /* Switch to OR mode: cache the old value of
de0c8cb8 2461 * data->start_class */
40d049e4
YO
2462 INIT_AND_WITHP;
2463 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2464 struct regnode_charclass_class);
2465 flags &= ~SCF_DO_STCLASS_AND;
2466 StructCopy(&accum, data->start_class,
2467 struct regnode_charclass_class);
2468 flags |= SCF_DO_STCLASS_OR;
2469 data->start_class->flags |= ANYOF_EOS;
2470 }
653099ff 2471 }
a3621e74 2472
786e8c11 2473 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2474 /* demq.
2475
2476 Assuming this was/is a branch we are dealing with: 'scan' now
2477 points at the item that follows the branch sequence, whatever
2478 it is. We now start at the beginning of the sequence and look
2479 for subsequences of
2480
786e8c11
YO
2481 BRANCH->EXACT=>x1
2482 BRANCH->EXACT=>x2
2483 tail
a3621e74
YO
2484
2485 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2486
2487 If we can find such a subseqence we need to turn the first
2488 element into a trie and then add the subsequent branch exact
2489 strings to the trie.
2490
2491 We have two cases
2492
786e8c11 2493 1. patterns where the whole set of branch can be converted.
a3621e74 2494
786e8c11 2495 2. patterns where only a subset can be converted.
a3621e74
YO
2496
2497 In case 1 we can replace the whole set with a single regop
2498 for the trie. In case 2 we need to keep the start and end
2499 branchs so
2500
2501 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2502 becomes BRANCH TRIE; BRANCH X;
2503
786e8c11
YO
2504 There is an additional case, that being where there is a
2505 common prefix, which gets split out into an EXACT like node
2506 preceding the TRIE node.
2507
2508 If x(1..n)==tail then we can do a simple trie, if not we make
2509 a "jump" trie, such that when we match the appropriate word
2510 we "jump" to the appopriate tail node. Essentailly we turn
2511 a nested if into a case structure of sorts.
a3621e74
YO
2512
2513 */
786e8c11 2514
3dab1dad 2515 int made=0;
0111c4fd
RGS
2516 if (!re_trie_maxbuff) {
2517 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2518 if (!SvIOK(re_trie_maxbuff))
2519 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2520 }
786e8c11 2521 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2522 regnode *cur;
2523 regnode *first = (regnode *)NULL;
2524 regnode *last = (regnode *)NULL;
2525 regnode *tail = scan;
2526 U8 optype = 0;
2527 U32 count=0;
2528
2529#ifdef DEBUGGING
c445ea15 2530 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2531#endif
2532 /* var tail is used because there may be a TAIL
2533 regop in the way. Ie, the exacts will point to the
2534 thing following the TAIL, but the last branch will
2535 point at the TAIL. So we advance tail. If we
2536 have nested (?:) we may have to move through several
2537 tails.
2538 */
2539
2540 while ( OP( tail ) == TAIL ) {
2541 /* this is the TAIL generated by (?:) */
2542 tail = regnext( tail );
2543 }
2544
3dab1dad 2545
a3621e74 2546 DEBUG_OPTIMISE_r({
32fc9b6a 2547 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2548 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2549 (int)depth * 2 + 2, "",
2550 "Looking for TRIE'able sequences. Tail node is: ",
2551 SvPV_nolen_const( mysv )
a3621e74
YO
2552 );
2553 });
3dab1dad 2554
a3621e74
YO
2555 /*
2556
2557 step through the branches, cur represents each
2558 branch, noper is the first thing to be matched
2559 as part of that branch and noper_next is the
2560 regnext() of that node. if noper is an EXACT
2561 and noper_next is the same as scan (our current
2562 position in the regex) then the EXACT branch is
2563 a possible optimization target. Once we have
2564 two or more consequetive such branches we can
2565 create a trie of the EXACT's contents and stich
2566 it in place. If the sequence represents all of
2567 the branches we eliminate the whole thing and
2568 replace it with a single TRIE. If it is a
2569 subsequence then we need to stitch it in. This
2570 means the first branch has to remain, and needs
2571 to be repointed at the item on the branch chain
2572 following the last branch optimized. This could
2573 be either a BRANCH, in which case the
2574 subsequence is internal, or it could be the
2575 item following the branch sequence in which
2576 case the subsequence is at the end.
2577
2578 */
2579
2580 /* dont use tail as the end marker for this traverse */
2581 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2582 regnode * const noper = NEXTOPER( cur );
be981c67 2583#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2584 regnode * const noper_next = regnext( noper );
be981c67 2585#endif
a3621e74 2586
a3621e74 2587 DEBUG_OPTIMISE_r({
32fc9b6a 2588 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2589 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2590 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2591
32fc9b6a 2592 regprop(RExC_rx, mysv, noper);
a3621e74 2593 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2594 SvPV_nolen_const(mysv));
a3621e74
YO
2595
2596 if ( noper_next ) {
32fc9b6a 2597 regprop(RExC_rx, mysv, noper_next );
a3621e74 2598 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2599 SvPV_nolen_const(mysv));
a3621e74 2600 }
3dab1dad
YO
2601 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2602 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2603 });
3dab1dad
YO
2604 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2605 : PL_regkind[ OP( noper ) ] == EXACT )
2606 || OP(noper) == NOTHING )
786e8c11
YO
2607#ifdef NOJUMPTRIE
2608 && noper_next == tail
2609#endif
2610 && count < U16_MAX)
a3621e74
YO
2611 {
2612 count++;
3dab1dad
YO
2613 if ( !first || optype == NOTHING ) {
2614 if (!first) first = cur;
a3621e74
YO
2615 optype = OP( noper );
2616 } else {
a3621e74 2617 last = cur;
a3621e74
YO
2618 }
2619 } else {
2620 if ( last ) {
786e8c11
YO
2621 make_trie( pRExC_state,
2622 startbranch, first, cur, tail, count,
2623 optype, depth+1 );
a3621e74 2624 }
3dab1dad 2625 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2626#ifdef NOJUMPTRIE
2627 && noper_next == tail
2628#endif
2629 ){
a3621e74
YO
2630 count = 1;
2631 first = cur;
2632 optype = OP( noper );
2633 } else {
2634 count = 0;
2635 first = NULL;
2636 optype = 0;
2637 }
2638 last = NULL;
2639 }
2640 }
2641 DEBUG_OPTIMISE_r({
32fc9b6a 2642 regprop(RExC_rx, mysv, cur);
a3621e74 2643 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2644 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2645 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2646
2647 });
2648 if ( last ) {
786e8c11 2649 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2650#ifdef TRIE_STUDY_OPT
786e8c11
YO
2651 if ( ((made == MADE_EXACT_TRIE &&
2652 startbranch == first)
2653 || ( first_non_open == first )) &&
5d458dd8 2654 depth==0 ) {
786e8c11 2655 flags |= SCF_TRIE_RESTUDY;
5d458dd8
YO
2656 if ( startbranch == first
2657 && scan == tail )
2658 {
2659 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2660 }
2661 }
3dab1dad 2662#endif
07be1b83 2663 }
a3621e74 2664 }
3dab1dad
YO
2665
2666 } /* do trie */
786e8c11 2667
a0ed51b3 2668 }
a3621e74 2669 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2670 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2671 } else /* single branch is optimized. */
c277df42
IZ
2672 scan = NEXTOPER(scan);
2673 continue;
a0ed51b3
LW
2674 }
2675 else if (OP(scan) == EXACT) {
cd439c50 2676 I32 l = STR_LEN(scan);
c445ea15 2677 UV uc;
a0ed51b3 2678 if (UTF) {
a3b680e6 2679 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2680 l = utf8_length(s, s + l);
9041c2e3 2681 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2682 } else {
2683 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2684 }
2685 min += l;
c277df42 2686 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2687 /* The code below prefers earlier match for fixed
2688 offset, later match for variable offset. */
2689 if (data->last_end == -1) { /* Update the start info. */
2690 data->last_start_min = data->pos_min;
2691 data->last_start_max = is_inf
b81d288d 2692 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2693 }
cd439c50 2694 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2695 if (UTF)
2696 SvUTF8_on(data->last_found);
0eda9292 2697 {
9a957fbc 2698 SV * const sv = data->last_found;
a28509cc 2699 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2700 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2701 if (mg && mg->mg_len >= 0)
5e43f467
JH
2702 mg->mg_len += utf8_length((U8*)STRING(scan),
2703 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2704 }
c277df42
IZ
2705 data->last_end = data->pos_min + l;
2706 data->pos_min += l; /* As in the first entry. */
2707 data->flags &= ~SF_BEFORE_EOL;
2708 }
653099ff
GS
2709 if (flags & SCF_DO_STCLASS_AND) {
2710 /* Check whether it is compatible with what we know already! */
2711 int compat = 1;
2712
1aa99e6b 2713 if (uc >= 0x100 ||
516a5887 2714 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2715 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2716 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2717 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2718 )
653099ff
GS
2719 compat = 0;
2720 ANYOF_CLASS_ZERO(data->start_class);
2721 ANYOF_BITMAP_ZERO(data->start_class);
2722 if (compat)
1aa99e6b 2723 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2724 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2725 if (uc < 0x100)
2726 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2727 }
2728 else if (flags & SCF_DO_STCLASS_OR) {
2729 /* false positive possible if the class is case-folded */
1aa99e6b 2730 if (uc < 0x100)
9b877dbb
IH
2731 ANYOF_BITMAP_SET(data->start_class, uc);
2732 else
2733 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2734 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2735 cl_and(data->start_class, and_withp);
653099ff
GS
2736 }
2737 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2738 }
3dab1dad 2739 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2740 I32 l = STR_LEN(scan);
1aa99e6b 2741 UV uc = *((U8*)STRING(scan));
653099ff
GS
2742
2743 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2744 if (flags & SCF_DO_SUBSTR) {
2745 assert(data);
1de06328 2746 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2747 }
a0ed51b3 2748 if (UTF) {
6136c704 2749 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2750 l = utf8_length(s, s + l);
9041c2e3 2751 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2752 }
2753 min += l;
ecaa9b9c 2754 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2755 data->pos_min += l;
653099ff
GS
2756 if (flags & SCF_DO_STCLASS_AND) {
2757 /* Check whether it is compatible with what we know already! */
2758 int compat = 1;
2759
1aa99e6b 2760 if (uc >= 0x100 ||
516a5887 2761 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2762 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2763 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2764 compat = 0;
2765 ANYOF_CLASS_ZERO(data->start_class);
2766 ANYOF_BITMAP_ZERO(data->start_class);
2767 if (compat) {
1aa99e6b 2768 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2769 data->start_class->flags &= ~ANYOF_EOS;
2770 data->start_class->flags |= ANYOF_FOLD;
2771 if (OP(scan) == EXACTFL)
2772 data->start_class->flags |= ANYOF_LOCALE;
2773 }
2774 }
2775 else if (flags & SCF_DO_STCLASS_OR) {
2776 if (data->start_class->flags & ANYOF_FOLD) {
2777 /* false positive possible if the class is case-folded.
2778 Assume that the locale settings are the same... */
1aa99e6b
IH
2779 if (uc < 0x100)
2780 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2781 data->start_class->flags &= ~ANYOF_EOS;
2782 }
40d049e4 2783 cl_and(data->start_class, and_withp);
653099ff
GS
2784 }
2785 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2786 }
bfed75c6 2787 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2788 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2789 I32 f = flags, pos_before = 0;
d4c19fe8 2790 regnode * const oscan = scan;
653099ff
GS
2791 struct regnode_charclass_class this_class;
2792 struct regnode_charclass_class *oclass = NULL;
727f22e3 2793 I32 next_is_eval = 0;
653099ff 2794
3dab1dad 2795 switch (PL_regkind[OP(scan)]) {
653099ff 2796 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2797 scan = NEXTOPER(scan);
2798 goto finish;
2799 case PLUS:
653099ff 2800 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2801 next = NEXTOPER(scan);
653099ff 2802 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2803 mincount = 1;
2804 maxcount = REG_INFTY;
c277df42
IZ
2805 next = regnext(scan);
2806 scan = NEXTOPER(scan);
2807 goto do_curly;
2808 }
2809 }
2810 if (flags & SCF_DO_SUBSTR)
2811 data->pos_min++;
2812 min++;
2813 /* Fall through. */
2814 case STAR:
653099ff
GS
2815 if (flags & SCF_DO_STCLASS) {
2816 mincount = 0;
b81d288d 2817 maxcount = REG_INFTY;
653099ff
GS
2818 next = regnext(scan);
2819 scan = NEXTOPER(scan);
2820 goto do_curly;
2821 }
b81d288d 2822 is_inf = is_inf_internal = 1;
c277df42
IZ
2823 scan = regnext(scan);
2824 if (flags & SCF_DO_SUBSTR) {
1de06328 2825 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2826 data->longest = &(data->longest_float);
2827 }
2828 goto optimize_curly_tail;
2829 case CURLY:
40d049e4
YO
2830 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2831 && (scan->flags == stopparen))
2832 {
2833 mincount = 1;
2834 maxcount = 1;
2835 } else {
2836 mincount = ARG1(scan);
2837 maxcount = ARG2(scan);
2838 }
c277df42 2839 next = regnext(scan);
cb434fcc
IZ
2840 if (OP(scan) == CURLYX) {
2841 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2842 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2843 }
c277df42 2844 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2845 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2846 do_curly:
2847 if (flags & SCF_DO_SUBSTR) {
1de06328 2848 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2849 pos_before = data->pos_min;
2850 }
2851 if (data) {
2852 fl = data->flags;
2853 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2854 if (is_inf)
2855 data->flags |= SF_IS_INF;
2856 }
653099ff 2857 if (flags & SCF_DO_STCLASS) {
830247a4 2858 cl_init(pRExC_state, &this_class);
653099ff
GS
2859 oclass = data->start_class;
2860 data->start_class = &this_class;
2861 f |= SCF_DO_STCLASS_AND;
2862 f &= ~SCF_DO_STCLASS_OR;
2863 }
e1901655
IZ
2864 /* These are the cases when once a subexpression
2865 fails at a particular position, it cannot succeed
2866 even after backtracking at the enclosing scope.
b81d288d 2867
e1901655
IZ
2868 XXXX what if minimal match and we are at the
2869 initial run of {n,m}? */
2870 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2871 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2872
c277df42 2873 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2874 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2875 last, data, stopparen, recursed, NULL,
a3621e74
YO
2876 (mincount == 0
2877 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2878
2879 if (flags & SCF_DO_STCLASS)
2880 data->start_class = oclass;
2881 if (mincount == 0 || minnext == 0) {
2882 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2883 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2884 }
2885 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2886 /* Switch to OR mode: cache the old value of
653099ff 2887 * data->start_class */
40d049e4
YO
2888 INIT_AND_WITHP;
2889 StructCopy(data->start_class, and_withp,
653099ff
GS
2890 struct regnode_charclass_class);
2891 flags &= ~SCF_DO_STCLASS_AND;
2892 StructCopy(&this_class, data->start_class,
2893 struct regnode_charclass_class);
2894 flags |= SCF_DO_STCLASS_OR;
2895 data->start_class->flags |= ANYOF_EOS;
2896 }
2897 } else { /* Non-zero len */
2898 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2899 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2900 cl_and(data->start_class, and_withp);
653099ff
GS
2901 }
2902 else if (flags & SCF_DO_STCLASS_AND)
2903 cl_and(data->start_class, &this_class);
2904 flags &= ~SCF_DO_STCLASS;
2905 }
c277df42
IZ
2906 if (!scan) /* It was not CURLYX, but CURLY. */
2907 scan = next;
041457d9
DM
2908 if ( /* ? quantifier ok, except for (?{ ... }) */
2909 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2910 && (minnext == 0) && (deltanext == 0)
99799961 2911 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2912 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2913 && ckWARN(WARN_REGEXP))
b45f050a 2914 {
830247a4 2915 vWARN(RExC_parse,
b45f050a
JF
2916 "Quantifier unexpected on zero-length expression");
2917 }
2918
c277df42 2919 min += minnext * mincount;
b81d288d 2920 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2921 && (minnext + deltanext) > 0)
2922 || deltanext == I32_MAX);
aca2d497 2923 is_inf |= is_inf_internal;
c277df42
IZ
2924 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2925
2926 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2927 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2928 && data->flags & SF_IN_PAR
2929 && !(data->flags & SF_HAS_EVAL)
2930 && !deltanext && minnext == 1 ) {
2931 /* Try to optimize to CURLYN. */
2932 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2933 regnode * const nxt1 = nxt;
497b47a8
JH
2934#ifdef DEBUGGING
2935 regnode *nxt2;
2936#endif
c277df42
IZ
2937
2938 /* Skip open. */
2939 nxt = regnext(nxt);
bfed75c6 2940 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2941 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2942 && STR_LEN(nxt) == 1))
c277df42 2943 goto nogo;
497b47a8 2944#ifdef DEBUGGING
c277df42 2945 nxt2 = nxt;
497b47a8 2946#endif
c277df42 2947 nxt = regnext(nxt);
b81d288d 2948 if (OP(nxt) != CLOSE)
c277df42 2949 goto nogo;
40d049e4
YO
2950 if (RExC_open_parens) {
2951 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2952 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2953 }
c277df42 2954 /* Now we know that nxt2 is the only contents: */
eb160463 2955 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2956 OP(oscan) = CURLYN;
2957 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2958
c277df42
IZ
2959#ifdef DEBUGGING
2960 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2961 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2962 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2963 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2964 OP(nxt + 1) = OPTIMIZED; /* was count. */
2965 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2966#endif
c277df42 2967 }
c277df42
IZ
2968 nogo:
2969
2970 /* Try optimization CURLYX => CURLYM. */
b81d288d 2971 if ( OP(oscan) == CURLYX && data
c277df42 2972 && !(data->flags & SF_HAS_PAR)
c277df42 2973 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2974 && !deltanext /* atom is fixed width */
2975 && minnext != 0 /* CURLYM can't handle zero width */
2976 ) {
c277df42
IZ
2977 /* XXXX How to optimize if data == 0? */
2978 /* Optimize to a simpler form. */
2979 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2980 regnode *nxt2;
2981
2982 OP(oscan) = CURLYM;
2983 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2984 && (OP(nxt2) != WHILEM))
c277df42
IZ
2985 nxt = nxt2;
2986 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2987 /* Need to optimize away parenths. */
2988 if (data->flags & SF_IN_PAR) {
2989 /* Set the parenth number. */
2990 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2991
b81d288d 2992 if (OP(nxt) != CLOSE)
b45f050a 2993 FAIL("Panic opt close");
eb160463 2994 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2995 if (RExC_open_parens) {
2996 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2997 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2998 }
c277df42
IZ
2999 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3000 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3001
c277df42
IZ
3002#ifdef DEBUGGING
3003 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3004 OP(nxt + 1) = OPTIMIZED; /* was count. */
3005 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3006 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3007#endif
c277df42
IZ
3008#if 0
3009 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3010 regnode *nnxt = regnext(nxt1);
b81d288d 3011
c277df42
IZ
3012 if (nnxt == nxt) {
3013 if (reg_off_by_arg[OP(nxt1)])
3014 ARG_SET(nxt1, nxt2 - nxt1);
3015 else if (nxt2 - nxt1 < U16_MAX)
3016 NEXT_OFF(nxt1) = nxt2 - nxt1;
3017 else
3018 OP(nxt) = NOTHING; /* Cannot beautify */
3019 }
3020 nxt1 = nnxt;
3021 }
3022#endif
3023 /* Optimize again: */
1de06328 3024 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3025 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3026 }
3027 else
c277df42 3028 oscan->flags = 0;
c277df42 3029 }
e1901655
IZ
3030 else if ((OP(oscan) == CURLYX)
3031 && (flags & SCF_WHILEM_VISITED_POS)
3032 /* See the comment on a similar expression above.
3033 However, this time it not a subexpression
3034 we care about, but the expression itself. */
3035 && (maxcount == REG_INFTY)
3036 && data && ++data->whilem_c < 16) {
3037 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3038 /* Find WHILEM (as in regexec.c) */
3039 regnode *nxt = oscan + NEXT_OFF(oscan);
3040
3041 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3042 nxt += ARG(nxt);
eb160463
GS
3043 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3044 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3045 }
b81d288d 3046 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3047 pars++;
3048 if (flags & SCF_DO_SUBSTR) {
c445ea15 3049 SV *last_str = NULL;
c277df42
IZ
3050 int counted = mincount != 0;
3051
3052 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3053#if defined(SPARC64_GCC_WORKAROUND)
3054 I32 b = 0;
3055 STRLEN l = 0;
cfd0369c 3056 const char *s = NULL;
5d1c421c
JH
3057 I32 old = 0;
3058
3059 if (pos_before >= data->last_start_min)
3060 b = pos_before;
3061 else
3062 b = data->last_start_min;
3063
3064 l = 0;
cfd0369c 3065 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3066 old = b - data->last_start_min;
3067
3068#else
b81d288d 3069 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3070 ? pos_before : data->last_start_min;
3071 STRLEN l;
d4c19fe8 3072 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3073 I32 old = b - data->last_start_min;
5d1c421c 3074#endif
a0ed51b3
LW
3075
3076 if (UTF)
3077 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3078
a0ed51b3 3079 l -= old;
c277df42 3080 /* Get the added string: */
79cb57f6 3081 last_str = newSVpvn(s + old, l);
0e933229
IH
3082 if (UTF)
3083 SvUTF8_on(last_str);
c277df42
IZ
3084 if (deltanext == 0 && pos_before == b) {
3085 /* What was added is a constant string */
3086 if (mincount > 1) {
3087 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3088 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3089 SvPVX_const(last_str), l, mincount - 1);
b162af07 3090 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3091 /* Add additional parts. */
b81d288d 3092 SvCUR_set(data->last_found,
c277df42
IZ
3093 SvCUR(data->last_found) - l);
3094 sv_catsv(data->last_found, last_str);
0eda9292
JH
3095 {
3096 SV * sv = data->last_found;
3097 MAGIC *mg =
3098 SvUTF8(sv) && SvMAGICAL(sv) ?
3099 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3100 if (mg && mg->mg_len >= 0)
3101 mg->mg_len += CHR_SVLEN(last_str);
3102 }
c277df42
IZ
3103 data->last_end += l * (mincount - 1);
3104 }
2a8d9689
HS
3105 } else {
3106 /* start offset must point into the last copy */
3107 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3108 data->last_start_max += is_inf ? I32_MAX
3109 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3110 }
3111 }
3112 /* It is counted once already... */
3113 data->pos_min += minnext * (mincount - counted);
3114 data->pos_delta += - counted * deltanext +
3115 (minnext + deltanext) * maxcount - minnext * mincount;
3116 if (mincount != maxcount) {
653099ff
GS
3117 /* Cannot extend fixed substrings found inside
3118 the group. */
1de06328 3119 scan_commit(pRExC_state,data,minlenp);
c277df42 3120 if (mincount && last_str) {
d4c19fe8
AL
3121 SV * const sv = data->last_found;
3122 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3123 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3124
3125 if (mg)
3126 mg->mg_len = -1;
3127 sv_setsv(sv, last_str);
c277df42 3128 data->last_end = data->pos_min;
b81d288d 3129 data->last_start_min =
a0ed51b3 3130 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3131 data->last_start_max = is_inf
3132 ? I32_MAX
c277df42 3133 : data->pos_min + data->pos_delta
a0ed51b3 3134 - CHR_SVLEN(last_str);
c277df42
IZ
3135 }
3136 data->longest = &(data->longest_float);
3137 }
aca2d497 3138 SvREFCNT_dec(last_str);
c277df42 3139 }
405ff068 3140 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3141 data->flags |= SF_HAS_EVAL;
3142 optimize_curly_tail:
c277df42 3143 if (OP(oscan) != CURLYX) {
3dab1dad 3144 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3145 && NEXT_OFF(next))
3146 NEXT_OFF(oscan) += NEXT_OFF(next);
3147 }
c277df42 3148 continue;
653099ff 3149 default: /* REF and CLUMP only? */
c277df42 3150 if (flags & SCF_DO_SUBSTR) {
1de06328 3151 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3152 data->longest = &(data->longest_float);
3153 }
aca2d497 3154 is_inf = is_inf_internal = 1;
653099ff 3155 if (flags & SCF_DO_STCLASS_OR)
830247a4 3156 cl_anything(pRExC_state, data->start_class);
653099ff 3157 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3158 break;
3159 }
a0ed51b3 3160 }
bfed75c6 3161 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3162 int value = 0;
653099ff 3163
c277df42 3164 if (flags & SCF_DO_SUBSTR) {
1de06328 3165 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3166 data->pos_min++;
3167 }
3168 min++;
653099ff
GS
3169 if (flags & SCF_DO_STCLASS) {
3170 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3171
3172 /* Some of the logic below assumes that switching
3173 locale on will only add false positives. */
3dab1dad 3174 switch (PL_regkind[OP(scan)]) {
653099ff 3175 case SANY:
653099ff
GS
3176 default:
3177 do_default:
3178 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3179 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3180 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3181 break;
3182 case REG_ANY:
3183 if (OP(scan) == SANY)
3184 goto do_default;
3185 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3186 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3187 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3188 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3189 }
3190 if (flags & SCF_DO_STCLASS_AND || !value)
3191 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3192 break;
3193 case ANYOF:
3194 if (flags & SCF_DO_STCLASS_AND)
3195 cl_and(data->start_class,
3196 (struct regnode_charclass_class*)scan);
3197 else
830247a4 3198 cl_or(pRExC_state, data->start_class,
653099ff
GS
3199 (struct regnode_charclass_class*)scan);
3200 break;
3201 case ALNUM:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 for (value = 0; value < 256; value++)
3206 if (!isALNUM(value))
3207 ANYOF_BITMAP_CLEAR(data->start_class, value);
3208 }
3209 }
3210 else {
3211 if (data->start_class->flags & ANYOF_LOCALE)
3212 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3213 else {
3214 for (value = 0; value < 256; value++)
3215 if (isALNUM(value))
b81d288d 3216 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3217 }
3218 }
3219 break;
3220 case ALNUML:
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 if (data->start_class->flags & ANYOF_LOCALE)
3223 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3224 }
3225 else {
3226 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3227 data->start_class->flags |= ANYOF_LOCALE;
3228 }
3229 break;
3230 case NALNUM:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 for (value = 0; value < 256; value++)
3235 if (isALNUM(value))
3236 ANYOF_BITMAP_CLEAR(data->start_class, value);
3237 }
3238 }
3239 else {
3240 if (data->start_class->flags & ANYOF_LOCALE)
3241 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3242 else {
3243 for (value = 0; value < 256; value++)
3244 if (!isALNUM(value))
b81d288d 3245 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3246 }
3247 }
3248 break;
3249 case NALNUML:
3250 if (flags & SCF_DO_STCLASS_AND) {
3251 if (data->start_class->flags & ANYOF_LOCALE)
3252 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3253 }
3254 else {
3255 data->start_class->flags |= ANYOF_LOCALE;
3256 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3257 }
3258 break;
3259 case SPACE:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 for (value = 0; value < 256; value++)
3264 if (!isSPACE(value))
3265 ANYOF_BITMAP_CLEAR(data->start_class, value);
3266 }
3267 }
3268 else {
3269 if (data->start_class->flags & ANYOF_LOCALE)
3270 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3271 else {
3272 for (value = 0; value < 256; value++)
3273 if (isSPACE(value))
b81d288d 3274 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3275 }
3276 }
3277 break;
3278 case SPACEL:
3279 if (flags & SCF_DO_STCLASS_AND) {
3280 if (data->start_class->flags & ANYOF_LOCALE)
3281 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3282 }
3283 else {
3284 data->start_class->flags |= ANYOF_LOCALE;
3285 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3286 }
3287 break;
3288 case NSPACE:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 if (data->start_class->flags & ANYOF_LOCALE)
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 else {
3301 for (value = 0; value < 256; value++)
3302 if (!isSPACE(value))
b81d288d 3303 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3304 }
3305 }
3306 break;
3307 case NSPACEL:
3308 if (flags & SCF_DO_STCLASS_AND) {
3309 if (data->start_class->flags & ANYOF_LOCALE) {
3310 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3311 for (value = 0; value < 256; value++)
3312 if (!isSPACE(value))
3313 ANYOF_BITMAP_CLEAR(data->start_class, value);
3314 }
3315 }
3316 else {
3317 data->start_class->flags |= ANYOF_LOCALE;
3318 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3319 }
3320 break;
3321 case DIGIT:
3322 if (flags & SCF_DO_STCLASS_AND) {
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3324 for (value = 0; value < 256; value++)
3325 if (!isDIGIT(value))
3326 ANYOF_BITMAP_CLEAR(data->start_class, value);
3327 }
3328 else {
3329 if (data->start_class->flags & ANYOF_LOCALE)
3330 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3331 else {
3332 for (value = 0; value < 256; value++)
3333 if (isDIGIT(value))
b81d288d 3334 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3335 }
3336 }
3337 break;
3338 case NDIGIT:
3339 if (flags & SCF_DO_STCLASS_AND) {
3340 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3341 for (value = 0; value < 256; value++)
3342 if (isDIGIT(value))
3343 ANYOF_BITMAP_CLEAR(data->start_class, value);
3344 }
3345 else {
3346 if (data->start_class->flags & ANYOF_LOCALE)
3347 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3348 else {
3349 for (value = 0; value < 256; value++)
3350 if (!isDIGIT(value))
b81d288d 3351 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3352 }
3353 }
3354 break;
3355 }
3356 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3357 cl_and(data->start_class, and_withp);
653099ff
GS
3358 flags &= ~SCF_DO_STCLASS;
3359 }
a0ed51b3 3360 }
3dab1dad 3361 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3362 data->flags |= (OP(scan) == MEOL
3363 ? SF_BEFORE_MEOL
3364 : SF_BEFORE_SEOL);
a0ed51b3 3365 }
3dab1dad 3366 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3367 /* Lookbehind, or need to calculate parens/evals/stclass: */
3368 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3369 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3370 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3371 || OP(scan) == UNLESSM )
3372 {
3373 /* Negative Lookahead/lookbehind
3374 In this case we can't do fixed string optimisation.
3375 */
3376
3377 I32 deltanext, minnext, fake = 0;
3378 regnode *nscan;
3379 struct regnode_charclass_class intrnl;
3380 int f = 0;
3381
3382 data_fake.flags = 0;
3383 if (data) {
3384 data_fake.whilem_c = data->whilem_c;
3385 data_fake.last_closep = data->last_closep;
a0ed51b3 3386 }
1de06328
YO
3387 else
3388 data_fake.last_closep = &fake;
3389 if ( flags & SCF_DO_STCLASS && !scan->flags
3390 && OP(scan) == IFMATCH ) { /* Lookahead */
3391 cl_init(pRExC_state, &intrnl);
3392 data_fake.start_class = &intrnl;
3393 f |= SCF_DO_STCLASS_AND;
c277df42 3394 }
1de06328
YO
3395 if (flags & SCF_WHILEM_VISITED_POS)
3396 f |= SCF_WHILEM_VISITED_POS;
3397 next = regnext(scan);
3398 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3399 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3400 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3401 if (scan->flags) {
3402 if (deltanext) {
3403 vFAIL("Variable length lookbehind not implemented");
3404 }
3405 else if (minnext > (I32)U8_MAX) {
3406 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3407 }
3408 scan->flags = (U8)minnext;
3409 }
3410 if (data) {
3411 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3412 pars++;
3413 if (data_fake.flags & SF_HAS_EVAL)
3414 data->flags |= SF_HAS_EVAL;
3415 data->whilem_c = data_fake.whilem_c;
3416 }
3417 if (f & SCF_DO_STCLASS_AND) {
3418 const int was = (data->start_class->flags & ANYOF_EOS);
3419
3420 cl_and(data->start_class, &intrnl);
3421 if (was)
3422 data->start_class->flags |= ANYOF_EOS;
3423 }
be8e71aa 3424 }
1de06328
YO
3425#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3426 else {
3427 /* Positive Lookahead/lookbehind
3428 In this case we can do fixed string optimisation,
3429 but we must be careful about it. Note in the case of
3430 lookbehind the positions will be offset by the minimum
3431 length of the pattern, something we won't know about
3432 until after the recurse.
3433 */
3434 I32 deltanext, fake = 0;
3435 regnode *nscan;
3436 struct regnode_charclass_class intrnl;
3437 int f = 0;
3438 /* We use SAVEFREEPV so that when the full compile
3439 is finished perl will clean up the allocated
3440 minlens when its all done. This was we don't
3441 have to worry about freeing them when we know
3442 they wont be used, which would be a pain.
3443 */
3444 I32 *minnextp;
3445 Newx( minnextp, 1, I32 );
3446 SAVEFREEPV(minnextp);
3447
3448 if (data) {
3449 StructCopy(data, &data_fake, scan_data_t);
3450 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3451 f |= SCF_DO_SUBSTR;
3452 if (scan->flags)
3453 scan_commit(pRExC_state, &data_fake,minlenp);
3454 data_fake.last_found=newSVsv(data->last_found);
3455 }
3456 }
3457 else
3458 data_fake.last_closep = &fake;
3459 data_fake.flags = 0;
3460 if (is_inf)
3461 data_fake.flags |= SF_IS_INF;
3462 if ( flags & SCF_DO_STCLASS && !scan->flags
3463 && OP(scan) == IFMATCH ) { /* Lookahead */
3464 cl_init(pRExC_state, &intrnl);
3465 data_fake.start_class = &intrnl;
3466 f |= SCF_DO_STCLASS_AND;
3467 }
3468 if (flags & SCF_WHILEM_VISITED_POS)
3469 f |= SCF_WHILEM_VISITED_POS;
3470 next = regnext(scan);
3471 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3472
3473 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3474 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3475 if (scan->flags) {
3476 if (deltanext) {
3477 vFAIL("Variable length lookbehind not implemented");
3478 }
3479 else if (*minnextp > (I32)U8_MAX) {
3480 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3481 }
3482 scan->flags = (U8)*minnextp;
3483 }
40d049e4 3484
1de06328 3485 *minnextp += min;
40d049e4 3486
1de06328
YO
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3489
3490 cl_and(data->start_class, &intrnl);
3491 if (was)
3492 data->start_class->flags |= ANYOF_EOS;
40d049e4 3493 }
1de06328
YO
3494 if (data) {
3495 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3496 pars++;
3497 if (data_fake.flags & SF_HAS_EVAL)
3498 data->flags |= SF_HAS_EVAL;
3499 data->whilem_c = data_fake.whilem_c;
3500 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3501 if (RExC_rx->minlen<*minnextp)
3502 RExC_rx->minlen=*minnextp;
3503 scan_commit(pRExC_state, &data_fake, minnextp);
3504 SvREFCNT_dec(data_fake.last_found);
3505
3506 if ( data_fake.minlen_fixed != minlenp )
3507 {
3508 data->offset_fixed= data_fake.offset_fixed;
3509 data->minlen_fixed= data_fake.minlen_fixed;
3510 data->lookbehind_fixed+= scan->flags;
3511 }
3512 if ( data_fake.minlen_float != minlenp )
3513 {
3514 data->minlen_float= data_fake.minlen_float;
3515 data->offset_float_min=data_fake.offset_float_min;
3516 data->offset_float_max=data_fake.offset_float_max;
3517 data->lookbehind_float+= scan->flags;
3518 }
3519 }
3520 }
3521
653099ff 3522
653099ff 3523 }
1de06328 3524#endif
a0ed51b3
LW
3525 }
3526 else if (OP(scan) == OPEN) {
40d049e4
YO
3527 if (stopparen != (I32)ARG(scan))
3528 pars++;
a0ed51b3 3529 }
cb434fcc 3530 else if (OP(scan) == CLOSE) {
40d049e4
YO
3531 if (stopparen == (I32)ARG(scan)) {
3532 break;
3533 }
eb160463 3534 if ((I32)ARG(scan) == is_par) {
cb434fcc 3535 next = regnext(scan);
c277df42 3536
cb434fcc
IZ
3537 if ( next && (OP(next) != WHILEM) && next < last)
3538 is_par = 0; /* Disable optimization */
3539 }
3540 if (data)
3541 *(data->last_closep) = ARG(scan);
a0ed51b3 3542 }
1a147d38 3543 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3544 /* set the pointer */
3545 I32 paren;
3546 regnode *start;
3547 regnode *end;
1a147d38 3548 if (OP(scan) == GOSUB) {
40d049e4
YO
3549 paren = ARG(scan);
3550 RExC_recurse[ARG2L(scan)] = scan;
3551 start = RExC_open_parens[paren-1];
3552 end = RExC_close_parens[paren-1];
3553 } else {
3554 paren = 0;
3555 start = RExC_rx->program + 1;
3556 end = RExC_opend;
3557 }
3558 assert(start);
3559 assert(end);
3560 if (!recursed) {
3561 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3562 SAVEFREEPV(recursed);
3563 }
3564 if (!PAREN_TEST(recursed,paren+1)) {
3565 I32 deltanext = 0;
3566 PAREN_SET(recursed,paren+1);
3567
3568 DEBUG_PEEP("goto",start,depth);
3569 min += study_chunk(
3570 pRExC_state,
3571 &start,
3572 minlenp,
3573 &deltanext,
3574 end+1,
3575 data,
3576 paren,
3577 recursed,
3578 and_withp,
3579 flags,depth+1);
3580 delta+=deltanext;
3581 if (deltanext == I32_MAX) {
3582 is_inf = is_inf_internal = 1;
3583 delta=deltanext;
3584 }
3585 DEBUG_PEEP("rtrn",end,depth);
3586 PAREN_UNSET(recursed,paren+1);
3587 } else {
3588 if (flags & SCF_DO_SUBSTR) {
3589 scan_commit(pRExC_state,data,minlenp);
3590 data->longest = &(data->longest_float);
3591 }
3592 is_inf = is_inf_internal = 1;
3593 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3594 cl_anything(pRExC_state, data->start_class);
3595 flags &= ~SCF_DO_STCLASS;
3596 }
3597 }
a0ed51b3 3598 else if (OP(scan) == EVAL) {
c277df42
IZ
3599 if (data)
3600 data->flags |= SF_HAS_EVAL;
3601 }
e2e6a0f1 3602 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3603 if (flags & SCF_DO_SUBSTR) {
3604 scan_commit(pRExC_state,data,minlenp);
3605 flags &= ~SCF_DO_SUBSTR;
3606 }
e2e6a0f1
YO
3607 if (data && OP(scan)==ACCEPT) {
3608 data->flags |= SCF_SEEN_ACCEPT;
3609 if (stopmin > min)
3610 stopmin = min;
3611 }
40d049e4
YO
3612 }
3613 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3614 {
0f5d15d6 3615 if (flags & SCF_DO_SUBSTR) {
1de06328 3616 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3617 data->longest = &(data->longest_float);
3618 }
3619 is_inf = is_inf_internal = 1;
653099ff 3620 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3621 cl_anything(pRExC_state, data->start_class);
96776eda 3622 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3623 }
786e8c11 3624#ifdef TRIE_STUDY_OPT
40d049e4 3625#ifdef FULL_TRIE_STUDY
786e8c11
YO
3626 else if (PL_regkind[OP(scan)] == TRIE) {
3627 /* NOTE - There is similar code to this block above for handling
40d049e4 3628 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3629 check there too. */
7f69552c 3630 regnode *trie_node= scan;
786e8c11
YO
3631 regnode *tail= regnext(scan);
3632 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3633 I32 max1 = 0, min1 = I32_MAX;
3634 struct regnode_charclass_class accum;
3635
3636 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3637 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3638 if (flags & SCF_DO_STCLASS)
3639 cl_init_zero(pRExC_state, &accum);
3640
3641 if (!trie->jump) {
3642 min1= trie->minlen;
3643 max1= trie->maxlen;
3644 } else {
3645 const regnode *nextbranch= NULL;
3646 U32 word;
3647
3648 for ( word=1 ; word <= trie->wordcount ; word++)
3649 {
3650 I32 deltanext=0, minnext=0, f = 0, fake;
3651 struct regnode_charclass_class this_class;
3652
3653 data_fake.flags = 0;
3654 if (data) {
3655 data_fake.whilem_c = data->whilem_c;
3656 data_fake.last_closep = data->last_closep;
3657 }
3658 else
3659 data_fake.last_closep = &fake;
3660
3661 if (flags & SCF_DO_STCLASS) {
3662 cl_init(pRExC_state, &this_class);
3663 data_fake.start_class = &this_class;
3664 f = SCF_DO_STCLASS_AND;
3665 }
3666 if (flags & SCF_WHILEM_VISITED_POS)
3667 f |= SCF_WHILEM_VISITED_POS;
3668
3669 if (trie->jump[word]) {
3670 if (!nextbranch)
7f69552c
YO
3671 nextbranch = trie_node + trie->jump[0];
3672 scan= trie_node + trie->jump[word];
786e8c11
YO
3673 /* We go from the jump point to the branch that follows
3674 it. Note this means we need the vestigal unused branches
3675 even though they arent otherwise used.
3676 */
40d049e4
YO
3677 minnext = study_chunk(pRExC_state, &scan, minlenp,
3678 &deltanext, (regnode *)nextbranch, &data_fake,
3679 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3680 }
3681 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3682 nextbranch= regnext((regnode*)nextbranch);
3683
3684 if (min1 > (I32)(minnext + trie->minlen))
3685 min1 = minnext + trie->minlen;
3686 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3687 max1 = minnext + deltanext + trie->maxlen;
3688 if (deltanext == I32_MAX)
3689 is_inf = is_inf_internal = 1;
3690
3691 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3692 pars++;
e2e6a0f1
YO
3693 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3694 if ( stopmin > min + min1)
3695 stopmin = min + min1;
3696 flags &= ~SCF_DO_SUBSTR;
3697 if (data)
3698 data->flags |= SCF_SEEN_ACCEPT;
3699 }
786e8c11
YO
3700 if (data) {
3701 if (data_fake.flags & SF_HAS_EVAL)
3702 data->flags |= SF_HAS_EVAL;
3703 data->whilem_c = data_fake.whilem_c;
3704 }
3705 if (flags & SCF_DO_STCLASS)
3706 cl_or(pRExC_state, &accum, &this_class);
3707 }
3708 }
3709 if (flags & SCF_DO_SUBSTR) {
3710 data->pos_min += min1;
3711 data->pos_delta += max1 - min1;
3712 if (max1 != min1 || is_inf)
3713 data->longest = &(data->longest_float);
3714 }
3715 min += min1;
3716 delta += max1 - min1;
3717 if (flags & SCF_DO_STCLASS_OR) {
3718 cl_or(pRExC_state, data->start_class, &accum);
3719 if (min1) {
40d049e4 3720 cl_and(data->start_class, and_withp);
786e8c11
YO
3721 flags &= ~SCF_DO_STCLASS;
3722 }
3723 }
3724 else if (flags & SCF_DO_STCLASS_AND) {
3725 if (min1) {
3726 cl_and(data->start_class, &accum);
3727 flags &= ~SCF_DO_STCLASS;
3728 }
3729 else {
3730 /* Switch to OR mode: cache the old value of
3731 * data->start_class */
40d049e4
YO
3732 INIT_AND_WITHP;
3733 StructCopy(data->start_class, and_withp,
786e8c11
YO
3734 struct regnode_charclass_class);
3735 flags &= ~SCF_DO_STCLASS_AND;
3736 StructCopy(&accum, data->start_class,
3737 struct regnode_charclass_class);
3738 flags |= SCF_DO_STCLASS_OR;
3739 data->start_class->flags |= ANYOF_EOS;
3740 }
3741 }
3742 scan= tail;
3743 continue;
3744 }
3745#else
3746 else if (PL_regkind[OP(scan)] == TRIE) {
3747 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3748 U8*bang=NULL;
3749
3750 min += trie->minlen;
3751 delta += (trie->maxlen - trie->minlen);
3752 flags &= ~SCF_DO_STCLASS; /* xxx */
3753 if (flags & SCF_DO_SUBSTR) {
1de06328 3754 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3755 data->pos_min += trie->minlen;
3756 data->pos_delta += (trie->maxlen - trie->minlen);
3757 if (trie->maxlen != trie->minlen)
3758 data->longest = &(data->longest_float);
3759 }
3760 if (trie->jump) /* no more substrings -- for now /grr*/
3761 flags &= ~SCF_DO_SUBSTR;
3762 }
3763#endif /* old or new */
3764#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3765 /* Else: zero-length, ignore. */
3766 scan = regnext(scan);
3767 }
3768
3769 finish:
3770 *scanp = scan;
aca2d497 3771 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3772 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3773 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3774 if (is_par > (I32)U8_MAX)
c277df42
IZ
3775 is_par = 0;
3776 if (is_par && pars==1 && data) {
3777 data->flags |= SF_IN_PAR;
3778 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3779 }
3780 else if (pars && data) {
c277df42
IZ
3781 data->flags |= SF_HAS_PAR;
3782 data->flags &= ~SF_IN_PAR;
3783 }
653099ff 3784 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3785 cl_and(data->start_class, and_withp);
786e8c11
YO
3786 if (flags & SCF_TRIE_RESTUDY)
3787 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3788
3789 DEBUG_STUDYDATA(data,depth);
3790
e2e6a0f1 3791 return min < stopmin ? min : stopmin;
c277df42
IZ
3792}
3793
76e3520e 3794STATIC I32
5f66b61c 3795S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3796{
830247a4 3797 if (RExC_rx->data) {
2eb97020 3798 const U32 count = RExC_rx->data->count;
b81d288d 3799 Renewc(RExC_rx->data,
2eb97020 3800 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3801 char, struct reg_data);
2eb97020 3802 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3803 RExC_rx->data->count += n;
a0ed51b3
LW
3804 }
3805 else {
a02a5408 3806 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3807 char, struct reg_data);
a02a5408 3808 Newx(RExC_rx->data->what, n, U8);
830247a4 3809 RExC_rx->data->count = n;
c277df42 3810 }
830247a4
IZ
3811 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3812 return RExC_rx->data->count - n;
c277df42
IZ
3813}
3814
76234dfb 3815#ifndef PERL_IN_XSUB_RE
d88dccdf 3816void
864dbfa3 3817Perl_reginitcolors(pTHX)
d88dccdf 3818{
97aff369 3819 dVAR;
1df70142 3820 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3821 if (s) {
1df70142
AL
3822 char *t = savepv(s);
3823 int i = 0;
3824 PL_colors[0] = t;
d88dccdf 3825 while (++i < 6) {
1df70142
AL
3826 t = strchr(t, '\t');
3827 if (t) {
3828 *t = '\0';
3829 PL_colors[i] = ++t;
d88dccdf
IZ
3830 }
3831 else
1df70142 3832 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3833 }
3834 } else {
1df70142 3835 int i = 0;
b81d288d 3836 while (i < 6)
06b5626a 3837 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3838 }
3839 PL_colorset = 1;
3840}
76234dfb 3841#endif
8615cb43 3842
07be1b83 3843
786e8c11
YO
3844#ifdef TRIE_STUDY_OPT
3845#define CHECK_RESTUDY_GOTO \
3846 if ( \
3847 (data.flags & SCF_TRIE_RESTUDY) \
3848 && ! restudied++ \
3849 ) goto reStudy
3850#else
3851#define CHECK_RESTUDY_GOTO
3852#endif
f9f4320a 3853
a687059c 3854/*
e50aee73 3855 - pregcomp - compile a regular expression into internal code
a687059c
LW
3856 *
3857 * We can't allocate space until we know how big the compiled form will be,
3858 * but we can't compile it (and thus know how big it is) until we've got a
3859 * place to put the code. So we cheat: we compile it twice, once with code
3860 * generation turned off and size counting turned on, and once "for real".
3861 * This also means that we don't allocate space until we are sure that the
3862 * thing really will compile successfully, and we never have to move the
3863 * code and thus invalidate pointers into it. (Note that it has to be in
3864 * one piece because free() must be able to free it all.) [NB: not true in perl]
3865 *
3866 * Beware that the optimization-preparation code in here knows about some
3867 * of the structure of the compiled regexp. [I'll say.]
3868 */
b9b4dddf
YO
3869
3870
3871
f9f4320a 3872#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3873#define RE_ENGINE_PTR &PL_core_reg_engine
3874#else
f9f4320a
YO
3875extern const struct regexp_engine my_reg_engine;
3876#define RE_ENGINE_PTR &my_reg_engine
3877#endif
b9b4dddf
YO
3878/* these make a few things look better, to avoid indentation */
3879#define BEGIN_BLOCK {
f9f4320a
YO
3880#define END_BLOCK }
3881
a687059c 3882regexp *
864dbfa3 3883Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3884{
97aff369 3885 dVAR;
f9f4320a
YO
3886 GET_RE_DEBUG_FLAGS_DECL;
3887 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3888#ifndef PERL_IN_XSUB_RE
3889 BEGIN_BLOCK
f9f4320a
YO
3890 /* Dispatch a request to compile a regexp to correct
3891 regexp engine. */
3892 HV * const table = GvHV(PL_hintgv);
3893 if (table) {
3894 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3895 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3896 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3897 DEBUG_COMPILE_r({
8d8756e7 3898 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3899 SvIV(*ptr));
3900 });
f2f78491 3901 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3902 }
b9b4dddf
YO
3903 }
3904 END_BLOCK
3905#endif
3906 BEGIN_BLOCK
a0d0e21e 3907 register regexp *r;
c277df42 3908 regnode *scan;
c277df42 3909 regnode *first;
a0d0e21e 3910 I32 flags;
a0d0e21e
LW
3911 I32 minlen = 0;
3912 I32 sawplus = 0;
3913 I32 sawopen = 0;
2c2d71f5 3914 scan_data_t data;
830247a4 3915 RExC_state_t RExC_state;
be8e71aa 3916 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3917#ifdef TRIE_STUDY_OPT
3918 int restudied= 0;
3919 RExC_state_t copyRExC_state;
3920#endif
a0d0e21e 3921 if (exp == NULL)
c277df42 3922 FAIL("NULL regexp argument");
a0d0e21e 3923
a5961de5 3924 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3925
5cfc7842 3926 RExC_precomp = exp;
a3621e74 3927 DEBUG_COMPILE_r({
ab3bbdeb
YO
3928 SV *dsv= sv_newmortal();
3929 RE_PV_QUOTED_DECL(s, RExC_utf8,
3930 dsv, RExC_precomp, (xend - exp), 60);
3931 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3932 PL_colors[4],PL_colors[5],s);
a5961de5 3933 });
e2509266 3934 RExC_flags = pm->op_pmflags;
830247a4 3935 RExC_sawback = 0;
bbce6d69 3936
830247a4
IZ
3937 RExC_seen = 0;
3938 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3939 RExC_seen_evals = 0;
3940 RExC_extralen = 0;
c277df42 3941
bbce6d69 3942 /* First pass: determine size, legality. */
830247a4 3943 RExC_parse = exp;
fac92740 3944 RExC_start = exp;
830247a4
IZ
3945 RExC_end = xend;
3946 RExC_naughty = 0;
3947 RExC_npar = 1;
c74340f9 3948 RExC_cpar = 1;
e2e6a0f1 3949 RExC_nestroot = 0;
830247a4
IZ
3950 RExC_size = 0L;
3951 RExC_emit = &PL_regdummy;
3952 RExC_whilem_seen = 0;
fc8cd66c 3953 RExC_charnames = NULL;
40d049e4
YO
3954 RExC_open_parens = NULL;
3955 RExC_close_parens = NULL;
3956 RExC_opend = NULL;
81714fb9 3957 RExC_paren_names = NULL;
40d049e4
YO
3958 RExC_recurse = NULL;
3959 RExC_recurse_count = 0;
81714fb9 3960
85ddcde9
JH
3961#if 0 /* REGC() is (currently) a NOP at the first pass.
3962 * Clever compilers notice this and complain. --jhi */
830247a4 3963 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3964#endif
3dab1dad
YO
3965 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3966 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3967 RExC_precomp = NULL;
a0d0e21e
LW
3968 return(NULL);
3969 }
07be1b83 3970 DEBUG_PARSE_r({
81714fb9
YO
3971 PerlIO_printf(Perl_debug_log,
3972 "Required size %"IVdf" nodes\n"
3973 "Starting second pass (creation)\n",
3974 (IV)RExC_size);
07be1b83
YO
3975 RExC_lastnum=0;
3976 RExC_lastparse=NULL;
3977 });
c277df42
IZ
3978 /* Small enough for pointer-storage convention?
3979 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3980 if (RExC_size >= 0x10000L && RExC_extralen)
3981 RExC_size += RExC_extralen;
c277df42 3982 else
830247a4
IZ
3983 RExC_extralen = 0;
3984 if (RExC_whilem_seen > 15)
3985 RExC_whilem_seen = 15;
a0d0e21e 3986
e2e6a0f1
YO
3987#ifdef DEBUGGING
3988 /* Make room for a sentinel value at the end of the program */
3989 RExC_size++;
3990#endif
3991
f9f4320a
YO
3992 /* Allocate space and zero-initialize. Note, the two step process
3993 of zeroing when in debug mode, thus anything assigned has to
3994 happen after that */
a02a5408 3995 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3996 char, regexp);
a0d0e21e 3997 if (r == NULL)
b45f050a 3998 FAIL("Regexp out of space");
0f79a09d
GS
3999#ifdef DEBUGGING
4000 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 4001 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 4002#endif
f9f4320a
YO
4003 /* initialization begins here */
4004 r->engine= RE_ENGINE_PTR;
c277df42 4005 r->refcnt = 1;
bbce6d69 4006 r->prelen = xend - exp;
5cfc7842 4007 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4008 r->subbeg = NULL;
f8c7b90f 4009#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4010 r->saved_copy = NULL;
ed252734 4011#endif
cf93c79d 4012 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4013 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4014 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4015
4016 r->substrs = 0; /* Useful during FAIL. */
4017 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4018 r->endp = 0;
c74340f9 4019 r->swap = NULL;
81714fb9
YO
4020 r->paren_names = 0;
4021
6bda09f9 4022 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4023 Newxz(RExC_open_parens, RExC_npar,regnode *);
4024 SAVEFREEPV(RExC_open_parens);
4025 Newxz(RExC_close_parens,RExC_npar,regnode *);
4026 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4027 }
4028
4029 /* Useful during FAIL. */
a02a5408 4030 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4031 if (r->offsets) {
2af232bd 4032 r->offsets[0] = RExC_size;
fac92740 4033 }
a3621e74 4034 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4035 "%s %"UVuf" bytes for offset annotations.\n",
4036 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4037 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4038
830247a4 4039 RExC_rx = r;
bbce6d69 4040
4041 /* Second pass: emit code. */
e2509266 4042 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4043 RExC_parse = exp;
4044 RExC_end = xend;
4045 RExC_naughty = 0;
4046 RExC_npar = 1;
c74340f9 4047 RExC_cpar = 1;
fac92740 4048 RExC_emit_start = r->program;
830247a4 4049 RExC_emit = r->program;
e2e6a0f1
YO
4050#ifdef DEBUGGING
4051 /* put a sentinal on the end of the program so we can check for
4052 overwrites */
4053 r->program[RExC_size].type = 255;
4054#endif
2cd61cdb 4055 /* Store the count of eval-groups for security checks: */
786e8c11 4056 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4057 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4058 r->data = 0;
3dab1dad 4059 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4060 return(NULL);
6bda09f9 4061
07be1b83
YO
4062 /* XXXX To minimize changes to RE engine we always allocate
4063 3-units-long substrs field. */
4064 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4065 if (RExC_recurse_count) {
4066 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4067 SAVEFREEPV(RExC_recurse);
4068 }
a0d0e21e 4069
07be1b83 4070reStudy:
1de06328 4071 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4072 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4073
07be1b83
YO
4074#ifdef TRIE_STUDY_OPT
4075 if ( restudied ) {
5d458dd8 4076 U32 seen=RExC_seen;
07be1b83 4077 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4078
4079 RExC_state = copyRExC_state;
4080 if (seen & REG_TOP_LEVEL_BRANCHES)
4081 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4082 else
4083 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4084 if (data.last_found) {
07be1b83 4085 SvREFCNT_dec(data.longest_fixed);
07be1b83 4086 SvREFCNT_dec(data.longest_float);
07be1b83 4087 SvREFCNT_dec(data.last_found);
1de06328 4088 }
40d049e4 4089 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4090 } else {
40d049e4 4091 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4092 copyRExC_state = RExC_state;
07be1b83 4093 }
40d049e4
YO
4094#else
4095 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4096#endif
fc8cd66c 4097
a0d0e21e 4098 /* Dig out information for optimizations. */
cf93c79d 4099 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4100 pm->op_pmflags = RExC_flags;
a0ed51b3 4101 if (UTF)
5ff6fc6d 4102 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4103 r->regstclass = NULL;
830247a4 4104 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4105 r->reganch |= ROPT_NAUGHTY;
c277df42 4106 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4107
1de06328
YO
4108 /* testing for BRANCH here tells us whether there is "must appear"
4109 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4110 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4111 I32 fake;
c5254dd6 4112 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4113 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4114 int stclass_flag;
07be1b83 4115 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4116
4117 first = scan;
c277df42 4118 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4119 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4120 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4121 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4122 /* for now we can't handle lookbehind IFMATCH*/
4123 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4124 (OP(first) == PLUS) ||
4125 (OP(first) == MINMOD) ||
653099ff 4126 /* An {n,m} with n>0 */
07be1b83
YO
4127 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4128 {
786e8c11 4129
a0d0e21e
LW
4130 if (OP(first) == PLUS)
4131 sawplus = 1;
4132 else
3dab1dad 4133 first += regarglen[OP(first)];
07be1b83
YO
4134 if (OP(first) == IFMATCH) {
4135 first = NEXTOPER(first);
4136 first += EXTRA_STEP_2ARGS;
7c167cea 4137 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4138 first = NEXTOPER(first);
a687059c
LW
4139 }
4140
a0d0e21e
LW
4141 /* Starting-point info. */
4142 again:
786e8c11 4143 DEBUG_PEEP("first:",first,0);
07be1b83 4144 /* Ignore EXACT as we deal with it later. */
3dab1dad 4145 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4146 if (OP(first) == EXACT)
6f207bd3 4147 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4148 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4149 r->regstclass = first;
4150 }
07be1b83 4151#ifdef TRIE_STCLASS
786e8c11 4152 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4153 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4154 {
786e8c11 4155 regnode *trie_op;
07be1b83 4156 /* this can happen only on restudy */
786e8c11
YO
4157 if ( OP(first) == TRIE ) {
4158 struct regnode_1 *trieop;
4159 Newxz(trieop,1,struct regnode_1);
4160 StructCopy(first,trieop,struct regnode_1);
4161 trie_op=(regnode *)trieop;
4162 } else {
4163 struct regnode_charclass *trieop;
4164 Newxz(trieop,1,struct regnode_charclass);
4165 StructCopy(first,trieop,struct regnode_charclass);
4166 trie_op=(regnode *)trieop;
4167 }
1de06328 4168 OP(trie_op)+=2;
786e8c11
YO
4169 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4170 r->regstclass = trie_op;
07be1b83
YO
4171 }
4172#endif
bfed75c6 4173 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4174 r->regstclass = first;
3dab1dad
YO
4175 else if (PL_regkind[OP(first)] == BOUND ||
4176 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4177 r->regstclass = first;
3dab1dad 4178 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4179 r->reganch |= (OP(first) == MBOL
4180 ? ROPT_ANCH_MBOL
4181 : (OP(first) == SBOL
4182 ? ROPT_ANCH_SBOL
4183 : ROPT_ANCH_BOL));
a0d0e21e 4184 first = NEXTOPER(first);
774d564b 4185 goto again;
4186 }
4187 else if (OP(first) == GPOS) {
4188 r->reganch |= ROPT_ANCH_GPOS;
4189 first = NEXTOPER(first);
4190 goto again;
a0d0e21e 4191 }
e09294f4 4192 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4193 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4194 !(r->reganch & ROPT_ANCH) )
4195 {
4196 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4197 const int type =
4198 (OP(NEXTOPER(first)) == REG_ANY)
4199 ? ROPT_ANCH_MBOL
4200 : ROPT_ANCH_SBOL;
cad2e5aa 4201 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4202 first = NEXTOPER(first);
774d564b 4203 goto again;
a0d0e21e 4204 }
b81d288d 4205 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4206 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4207 /* x+ must match at the 1st pos of run of x's */
4208 r->reganch |= ROPT_SKIP;
a0d0e21e 4209
c277df42 4210 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4211#ifdef TRIE_STUDY_OPT
81714fb9 4212 DEBUG_PARSE_r(
be8e71aa
YO
4213 if (!restudied)
4214 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4215 (IV)(first - scan + 1))
4216 );
4217#else
81714fb9 4218 DEBUG_PARSE_r(
be8e71aa
YO
4219 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4220 (IV)(first - scan + 1))
4221 );
4222#endif
4223
4224
a0d0e21e
LW
4225 /*
4226 * If there's something expensive in the r.e., find the
4227 * longest literal string that must appear and make it the
4228 * regmust. Resolve ties in favor of later strings, since
4229 * the regstart check works with the beginning of the r.e.
4230 * and avoiding duplication strengthens checking. Not a
4231 * strong reason, but sufficient in the absence of others.
4232 * [Now we resolve ties in favor of the earlier string if
c277df42 4233 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4234 * earlier string may buy us something the later one won't.]
4235 */
de8c5301 4236
396482e1
GA
4237 data.longest_fixed = newSVpvs("");
4238 data.longest_float = newSVpvs("");
4239 data.last_found = newSVpvs("");
c277df42
IZ
4240 data.longest = &(data.longest_fixed);
4241 first = scan;
653099ff 4242 if (!r->regstclass) {
830247a4 4243 cl_init(pRExC_state, &ch_class);
653099ff
GS
4244 data.start_class = &ch_class;
4245 stclass_flag = SCF_DO_STCLASS_AND;
4246 } else /* XXXX Check for BOUND? */
4247 stclass_flag = 0;
cb434fcc 4248 data.last_closep = &last_close;
de8c5301 4249
1de06328 4250 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4251 &data, -1, NULL, NULL,
4252 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4253
07be1b83 4254
786e8c11
YO
4255 CHECK_RESTUDY_GOTO;
4256
4257
830247a4 4258 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4259 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4260 && !RExC_seen_zerolen
4261 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4262 r->reganch |= ROPT_CHECK_ALL;
1de06328 4263 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4264 SvREFCNT_dec(data.last_found);
4265
1de06328
YO
4266 /* Note that code very similar to this but for anchored string
4267 follows immediately below, changes may need to be made to both.
4268 Be careful.
4269 */
a0ed51b3 4270 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4271 if (longest_float_length
c277df42
IZ
4272 || (data.flags & SF_FL_BEFORE_EOL
4273 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4274 || (RExC_flags & PMf_MULTILINE))))
4275 {
1182767e 4276 I32 t,ml;
cf93c79d 4277
1de06328 4278 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4279 && data.offset_fixed == data.offset_float_min
4280 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4281 goto remove_float; /* As in (a)+. */
4282
1de06328
YO
4283 /* copy the information about the longest float from the reg_scan_data
4284 over to the program. */
33b8afdf
JH
4285 if (SvUTF8(data.longest_float)) {
4286 r->float_utf8 = data.longest_float;
c445ea15 4287 r->float_substr = NULL;
33b8afdf
JH
4288 } else {
4289 r->float_substr = data.longest_float;
c445ea15 4290 r->float_utf8 = NULL;
33b8afdf 4291 }
1de06328
YO
4292 /* float_end_shift is how many chars that must be matched that
4293 follow this item. We calculate it ahead of time as once the
4294 lookbehind offset is added in we lose the ability to correctly
4295 calculate it.*/
4296 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4297 : (I32)longest_float_length;
1de06328
YO
4298 r->float_end_shift = ml - data.offset_float_min
4299 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4300 + data.lookbehind_float;
4301 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4302 r->float_max_offset = data.offset_float_max;
1182767e 4303 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4304 r->float_max_offset -= data.lookbehind_float;
4305
cf93c79d
IZ
4306 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4307 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4308 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4309 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4310 }
4311 else {
aca2d497 4312 remove_float:
c445ea15 4313 r->float_substr = r->float_utf8 = NULL;
c277df42 4314 SvREFCNT_dec(data.longest_float);
c5254dd6 4315 longest_float_length = 0;
a0d0e21e 4316 }
c277df42 4317
1de06328
YO
4318 /* Note that code very similar to this but for floating string
4319 is immediately above, changes may need to be made to both.
4320 Be careful.
4321 */
a0ed51b3 4322 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4323 if (longest_fixed_length
c277df42
IZ
4324 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4325 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4326 || (RExC_flags & PMf_MULTILINE))))
4327 {
1182767e 4328 I32 t,ml;
cf93c79d 4329
1de06328
YO
4330 /* copy the information about the longest fixed
4331 from the reg_scan_data over to the program. */
33b8afdf
JH
4332 if (SvUTF8(data.longest_fixed)) {
4333 r->anchored_utf8 = data.longest_fixed;
c445ea15 4334 r->anchored_substr = NULL;
33b8afdf
JH
4335 } else {
4336 r->anchored_substr = data.longest_fixed;
c445ea15 4337 r->anchored_utf8 = NULL;
33b8afdf 4338 }
1de06328
YO
4339 /* fixed_end_shift is how many chars that must be matched that
4340 follow this item. We calculate it ahead of time as once the
4341 lookbehind offset is added in we lose the ability to correctly
4342 calculate it.*/
4343 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4344 : (I32)longest_fixed_length;
1de06328
YO
4345 r->anchored_end_shift = ml - data.offset_fixed
4346 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4347 + data.lookbehind_fixed;
4348 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4349
cf93c79d
IZ
4350 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4351 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4352 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4353 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4354 }
4355 else {
c445ea15 4356 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4357 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4358 longest_fixed_length = 0;
a0d0e21e 4359 }
b81d288d 4360 if (r->regstclass
ffc61ed2 4361 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4362 r->regstclass = NULL;
33b8afdf
JH
4363 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4364 && stclass_flag
653099ff 4365 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4366 && !cl_is_anything(data.start_class))
4367 {
1df70142 4368 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4369
a02a5408 4370 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4371 struct regnode_charclass_class);
4372 StructCopy(data.start_class,
830247a4 4373 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4374 struct regnode_charclass_class);
830247a4 4375 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4376 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4377 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4378 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4379 PerlIO_printf(Perl_debug_log,
a0288114 4380 "synthetic stclass \"%s\".\n",
3f7c398e 4381 SvPVX_const(sv));});
653099ff 4382 }
c277df42
IZ
4383
4384 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4385 if (longest_fixed_length > longest_float_length) {
1de06328 4386 r->check_end_shift = r->anchored_end_shift;
c277df42 4387 r->check_substr = r->anchored_substr;
33b8afdf 4388 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4389 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4390 if (r->reganch & ROPT_ANCH_SINGLE)
4391 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4392 }
4393 else {
1de06328 4394 r->check_end_shift = r->float_end_shift;
c277df42 4395 r->check_substr = r->float_substr;
33b8afdf 4396 r->check_utf8 = r->float_utf8;
1de06328
YO
4397 r->check_offset_min = r->float_min_offset;
4398 r->check_offset_max = r->float_max_offset;
a0d0e21e 4399 }
30382c73
IZ
4400 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4401 This should be changed ASAP! */
33b8afdf 4402 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4403 r->reganch |= RE_USE_INTUIT;
33b8afdf 4404 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4405 r->reganch |= RE_INTUIT_TAIL;
4406 }
1de06328
YO
4407 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4408 if ( (STRLEN)minlen < longest_float_length )
4409 minlen= longest_float_length;
4410 if ( (STRLEN)minlen < longest_fixed_length )
4411 minlen= longest_fixed_length;
4412 */
a0ed51b3
LW
4413 }
4414 else {
c277df42
IZ
4415 /* Several toplevels. Best we can is to set minlen. */
4416 I32 fake;
653099ff 4417 struct regnode_charclass_class ch_class;
cb434fcc 4418 I32 last_close = 0;
c277df42 4419
5d458dd8 4420 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4421
c277df42 4422 scan = r->program + 1;
830247a4 4423 cl_init(pRExC_state, &ch_class);
653099ff 4424 data.start_class = &ch_class;
cb434fcc 4425 data.last_closep = &last_close;
07be1b83 4426
de8c5301 4427
1de06328 4428 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4429 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4430
786e8c11 4431 CHECK_RESTUDY_GOTO;
07be1b83 4432
33b8afdf 4433 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4434 = r->float_substr = r->float_utf8 = NULL;
653099ff 4435 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4436 && !cl_is_anything(data.start_class))
4437 {
1df70142 4438 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4439
a02a5408 4440 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4441 struct regnode_charclass_class);
4442 StructCopy(data.start_class,
830247a4 4443 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4444 struct regnode_charclass_class);
830247a4 4445 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4446 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4447 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4448 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4449 PerlIO_printf(Perl_debug_log,
a0288114 4450 "synthetic stclass \"%s\".\n",
3f7c398e 4451 SvPVX_const(sv));});
653099ff 4452 }
a0d0e21e
LW
4453 }
4454
1de06328
YO
4455 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4456 the "real" pattern. */
cf9788e3
RGS
4457 DEBUG_OPTIMISE_r({
4458 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4459 minlen, r->minlen);
4460 });
de8c5301 4461 r->minlenret = minlen;
1de06328
YO
4462 if (r->minlen < minlen)
4463 r->minlen = minlen;
4464
b81d288d 4465 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4466 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4467 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4468 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4469 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4470 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4471 if (RExC_seen & REG_SEEN_CANY)
4472 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4473 if (RExC_seen & REG_SEEN_VERBARG)
4474 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4475 if (RExC_seen & REG_SEEN_CUTGROUP)
4476 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4477 if (RExC_paren_names)
4478 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4479 else
4480 r->paren_names = NULL;
4481
40d049e4
YO
4482 if (RExC_recurse_count) {
4483 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4484 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4485 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4486 }
4487 }
a02a5408
JC
4488 Newxz(r->startp, RExC_npar, I32);
4489 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4490 /* assume we don't need to swap parens around before we match */
4491
be8e71aa
YO
4492 DEBUG_DUMP_r({
4493 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4494 regdump(r);
4495 });
8e9a8a48
YO
4496 DEBUG_OFFSETS_r(if (r->offsets) {
4497 const U32 len = r->offsets[0];
4498 U32 i;
4499 GET_RE_DEBUG_FLAGS_DECL;
4500 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4501 for (i = 1; i <= len; i++) {
4502 if (r->offsets[i*2-1] || r->offsets[i*2])
4503 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4504 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4505 }
4506 PerlIO_printf(Perl_debug_log, "\n");
4507 });
a0d0e21e 4508 return(r);
f9f4320a 4509 END_BLOCK
a687059c
LW
4510}
4511
f9f4320a
YO
4512#undef CORE_ONLY_BLOCK
4513#undef END_BLOCK
4514#undef RE_ENGINE_PTR
3dab1dad 4515
9af228c6 4516#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4517SV*
4518Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4519{
4520 I32 parno = 0; /* no match */
4521 if (PL_curpm) {
4522 const REGEXP * const rx = PM_GETRE(PL_curpm);
4523 if (rx && rx->paren_names) {
4524 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4525 if (he_str) {
4526 IV i;
4527 SV* sv_dat=HeVAL(he_str);
4528 I32 *nums=(I32*)SvPVX(sv_dat);
4529 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4530 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4531 rx->endp[nums[i]] != -1)
4532 {
4533 parno = nums[i];
4534 break;
4535 }
4536 }
4537 }
4538 }
4539 }
4540 if ( !parno ) {
4541 return 0;
4542 } else {
4543 GV *gv_paren;
4544 SV *sv= sv_newmortal();
4545 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4546 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4547 return GvSVn(gv_paren);
4548 }
4549}
9af228c6 4550#endif
0a4db386 4551
894be9b7 4552/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4553 * If flags is REG_RSN_RETURN_NULL returns null.
4554 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4555 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4556 * to the parsed name as looked up in the RExC_paren_names hash.
4557 * If there is an error throws a vFAIL().. type exception.
894be9b7 4558 */
0a4db386
YO
4559
4560#define REG_RSN_RETURN_NULL 0
4561#define REG_RSN_RETURN_NAME 1
4562#define REG_RSN_RETURN_DATA 2
4563
894be9b7
YO
4564STATIC SV*
4565S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4566 char *name_start = RExC_parse;
0a4db386 4567 if ( UTF ) {
894be9b7 4568 STRLEN numlen;
0a4db386
YO
4569 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4570 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4571 {
4572 RExC_parse += numlen;
4573 }
4574 } else {
4575 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4576 RExC_parse++;
4577 }
0a4db386
YO
4578 if ( flags ) {
4579 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4580 (int)(RExC_parse - name_start)));
894be9b7 4581 if (UTF)
0a4db386
YO
4582 SvUTF8_on(sv_name);
4583 if ( flags == REG_RSN_RETURN_NAME)
4584 return sv_name;
4585 else if (flags==REG_RSN_RETURN_DATA) {
4586 HE *he_str = NULL;
4587 SV *sv_dat = NULL;
4588 if ( ! sv_name ) /* should not happen*/
4589 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4590 if (RExC_paren_names)
4591 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4592 if ( he_str )
4593 sv_dat = HeVAL(he_str);
4594 if ( ! sv_dat )
4595 vFAIL("Reference to nonexistent named group");
4596 return sv_dat;
4597 }
4598 else {
4599 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4600 }
4601 /* NOT REACHED */
894be9b7 4602 }
0a4db386 4603 return NULL;
894be9b7
YO
4604}
4605
3dab1dad
YO
4606#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4607 int rem=(int)(RExC_end - RExC_parse); \
4608 int cut; \
4609 int num; \
4610 int iscut=0; \
4611 if (rem>10) { \
4612 rem=10; \
4613 iscut=1; \
4614 } \
4615 cut=10-rem; \
4616 if (RExC_lastparse!=RExC_parse) \
4617 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4618 rem, RExC_parse, \
4619 cut + 4, \
4620 iscut ? "..." : "<" \
4621 ); \
4622 else \
4623 PerlIO_printf(Perl_debug_log,"%16s",""); \
4624 \
4625 if (SIZE_ONLY) \
4626 num=RExC_size; \
4627 else \
4628 num=REG_NODE_NUM(RExC_emit); \
4629 if (RExC_lastnum!=num) \
0a4db386 4630 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4631 else \
0a4db386 4632 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4633 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4634 (int)((depth*2)), "", \
3dab1dad
YO
4635 (funcname) \
4636 ); \
4637 RExC_lastnum=num; \
4638 RExC_lastparse=RExC_parse; \
4639})
4640
07be1b83
YO
4641
4642
3dab1dad
YO
4643#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4644 DEBUG_PARSE_MSG((funcname)); \
4645 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4646})
6bda09f9
YO
4647#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4648 DEBUG_PARSE_MSG((funcname)); \
4649 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4650})
a687059c
LW
4651/*
4652 - reg - regular expression, i.e. main body or parenthesized thing
4653 *
4654 * Caller must absorb opening parenthesis.
4655 *
4656 * Combining parenthesis handling with the base level of regular expression
4657 * is a trifle forced, but the need to tie the tails of the branches to what
4658 * follows makes it hard to avoid.
4659 */
07be1b83
YO
4660#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4661#ifdef DEBUGGING
4662#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4663#else
4664#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4665#endif
3dab1dad 4666
e2e6a0f1
YO
4667/* this idea is borrowed from STR_WITH_LEN in handy.h */
4668#define CHECK_WORD(s,v,l) \
4669 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4670
76e3520e 4671STATIC regnode *
3dab1dad 4672S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4673 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4674{
27da23d5 4675 dVAR;
c277df42
IZ
4676 register regnode *ret; /* Will be the head of the group. */
4677 register regnode *br;
4678 register regnode *lastbr;
cbbf8932 4679 register regnode *ender = NULL;
a0d0e21e 4680 register I32 parno = 0;
cbbf8932
AL
4681 I32 flags;
4682 const I32 oregflags = RExC_flags;
6136c704
AL
4683 bool have_branch = 0;
4684 bool is_open = 0;
9d1d55b5
JP
4685
4686 /* for (?g), (?gc), and (?o) warnings; warning
4687 about (?c) will warn about (?g) -- japhy */
4688
6136c704
AL
4689#define WASTED_O 0x01
4690#define WASTED_G 0x02
4691#define WASTED_C 0x04
4692#define WASTED_GC (0x02|0x04)
cbbf8932 4693 I32 wastedflags = 0x00;
9d1d55b5 4694
fac92740 4695 char * parse_start = RExC_parse; /* MJD */
a28509cc 4696 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4697
3dab1dad
YO
4698 GET_RE_DEBUG_FLAGS_DECL;
4699 DEBUG_PARSE("reg ");
4700
4701
821b33a5 4702 *flagp = 0; /* Tentatively. */
a0d0e21e 4703
9d1d55b5 4704
a0d0e21e
LW
4705 /* Make an OPEN node, if parenthesized. */
4706 if (paren) {
e2e6a0f1
YO
4707 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4708 char *start_verb = RExC_parse;
4709 STRLEN verb_len = 0;
4710 char *start_arg = NULL;
4711 unsigned char op = 0;
4712 int argok = 1;
4713 int internal_argval = 0; /* internal_argval is only useful if !argok */
4714 while ( *RExC_parse && *RExC_parse != ')' ) {
4715 if ( *RExC_parse == ':' ) {
4716 start_arg = RExC_parse + 1;
4717 break;
4718 }
4719 RExC_parse++;
4720 }
4721 ++start_verb;
4722 verb_len = RExC_parse - start_verb;
4723 if ( start_arg ) {
4724 RExC_parse++;
4725 while ( *RExC_parse && *RExC_parse != ')' )
4726 RExC_parse++;
4727 if ( *RExC_parse != ')' )
4728 vFAIL("Unterminated verb pattern argument");
4729 if ( RExC_parse == start_arg )
4730 start_arg = NULL;
4731 } else {
4732 if ( *RExC_parse != ')' )
4733 vFAIL("Unterminated verb pattern");
4734 }
5d458dd8 4735
e2e6a0f1
YO
4736 switch ( *start_verb ) {
4737 case 'A': /* (*ACCEPT) */
4738 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4739 op = ACCEPT;
4740 internal_argval = RExC_nestroot;
4741 }
4742 break;
4743 case 'C': /* (*COMMIT) */
4744 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4745 op = COMMIT;
e2e6a0f1
YO
4746 break;
4747 case 'F': /* (*FAIL) */
4748 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4749 op = OPFAIL;
4750 argok = 0;
4751 }
4752 break;
5d458dd8
YO
4753 case ':': /* (*:NAME) */
4754 case 'M': /* (*MARK:NAME) */
4755 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4756 op = MARKPOINT;
5d458dd8
YO
4757 argok = -1;
4758 }
4759 break;
4760 case 'P': /* (*PRUNE) */
4761 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4762 op = PRUNE;
e2e6a0f1 4763 break;
5d458dd8
YO
4764 case 'S': /* (*SKIP) */
4765 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4766 op = SKIP;
4767 break;
4768 case 'T': /* (*THEN) */
4769 /* [19:06] <TimToady> :: is then */
4770 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4771 op = CUTGROUP;
4772 RExC_seen |= REG_SEEN_CUTGROUP;
4773 }
e2e6a0f1
YO
4774 break;
4775 }
4776 if ( ! op ) {
4777 RExC_parse++;
4778 vFAIL3("Unknown verb pattern '%.*s'",
4779 verb_len, start_verb);
4780 }
4781 if ( argok ) {
4782 if ( start_arg && internal_argval ) {
4783 vFAIL3("Verb pattern '%.*s' may not have an argument",
4784 verb_len, start_verb);
4785 } else if ( argok < 0 && !start_arg ) {
4786 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4787 verb_len, start_verb);
4788 } else {
4789 ret = reganode(pRExC_state, op, internal_argval);
4790 if ( ! internal_argval && ! SIZE_ONLY ) {
4791 if (start_arg) {
4792 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4793 ARG(ret) = add_data( pRExC_state, 1, "S" );
4794 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4795 ret->flags = 0;
4796 } else {
4797 ret->flags = 1;
4798 }
4799 }
4800 }
4801 if (!internal_argval)
4802 RExC_seen |= REG_SEEN_VERBARG;
4803 } else if ( start_arg ) {
4804 vFAIL3("Verb pattern '%.*s' may not have an argument",
4805 verb_len, start_verb);
4806 } else {
4807 ret = reg_node(pRExC_state, op);
4808 }
4809 nextchar(pRExC_state);
4810 return ret;
4811 } else
fac92740 4812 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4813 U32 posflags = 0, negflags = 0;
4814 U32 *flagsp = &posflags;
6136c704 4815 bool is_logical = 0;
a28509cc 4816 const char * const seqstart = RExC_parse;
ca9dfc88 4817
830247a4
IZ
4818 RExC_parse++;
4819 paren = *RExC_parse++;
c277df42 4820 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4821 switch (paren) {
894be9b7 4822
fac92740 4823 case '<': /* (?<...) */
b81d288d 4824 if (*RExC_parse == '!')
c277df42 4825 paren = ',';
0a4db386
YO
4826 else if (*RExC_parse != '=')
4827 { /* (?<...>) */
81714fb9 4828 char *name_start;
894be9b7 4829 SV *svname;
81714fb9
YO
4830 paren= '>';
4831 case '\'': /* (?'...') */
4832 name_start= RExC_parse;
0a4db386
YO
4833 svname = reg_scan_name(pRExC_state,
4834 SIZE_ONLY ? /* reverse test from the others */
4835 REG_RSN_RETURN_NAME :
4836 REG_RSN_RETURN_NULL);
81714fb9
YO
4837 if (RExC_parse == name_start)
4838 goto unknown;
4839 if (*RExC_parse != paren)
4840 vFAIL2("Sequence (?%c... not terminated",
4841 paren=='>' ? '<' : paren);
4842 if (SIZE_ONLY) {
e62cc96a
YO
4843 HE *he_str;
4844 SV *sv_dat = NULL;
894be9b7
YO
4845 if (!svname) /* shouldnt happen */
4846 Perl_croak(aTHX_
4847 "panic: reg_scan_name returned NULL");
81714fb9
YO
4848 if (!RExC_paren_names) {
4849 RExC_paren_names= newHV();
4850 sv_2mortal((SV*)RExC_paren_names);
4851 }
4852 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4853 if ( he_str )
81714fb9 4854 sv_dat = HeVAL(he_str);
e62cc96a 4855 if ( ! sv_dat ) {
81714fb9 4856 /* croak baby croak */
e62cc96a
YO
4857 Perl_croak(aTHX_
4858 "panic: paren_name hash element allocation failed");
4859 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4860 IV count=SvIV(sv_dat);
4861 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4862 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4863 pv[count]=RExC_npar;
4864 SvIVX(sv_dat)++;
4865 } else {
4866 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4867 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4868 SvIOK_on(sv_dat);
4869 SvIVX(sv_dat)= 1;
e62cc96a
YO
4870 }
4871
81714fb9
YO
4872 /*sv_dump(sv_dat);*/
4873 }
4874 nextchar(pRExC_state);
4875 paren = 1;
4876 goto capturing_parens;
4877 }
4878 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4879 RExC_parse++;
fac92740
MJD
4880 case '=': /* (?=...) */
4881 case '!': /* (?!...) */
830247a4 4882 RExC_seen_zerolen++;
e2e6a0f1
YO
4883 if (*RExC_parse == ')') {
4884 ret=reg_node(pRExC_state, OPFAIL);
4885 nextchar(pRExC_state);
4886 return ret;
4887 }
fac92740
MJD
4888 case ':': /* (?:...) */
4889 case '>': /* (?>...) */
a0d0e21e 4890 break;
fac92740
MJD
4891 case '$': /* (?$...) */
4892 case '@': /* (?@...) */
8615cb43 4893 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4894 break;
fac92740 4895 case '#': /* (?#...) */
830247a4
IZ
4896 while (*RExC_parse && *RExC_parse != ')')
4897 RExC_parse++;
4898 if (*RExC_parse != ')')
c277df42 4899 FAIL("Sequence (?#... not terminated");
830247a4 4900 nextchar(pRExC_state);
a0d0e21e
LW
4901 *flagp = TRYAGAIN;
4902 return NULL;
894be9b7
YO
4903 case '0' : /* (?0) */
4904 case 'R' : /* (?R) */
4905 if (*RExC_parse != ')')
6bda09f9 4906 FAIL("Sequence (?R) not terminated");
1a147d38 4907 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4908 nextchar(pRExC_state);
4909 return ret;
4910 /*notreached*/
894be9b7
YO
4911 { /* named and numeric backreferences */
4912 I32 num;
4913 char * parse_start;
4914 case '&': /* (?&NAME) */
4915 parse_start = RExC_parse - 1;
4916 {
0a4db386
YO
4917 SV *sv_dat = reg_scan_name(pRExC_state,
4918 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4919 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4920 }
4921 goto gen_recurse_regop;
4922 /* NOT REACHED */
542fa716
YO
4923 case '+':
4924 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4925 RExC_parse++;
4926 vFAIL("Illegal pattern");
4927 }
4928 goto parse_recursion;
4929 /* NOT REACHED*/
4930 case '-': /* (?-1) */
4931 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4932 RExC_parse--; /* rewind to let it be handled later */
4933 goto parse_flags;
4934 }
4935 /*FALLTHROUGH */
6bda09f9
YO
4936 case '1': case '2': case '3': case '4': /* (?1) */
4937 case '5': case '6': case '7': case '8': case '9':
4938 RExC_parse--;
542fa716 4939 parse_recursion:
894be9b7
YO
4940 num = atoi(RExC_parse);
4941 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
4942 if (*RExC_parse == '-')
4943 RExC_parse++;
6bda09f9
YO
4944 while (isDIGIT(*RExC_parse))
4945 RExC_parse++;
4946 if (*RExC_parse!=')')
4947 vFAIL("Expecting close bracket");
894be9b7
YO
4948
4949 gen_recurse_regop:
542fa716
YO
4950 if ( paren == '-' ) {
4951 /*
4952 Diagram of capture buffer numbering.
4953 Top line is the normal capture buffer numbers
4954 Botton line is the negative indexing as from
4955 the X (the (?-2))
4956
4957 + 1 2 3 4 5 X 6 7
4958 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4959 - 5 4 3 2 1 X x x
4960
4961 */
4962 num = RExC_npar + num;
4963 if (num < 1) {
4964 RExC_parse++;
4965 vFAIL("Reference to nonexistent group");
4966 }
4967 } else if ( paren == '+' ) {
4968 num = RExC_npar + num - 1;
4969 }
4970
1a147d38 4971 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4972 if (!SIZE_ONLY) {
4973 if (num > (I32)RExC_rx->nparens) {
4974 RExC_parse++;
4975 vFAIL("Reference to nonexistent group");
4976 }
40d049e4 4977 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4978 RExC_emit++;
226de585 4979 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4980 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4981 } else {
6bda09f9 4982 RExC_size++;
6bda09f9 4983 }
0a4db386 4984 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4985 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4986 Set_Node_Offset(ret, parse_start); /* MJD */
4987
6bda09f9
YO
4988 nextchar(pRExC_state);
4989 return ret;
894be9b7
YO
4990 } /* named and numeric backreferences */
4991 /* NOT REACHED */
4992
fac92740 4993 case 'p': /* (?p...) */
9014280d 4994 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4995 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4996 /* FALL THROUGH*/
fac92740 4997 case '?': /* (??...) */
6136c704 4998 is_logical = 1;
438a3801
YST
4999 if (*RExC_parse != '{')
5000 goto unknown;
830247a4 5001 paren = *RExC_parse++;
0f5d15d6 5002 /* FALL THROUGH */
fac92740 5003 case '{': /* (?{...}) */
c277df42 5004 {
c277df42
IZ
5005 I32 count = 1, n = 0;
5006 char c;
830247a4 5007 char *s = RExC_parse;
c277df42 5008
830247a4
IZ
5009 RExC_seen_zerolen++;
5010 RExC_seen |= REG_SEEN_EVAL;
5011 while (count && (c = *RExC_parse)) {
6136c704
AL
5012 if (c == '\\') {
5013 if (RExC_parse[1])
5014 RExC_parse++;
5015 }
b81d288d 5016 else if (c == '{')
c277df42 5017 count++;
b81d288d 5018 else if (c == '}')
c277df42 5019 count--;
830247a4 5020 RExC_parse++;
c277df42 5021 }
6136c704 5022 if (*RExC_parse != ')') {
b81d288d 5023 RExC_parse = s;
b45f050a
JF
5024 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5025 }
c277df42 5026 if (!SIZE_ONLY) {
f3548bdc 5027 PAD *pad;
6136c704
AL
5028 OP_4tree *sop, *rop;
5029 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5030
569233ed
SB
5031 ENTER;
5032 Perl_save_re_context(aTHX);
f3548bdc 5033 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5034 sop->op_private |= OPpREFCOUNTED;
5035 /* re_dup will OpREFCNT_inc */
5036 OpREFCNT_set(sop, 1);
569233ed 5037 LEAVE;
c277df42 5038
830247a4
IZ
5039 n = add_data(pRExC_state, 3, "nop");
5040 RExC_rx->data->data[n] = (void*)rop;
5041 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5042 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5043 SvREFCNT_dec(sv);
a0ed51b3 5044 }
e24b16f9 5045 else { /* First pass */
830247a4 5046 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5047 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5048 /* No compiled RE interpolated, has runtime
5049 components ===> unsafe. */
5050 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5051 if (PL_tainting && PL_tainted)
cc6b7395 5052 FAIL("Eval-group in insecure regular expression");
54df2634 5053#if PERL_VERSION > 8
923e4eb5 5054 if (IN_PERL_COMPILETIME)
b5c19bd7 5055 PL_cv_has_eval = 1;
54df2634 5056#endif
c277df42 5057 }
b5c19bd7 5058
830247a4 5059 nextchar(pRExC_state);
6136c704 5060 if (is_logical) {
830247a4 5061 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5062 if (!SIZE_ONLY)
5063 ret->flags = 2;
3dab1dad 5064 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5065 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5066 return ret;
5067 }
ccb2c380
MP
5068 ret = reganode(pRExC_state, EVAL, n);
5069 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5070 Set_Node_Offset(ret, parse_start);
5071 return ret;
c277df42 5072 }
fac92740 5073 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5074 {
0a4db386 5075 int is_define= 0;
fac92740 5076 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5077 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5078 || RExC_parse[1] == '<'
830247a4 5079 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5080 I32 flag;
5081
830247a4 5082 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5083 if (!SIZE_ONLY)
5084 ret->flags = 1;
3dab1dad 5085 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5086 goto insert_if;
b81d288d 5087 }
a0ed51b3 5088 }
0a4db386
YO
5089 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5090 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5091 {
5092 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5093 char *name_start= RExC_parse++;
5094 I32 num = 0;
5095 SV *sv_dat=reg_scan_name(pRExC_state,
5096 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5097 if (RExC_parse == name_start || *RExC_parse != ch)
5098 vFAIL2("Sequence (?(%c... not terminated",
5099 (ch == '>' ? '<' : ch));
5100 RExC_parse++;
5101 if (!SIZE_ONLY) {
5102 num = add_data( pRExC_state, 1, "S" );
5103 RExC_rx->data->data[num]=(void*)sv_dat;
5104 SvREFCNT_inc(sv_dat);
5105 }
5106 ret = reganode(pRExC_state,NGROUPP,num);
5107 goto insert_if_check_paren;
5108 }
5109 else if (RExC_parse[0] == 'D' &&
5110 RExC_parse[1] == 'E' &&
5111 RExC_parse[2] == 'F' &&
5112 RExC_parse[3] == 'I' &&
5113 RExC_parse[4] == 'N' &&
5114 RExC_parse[5] == 'E')
5115 {
5116 ret = reganode(pRExC_state,DEFINEP,0);
5117 RExC_parse +=6 ;
5118 is_define = 1;
5119 goto insert_if_check_paren;
5120 }
5121 else if (RExC_parse[0] == 'R') {
5122 RExC_parse++;
5123 parno = 0;
5124 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5125 parno = atoi(RExC_parse++);
5126 while (isDIGIT(*RExC_parse))
5127 RExC_parse++;
5128 } else if (RExC_parse[0] == '&') {
5129 SV *sv_dat;
5130 RExC_parse++;
5131 sv_dat = reg_scan_name(pRExC_state,
5132 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5133 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5134 }
1a147d38 5135 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5136 goto insert_if_check_paren;
5137 }
830247a4 5138 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5139 /* (?(1)...) */
6136c704 5140 char c;
830247a4 5141 parno = atoi(RExC_parse++);
c277df42 5142
830247a4
IZ
5143 while (isDIGIT(*RExC_parse))
5144 RExC_parse++;
fac92740 5145 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5146
0a4db386 5147 insert_if_check_paren:
830247a4 5148 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5149 vFAIL("Switch condition not recognized");
c277df42 5150 insert_if:
3dab1dad
YO
5151 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5152 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5153 if (br == NULL)
830247a4 5154 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5155 else
3dab1dad 5156 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5157 c = *nextchar(pRExC_state);
d1b80229
IZ
5158 if (flags&HASWIDTH)
5159 *flagp |= HASWIDTH;
c277df42 5160 if (c == '|') {
0a4db386
YO
5161 if (is_define)
5162 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5163 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5164 regbranch(pRExC_state, &flags, 1,depth+1);
5165 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5166 if (flags&HASWIDTH)
5167 *flagp |= HASWIDTH;
830247a4 5168 c = *nextchar(pRExC_state);
a0ed51b3
LW
5169 }
5170 else
c277df42
IZ
5171 lastbr = NULL;
5172 if (c != ')')
8615cb43 5173 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5174 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5175 REGTAIL(pRExC_state, br, ender);
c277df42 5176 if (lastbr) {
3dab1dad
YO
5177 REGTAIL(pRExC_state, lastbr, ender);
5178 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5179 }
5180 else
3dab1dad 5181 REGTAIL(pRExC_state, ret, ender);
c277df42 5182 return ret;
a0ed51b3
LW
5183 }
5184 else {
830247a4 5185 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5186 }
5187 }
1b1626e4 5188 case 0:
830247a4 5189 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5190 vFAIL("Sequence (? incomplete");
1b1626e4 5191 break;
a0d0e21e 5192 default:
830247a4 5193 --RExC_parse;
fac92740 5194 parse_flags: /* (?i) */
830247a4 5195 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5196 /* (?g), (?gc) and (?o) are useless here
5197 and must be globally applied -- japhy */
5198
5199 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5200 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5201 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5202 if (! (wastedflags & wflagbit) ) {
5203 wastedflags |= wflagbit;
5204 vWARN5(
5205 RExC_parse + 1,
5206 "Useless (%s%c) - %suse /%c modifier",
5207 flagsp == &negflags ? "?-" : "?",
5208 *RExC_parse,
5209 flagsp == &negflags ? "don't " : "",
5210 *RExC_parse
5211 );
5212 }
5213 }
5214 }
5215 else if (*RExC_parse == 'c') {
5216 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5217 if (! (wastedflags & WASTED_C) ) {
5218 wastedflags |= WASTED_GC;
9d1d55b5
JP
5219 vWARN3(
5220 RExC_parse + 1,
5221 "Useless (%sc) - %suse /gc modifier",
5222 flagsp == &negflags ? "?-" : "?",
5223 flagsp == &negflags ? "don't " : ""
5224 );
5225 }
5226 }
5227 }
5228 else { pmflag(flagsp, *RExC_parse); }
5229
830247a4 5230 ++RExC_parse;
ca9dfc88 5231 }
830247a4 5232 if (*RExC_parse == '-') {
ca9dfc88 5233 flagsp = &negflags;
9d1d55b5 5234 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5235 ++RExC_parse;
ca9dfc88 5236 goto parse_flags;
48c036b1 5237 }
e2509266
JH
5238 RExC_flags |= posflags;
5239 RExC_flags &= ~negflags;
830247a4
IZ
5240 if (*RExC_parse == ':') {
5241 RExC_parse++;
ca9dfc88
IZ
5242 paren = ':';
5243 break;
5244 }
c277df42 5245 unknown:
830247a4
IZ
5246 if (*RExC_parse != ')') {
5247 RExC_parse++;
5248 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5249 }
830247a4 5250 nextchar(pRExC_state);
a0d0e21e
LW
5251 *flagp = TRYAGAIN;
5252 return NULL;
5253 }
5254 }
fac92740 5255 else { /* (...) */
81714fb9 5256 capturing_parens:
830247a4
IZ
5257 parno = RExC_npar;
5258 RExC_npar++;
e2e6a0f1 5259
830247a4 5260 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5261 if (!SIZE_ONLY ){
5262 if (!RExC_nestroot)
5263 RExC_nestroot = parno;
5264 if (RExC_seen & REG_SEEN_RECURSE) {
5265 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5266 "Setting open paren #%"IVdf" to %d\n",
5267 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5268 RExC_open_parens[parno-1]= ret;
5269 }
6bda09f9 5270 }
fac92740
MJD
5271 Set_Node_Length(ret, 1); /* MJD */
5272 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5273 is_open = 1;
a0d0e21e 5274 }
a0ed51b3 5275 }
fac92740 5276 else /* ! paren */
a0d0e21e
LW
5277 ret = NULL;
5278
5279 /* Pick up the branches, linking them together. */
fac92740 5280 parse_start = RExC_parse; /* MJD */
3dab1dad 5281 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5282 /* branch_len = (paren != 0); */
2af232bd 5283
a0d0e21e
LW
5284 if (br == NULL)
5285 return(NULL);
830247a4
IZ
5286 if (*RExC_parse == '|') {
5287 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5288 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5289 }
fac92740 5290 else { /* MJD */
6bda09f9 5291 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5292 Set_Node_Length(br, paren != 0);
5293 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5294 }
c277df42
IZ
5295 have_branch = 1;
5296 if (SIZE_ONLY)
830247a4 5297 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5298 }
5299 else if (paren == ':') {
c277df42
IZ
5300 *flagp |= flags&SIMPLE;
5301 }
6136c704 5302 if (is_open) { /* Starts with OPEN. */
3dab1dad 5303 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5304 }
5305 else if (paren != '?') /* Not Conditional */
a0d0e21e 5306 ret = br;
32a0ca98 5307 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5308 lastbr = br;
830247a4
IZ
5309 while (*RExC_parse == '|') {
5310 if (!SIZE_ONLY && RExC_extralen) {
5311 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5312 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5313 }
5314 if (SIZE_ONLY)
830247a4
IZ
5315 RExC_extralen += 2; /* Account for LONGJMP. */
5316 nextchar(pRExC_state);
3dab1dad 5317 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5318
a687059c 5319 if (br == NULL)
a0d0e21e 5320 return(NULL);
3dab1dad 5321 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5322 lastbr = br;
821b33a5
IZ
5323 if (flags&HASWIDTH)
5324 *flagp |= HASWIDTH;
a687059c 5325 *flagp |= flags&SPSTART;
a0d0e21e
LW
5326 }
5327
c277df42
IZ
5328 if (have_branch || paren != ':') {
5329 /* Make a closing node, and hook it on the end. */
5330 switch (paren) {
5331 case ':':
830247a4 5332 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5333 break;
5334 case 1:
c74340f9 5335 RExC_cpar++;
830247a4 5336 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5337 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5338 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5339 "Setting close paren #%"IVdf" to %d\n",
5340 (IV)parno, REG_NODE_NUM(ender)));
5341 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5342 if (RExC_nestroot == parno)
5343 RExC_nestroot = 0;
40d049e4 5344 }
fac92740
MJD
5345 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5346 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5347 break;
5348 case '<':
c277df42
IZ
5349 case ',':
5350 case '=':
5351 case '!':
c277df42 5352 *flagp &= ~HASWIDTH;
821b33a5
IZ
5353 /* FALL THROUGH */
5354 case '>':
830247a4 5355 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5356 break;
5357 case 0:
830247a4 5358 ender = reg_node(pRExC_state, END);
40d049e4
YO
5359 if (!SIZE_ONLY) {
5360 assert(!RExC_opend); /* there can only be one! */
5361 RExC_opend = ender;
5362 }
c277df42
IZ
5363 break;
5364 }
eaf3ca90 5365 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5366
9674d46a 5367 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5368 if (depth==1)
5369 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5370
c277df42 5371 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5372 for (br = ret; br; br = regnext(br)) {
5373 const U8 op = PL_regkind[OP(br)];
5374 if (op == BRANCH) {
07be1b83 5375 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5376 }
5377 else if (op == BRANCHJ) {
07be1b83 5378 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5379 }
c277df42
IZ
5380 }
5381 }
a0d0e21e 5382 }
c277df42
IZ
5383
5384 {
e1ec3a88
AL
5385 const char *p;
5386 static const char parens[] = "=!<,>";
c277df42
IZ
5387
5388 if (paren && (p = strchr(parens, paren))) {
eb160463 5389 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5390 int flag = (p - parens) > 1;
5391
5392 if (paren == '>')
5393 node = SUSPEND, flag = 0;
6bda09f9 5394 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5395 Set_Node_Cur_Length(ret);
5396 Set_Node_Offset(ret, parse_start + 1);
c277df42 5397 ret->flags = flag;
07be1b83 5398 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5399 }
a0d0e21e
LW
5400 }
5401
5402 /* Check for proper termination. */
ce3e6498 5403 if (paren) {
e2509266 5404 RExC_flags = oregflags;
830247a4
IZ
5405 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5406 RExC_parse = oregcomp_parse;
380a0633 5407 vFAIL("Unmatched (");
ce3e6498 5408 }
a0ed51b3 5409 }
830247a4
IZ
5410 else if (!paren && RExC_parse < RExC_end) {
5411 if (*RExC_parse == ')') {
5412 RExC_parse++;
380a0633 5413 vFAIL("Unmatched )");
a0ed51b3
LW
5414 }
5415 else
b45f050a 5416 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5417 /* NOTREACHED */
5418 }
a687059c 5419
a0d0e21e 5420 return(ret);
a687059c
LW
5421}
5422
5423/*
5424 - regbranch - one alternative of an | operator
5425 *
5426 * Implements the concatenation operator.
5427 */
76e3520e 5428STATIC regnode *
3dab1dad 5429S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5430{
97aff369 5431 dVAR;
c277df42
IZ
5432 register regnode *ret;
5433 register regnode *chain = NULL;
5434 register regnode *latest;
5435 I32 flags = 0, c = 0;
3dab1dad
YO
5436 GET_RE_DEBUG_FLAGS_DECL;
5437 DEBUG_PARSE("brnc");
b81d288d 5438 if (first)
c277df42
IZ
5439 ret = NULL;
5440 else {
b81d288d 5441 if (!SIZE_ONLY && RExC_extralen)
830247a4 5442 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5443 else {
830247a4 5444 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5445 Set_Node_Length(ret, 1);
5446 }
c277df42
IZ
5447 }
5448
b81d288d 5449 if (!first && SIZE_ONLY)
830247a4 5450 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5451
c277df42 5452 *flagp = WORST; /* Tentatively. */
a0d0e21e 5453
830247a4
IZ
5454 RExC_parse--;
5455 nextchar(pRExC_state);
5456 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5457 flags &= ~TRYAGAIN;
3dab1dad 5458 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5459 if (latest == NULL) {
5460 if (flags & TRYAGAIN)
5461 continue;
5462 return(NULL);
a0ed51b3
LW
5463 }
5464 else if (ret == NULL)
c277df42 5465 ret = latest;
a0d0e21e 5466 *flagp |= flags&HASWIDTH;
c277df42 5467 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5468 *flagp |= flags&SPSTART;
5469 else {
830247a4 5470 RExC_naughty++;
3dab1dad 5471 REGTAIL(pRExC_state, chain, latest);
a687059c 5472 }
a0d0e21e 5473 chain = latest;
c277df42
IZ
5474 c++;
5475 }
5476 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5477 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5478 if (ret == NULL)
5479 ret = chain;
5480 }
5481 if (c == 1) {
5482 *flagp |= flags&SIMPLE;
a0d0e21e 5483 }
a687059c 5484
d4c19fe8 5485 return ret;
a687059c
LW
5486}
5487
5488/*
5489 - regpiece - something followed by possible [*+?]
5490 *
5491 * Note that the branching code sequences used for ? and the general cases
5492 * of * and + are somewhat optimized: they use the same NOTHING node as
5493 * both the endmarker for their branch list and the body of the last branch.
5494 * It might seem that this node could be dispensed with entirely, but the
5495 * endmarker role is not redundant.
5496 */
76e3520e 5497STATIC regnode *
3dab1dad 5498S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5499{
97aff369 5500 dVAR;
c277df42 5501 register regnode *ret;
a0d0e21e
LW
5502 register char op;
5503 register char *next;
5504 I32 flags;
1df70142 5505 const char * const origparse = RExC_parse;
a0d0e21e 5506 I32 min;
c277df42 5507 I32 max = REG_INFTY;
fac92740 5508 char *parse_start;
10edeb5d 5509 const char *maxpos = NULL;
3dab1dad
YO
5510 GET_RE_DEBUG_FLAGS_DECL;
5511 DEBUG_PARSE("piec");
a0d0e21e 5512
3dab1dad 5513 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5514 if (ret == NULL) {
5515 if (flags & TRYAGAIN)
5516 *flagp |= TRYAGAIN;
5517 return(NULL);
5518 }
5519
830247a4 5520 op = *RExC_parse;
a0d0e21e 5521
830247a4 5522 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5523 maxpos = NULL;
fac92740 5524 parse_start = RExC_parse; /* MJD */
830247a4 5525 next = RExC_parse + 1;
a0d0e21e
LW
5526 while (isDIGIT(*next) || *next == ',') {
5527 if (*next == ',') {
5528 if (maxpos)
5529 break;
5530 else
5531 maxpos = next;
a687059c 5532 }
a0d0e21e
LW
5533 next++;
5534 }
5535 if (*next == '}') { /* got one */
5536 if (!maxpos)
5537 maxpos = next;
830247a4
IZ
5538 RExC_parse++;
5539 min = atoi(RExC_parse);
a0d0e21e
LW
5540 if (*maxpos == ',')
5541 maxpos++;
5542 else
830247a4 5543 maxpos = RExC_parse;
a0d0e21e
LW
5544 max = atoi(maxpos);
5545 if (!max && *maxpos != '0')
c277df42
IZ
5546 max = REG_INFTY; /* meaning "infinity" */
5547 else if (max >= REG_INFTY)
8615cb43 5548 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5549 RExC_parse = next;
5550 nextchar(pRExC_state);
a0d0e21e
LW
5551
5552 do_curly:
5553 if ((flags&SIMPLE)) {
830247a4 5554 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5555 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5556 Set_Node_Offset(ret, parse_start+1); /* MJD */
5557 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5558 }
5559 else {
3dab1dad 5560 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5561
5562 w->flags = 0;
3dab1dad 5563 REGTAIL(pRExC_state, ret, w);
830247a4 5564 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5565 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5566 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5567 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5568 }
6bda09f9 5569 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5570 /* MJD hk */
5571 Set_Node_Offset(ret, parse_start+1);
2af232bd 5572 Set_Node_Length(ret,
fac92740 5573 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5574
830247a4 5575 if (!SIZE_ONLY && RExC_extralen)
c277df42 5576 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5577 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5578 if (SIZE_ONLY)
830247a4
IZ
5579 RExC_whilem_seen++, RExC_extralen += 3;
5580 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5581 }
c277df42 5582 ret->flags = 0;
a0d0e21e
LW
5583
5584 if (min > 0)
821b33a5
IZ
5585 *flagp = WORST;
5586 if (max > 0)
5587 *flagp |= HASWIDTH;
a0d0e21e 5588 if (max && max < min)
8615cb43 5589 vFAIL("Can't do {n,m} with n > m");
c277df42 5590 if (!SIZE_ONLY) {
eb160463
GS
5591 ARG1_SET(ret, (U16)min);
5592 ARG2_SET(ret, (U16)max);
a687059c 5593 }
a687059c 5594
a0d0e21e 5595 goto nest_check;
a687059c 5596 }
a0d0e21e 5597 }
a687059c 5598
a0d0e21e
LW
5599 if (!ISMULT1(op)) {
5600 *flagp = flags;
a687059c 5601 return(ret);
a0d0e21e 5602 }
bb20fd44 5603
c277df42 5604#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5605
5606 /* if this is reinstated, don't forget to put this back into perldiag:
5607
5608 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5609
5610 (F) The part of the regexp subject to either the * or + quantifier
5611 could match an empty string. The {#} shows in the regular
5612 expression about where the problem was discovered.
5613
5614 */
5615
bb20fd44 5616 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5617 vFAIL("Regexp *+ operand could be empty");
b81d288d 5618#endif
bb20fd44 5619
fac92740 5620 parse_start = RExC_parse;
830247a4 5621 nextchar(pRExC_state);
a0d0e21e 5622
821b33a5 5623 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5624
5625 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5626 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5627 ret->flags = 0;
830247a4 5628 RExC_naughty += 4;
a0d0e21e
LW
5629 }
5630 else if (op == '*') {
5631 min = 0;
5632 goto do_curly;
a0ed51b3
LW
5633 }
5634 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5635 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5636 ret->flags = 0;
830247a4 5637 RExC_naughty += 3;
a0d0e21e
LW
5638 }
5639 else if (op == '+') {
5640 min = 1;
5641 goto do_curly;
a0ed51b3
LW
5642 }
5643 else if (op == '?') {
a0d0e21e
LW
5644 min = 0; max = 1;
5645 goto do_curly;
5646 }
5647 nest_check:
041457d9 5648 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5649 vWARN3(RExC_parse,
b45f050a 5650 "%.*s matches null string many times",
afd78fd5 5651 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5652 origparse);
a0d0e21e
LW
5653 }
5654
b9b4dddf 5655 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5656 nextchar(pRExC_state);
6bda09f9 5657 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5658 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5659 }
b9b4dddf
YO
5660#ifndef REG_ALLOW_MINMOD_SUSPEND
5661 else
5662#endif
5663 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5664 regnode *ender;
5665 nextchar(pRExC_state);
5666 ender = reg_node(pRExC_state, SUCCEED);
5667 REGTAIL(pRExC_state, ret, ender);
5668 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5669 ret->flags = 0;
5670 ender = reg_node(pRExC_state, TAIL);
5671 REGTAIL(pRExC_state, ret, ender);
5672 /*ret= ender;*/
5673 }
5674
5675 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5676 RExC_parse++;
b45f050a
JF
5677 vFAIL("Nested quantifiers");
5678 }
a0d0e21e
LW
5679
5680 return(ret);
a687059c
LW
5681}
5682
fc8cd66c
YO
5683
5684/* reg_namedseq(pRExC_state,UVp)
5685
5686 This is expected to be called by a parser routine that has
5687 recognized'\N' and needs to handle the rest. RExC_parse is
5688 expected to point at the first char following the N at the time
5689 of the call.
5690
5691 If valuep is non-null then it is assumed that we are parsing inside
5692 of a charclass definition and the first codepoint in the resolved
5693 string is returned via *valuep and the routine will return NULL.
5694 In this mode if a multichar string is returned from the charnames
5695 handler a warning will be issued, and only the first char in the
5696 sequence will be examined. If the string returned is zero length
5697 then the value of *valuep is undefined and NON-NULL will
5698 be returned to indicate failure. (This will NOT be a valid pointer
5699 to a regnode.)
5700
5701 If value is null then it is assumed that we are parsing normal text
5702 and inserts a new EXACT node into the program containing the resolved
5703 string and returns a pointer to the new node. If the string is
5704 zerolength a NOTHING node is emitted.
5705
5706 On success RExC_parse is set to the char following the endbrace.
5707 Parsing failures will generate a fatal errorvia vFAIL(...)
5708
5709 NOTE: We cache all results from the charnames handler locally in
5710 the RExC_charnames hash (created on first use) to prevent a charnames
5711 handler from playing silly-buggers and returning a short string and
5712 then a long string for a given pattern. Since the regexp program
5713 size is calculated during an initial parse this would result
5714 in a buffer overrun so we cache to prevent the charname result from
5715 changing during the course of the parse.
5716
5717 */
5718STATIC regnode *
5719S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5720{
5721 char * name; /* start of the content of the name */
5722 char * endbrace; /* endbrace following the name */
5723 SV *sv_str = NULL;
5724 SV *sv_name = NULL;
5725 STRLEN len; /* this has various purposes throughout the code */
5726 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5727 regnode *ret = NULL;
5728
5729 if (*RExC_parse != '{') {
5730 vFAIL("Missing braces on \\N{}");
5731 }
5732 name = RExC_parse+1;
5733 endbrace = strchr(RExC_parse, '}');
5734 if ( ! endbrace ) {
5735 RExC_parse++;
5736 vFAIL("Missing right brace on \\N{}");
5737 }
5738 RExC_parse = endbrace + 1;
5739
5740
5741 /* RExC_parse points at the beginning brace,
5742 endbrace points at the last */
5743 if ( name[0]=='U' && name[1]=='+' ) {
5744 /* its a "unicode hex" notation {U+89AB} */
5745 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5746 | PERL_SCAN_DISALLOW_PREFIX
5747 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5748 UV cp;
196f1508 5749 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5750 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5751 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5752 cp = 0xFFFD;
5753 }
5754 if (cp > 0xff)
5755 RExC_utf8 = 1;
5756 if ( valuep ) {
5757 *valuep = cp;
5758 return NULL;
5759 }
5760 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5761 } else {
5762 /* fetch the charnames handler for this scope */
5763 HV * const table = GvHV(PL_hintgv);
5764 SV **cvp= table ?
5765 hv_fetchs(table, "charnames", FALSE) :
5766 NULL;
5767 SV *cv= cvp ? *cvp : NULL;
5768 HE *he_str;
5769 int count;
5770 /* create an SV with the name as argument */
5771 sv_name = newSVpvn(name, endbrace - name);
5772
5773 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5774 vFAIL2("Constant(\\N{%s}) unknown: "
5775 "(possibly a missing \"use charnames ...\")",
5776 SvPVX(sv_name));
5777 }
5778 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5779 vFAIL2("Constant(\\N{%s}): "
5780 "$^H{charnames} is not defined",SvPVX(sv_name));
5781 }
5782
5783
5784
5785 if (!RExC_charnames) {
5786 /* make sure our cache is allocated */
5787 RExC_charnames = newHV();
6bda09f9 5788 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5789 }
5790 /* see if we have looked this one up before */
5791 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5792 if ( he_str ) {
5793 sv_str = HeVAL(he_str);
5794 cached = 1;
5795 } else {
5796 dSP ;
5797
5798 ENTER ;
5799 SAVETMPS ;
5800 PUSHMARK(SP) ;
5801
5802 XPUSHs(sv_name);
5803
5804 PUTBACK ;
5805
5806 count= call_sv(cv, G_SCALAR);
5807
5808 if (count == 1) { /* XXXX is this right? dmq */
5809 sv_str = POPs;
5810 SvREFCNT_inc_simple_void(sv_str);
5811 }
5812
5813 SPAGAIN ;
5814 PUTBACK ;
5815 FREETMPS ;
5816 LEAVE ;
5817
5818 if ( !sv_str || !SvOK(sv_str) ) {
5819 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5820 "did not return a defined value",SvPVX(sv_name));
5821 }
5822 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5823 cached = 1;
5824 }
5825 }
5826 if (valuep) {
5827 char *p = SvPV(sv_str, len);
5828 if (len) {
5829 STRLEN numlen = 1;
5830 if ( SvUTF8(sv_str) ) {
196f1508 5831 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5832 if (*valuep > 0x7F)
5833 RExC_utf8 = 1;
5834 /* XXXX
5835 We have to turn on utf8 for high bit chars otherwise
5836 we get failures with
5837
5838 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5839 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5840
5841 This is different from what \x{} would do with the same
5842 codepoint, where the condition is > 0xFF.
5843 - dmq
5844 */
5845
5846
5847 } else {
5848 *valuep = (UV)*p;
5849 /* warn if we havent used the whole string? */
5850 }
5851 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5852 vWARN2(RExC_parse,
5853 "Ignoring excess chars from \\N{%s} in character class",
5854 SvPVX(sv_name)
5855 );
5856 }
5857 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5858 vWARN2(RExC_parse,
5859 "Ignoring zero length \\N{%s} in character class",
5860 SvPVX(sv_name)
5861 );
5862 }
5863 if (sv_name)
5864 SvREFCNT_dec(sv_name);
5865 if (!cached)
5866 SvREFCNT_dec(sv_str);
5867 return len ? NULL : (regnode *)&len;
5868 } else if(SvCUR(sv_str)) {
5869
5870 char *s;
5871 char *p, *pend;
5872 STRLEN charlen = 1;
5873 char * parse_start = name-3; /* needed for the offsets */
5874 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5875
5876 ret = reg_node(pRExC_state,
5877 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5878 s= STRING(ret);
5879
5880 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5881 sv_utf8_upgrade(sv_str);
5882 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5883 RExC_utf8= 1;
5884 }
5885
5886 p = SvPV(sv_str, len);
5887 pend = p + len;
5888 /* len is the length written, charlen is the size the char read */
5889 for ( len = 0; p < pend; p += charlen ) {
5890 if (UTF) {
196f1508 5891 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5892 if (FOLD) {
5893 STRLEN foldlen,numlen;
5894 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5895 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5896 /* Emit all the Unicode characters. */
5897
5898 for (foldbuf = tmpbuf;
5899 foldlen;
5900 foldlen -= numlen)
5901 {
5902 uvc = utf8_to_uvchr(foldbuf, &numlen);
5903 if (numlen > 0) {
5904 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5905 s += unilen;
5906 len += unilen;
5907 /* In EBCDIC the numlen
5908 * and unilen can differ. */
5909 foldbuf += numlen;
5910 if (numlen >= foldlen)
5911 break;
5912 }
5913 else
5914 break; /* "Can't happen." */
5915 }
5916 } else {
5917 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5918 if (unilen > 0) {
5919 s += unilen;
5920 len += unilen;
5921 }
5922 }
5923 } else {
5924 len++;
5925 REGC(*p, s++);
5926 }
5927 }
5928 if (SIZE_ONLY) {
5929 RExC_size += STR_SZ(len);
5930 } else {
5931 STR_LEN(ret) = len;
5932 RExC_emit += STR_SZ(len);
5933 }
5934 Set_Node_Cur_Length(ret); /* MJD */
5935 RExC_parse--;
5936 nextchar(pRExC_state);
5937 } else {
5938 ret = reg_node(pRExC_state,NOTHING);
5939 }
5940 if (!cached) {
5941 SvREFCNT_dec(sv_str);
5942 }
5943 if (sv_name) {
5944 SvREFCNT_dec(sv_name);
5945 }
5946 return ret;
5947
5948}
5949
5950
9e08bc66
TS
5951/*
5952 * reg_recode
5953 *
5954 * It returns the code point in utf8 for the value in *encp.
5955 * value: a code value in the source encoding
5956 * encp: a pointer to an Encode object
5957 *
5958 * If the result from Encode is not a single character,
5959 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5960 */
5961STATIC UV
5962S_reg_recode(pTHX_ const char value, SV **encp)
5963{
5964 STRLEN numlen = 1;
5965 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5966 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5967 : SvPVX(sv);
5968 const STRLEN newlen = SvCUR(sv);
5969 UV uv = UNICODE_REPLACEMENT;
5970
5971 if (newlen)
5972 uv = SvUTF8(sv)
5973 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5974 : *(U8*)s;
5975
5976 if (!newlen || numlen != newlen) {
5977 uv = UNICODE_REPLACEMENT;
5978 if (encp)
5979 *encp = NULL;
5980 }
5981 return uv;
5982}
5983
fc8cd66c 5984
a687059c
LW
5985/*
5986 - regatom - the lowest level
5987 *
5988 * Optimization: gobbles an entire sequence of ordinary characters so that
5989 * it can turn them into a single node, which is smaller to store and
5990 * faster to run. Backslashed characters are exceptions, each becoming a
5991 * separate node; the code is simpler that way and it's not worth fixing.
5992 *
7f6f358c
YO
5993 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5994 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5995 */
76e3520e 5996STATIC regnode *
3dab1dad 5997S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5998{
97aff369 5999 dVAR;
cbbf8932 6000 register regnode *ret = NULL;
a0d0e21e 6001 I32 flags;
45948336 6002 char *parse_start = RExC_parse;
3dab1dad
YO
6003 GET_RE_DEBUG_FLAGS_DECL;
6004 DEBUG_PARSE("atom");
a0d0e21e
LW
6005 *flagp = WORST; /* Tentatively. */
6006
6007tryagain:
830247a4 6008 switch (*RExC_parse) {
a0d0e21e 6009 case '^':
830247a4
IZ
6010 RExC_seen_zerolen++;
6011 nextchar(pRExC_state);
e2509266 6012 if (RExC_flags & PMf_MULTILINE)
830247a4 6013 ret = reg_node(pRExC_state, MBOL);
e2509266 6014 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6015 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6016 else
830247a4 6017 ret = reg_node(pRExC_state, BOL);
fac92740 6018 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6019 break;
6020 case '$':
830247a4 6021 nextchar(pRExC_state);
b81d288d 6022 if (*RExC_parse)
830247a4 6023 RExC_seen_zerolen++;
e2509266 6024 if (RExC_flags & PMf_MULTILINE)
830247a4 6025 ret = reg_node(pRExC_state, MEOL);
e2509266 6026 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6027 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6028 else
830247a4 6029 ret = reg_node(pRExC_state, EOL);
fac92740 6030 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6031 break;
6032 case '.':
830247a4 6033 nextchar(pRExC_state);
e2509266 6034 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6035 ret = reg_node(pRExC_state, SANY);
6036 else
6037 ret = reg_node(pRExC_state, REG_ANY);
6038 *flagp |= HASWIDTH|SIMPLE;
830247a4 6039 RExC_naughty++;
fac92740 6040 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6041 break;
6042 case '[':
b45f050a 6043 {
3dab1dad
YO
6044 char * const oregcomp_parse = ++RExC_parse;
6045 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6046 if (*RExC_parse != ']') {
6047 RExC_parse = oregcomp_parse;
b45f050a
JF
6048 vFAIL("Unmatched [");
6049 }
830247a4 6050 nextchar(pRExC_state);
a0d0e21e 6051 *flagp |= HASWIDTH|SIMPLE;
fac92740 6052 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6053 break;
b45f050a 6054 }
a0d0e21e 6055 case '(':
830247a4 6056 nextchar(pRExC_state);
3dab1dad 6057 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6058 if (ret == NULL) {
bf93d4cc 6059 if (flags & TRYAGAIN) {
830247a4 6060 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6061 /* Make parent create an empty node if needed. */
6062 *flagp |= TRYAGAIN;
6063 return(NULL);
6064 }
a0d0e21e 6065 goto tryagain;
bf93d4cc 6066 }
a0d0e21e
LW
6067 return(NULL);
6068 }
c277df42 6069 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6070 break;
6071 case '|':
6072 case ')':
6073 if (flags & TRYAGAIN) {
6074 *flagp |= TRYAGAIN;
6075 return NULL;
6076 }
b45f050a 6077 vFAIL("Internal urp");
a0d0e21e
LW
6078 /* Supposed to be caught earlier. */
6079 break;
85afd4ae 6080 case '{':
830247a4
IZ
6081 if (!regcurly(RExC_parse)) {
6082 RExC_parse++;
85afd4ae
CS
6083 goto defchar;
6084 }
6085 /* FALL THROUGH */
a0d0e21e
LW
6086 case '?':
6087 case '+':
6088 case '*':
830247a4 6089 RExC_parse++;
b45f050a 6090 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6091 break;
6092 case '\\':
830247a4 6093 switch (*++RExC_parse) {
a0d0e21e 6094 case 'A':
830247a4
IZ
6095 RExC_seen_zerolen++;
6096 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6097 *flagp |= SIMPLE;
830247a4 6098 nextchar(pRExC_state);
fac92740 6099 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6100 break;
6101 case 'G':
830247a4
IZ
6102 ret = reg_node(pRExC_state, GPOS);
6103 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6104 *flagp |= SIMPLE;
830247a4 6105 nextchar(pRExC_state);
fac92740 6106 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6107 break;
6108 case 'Z':
830247a4 6109 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6110 *flagp |= SIMPLE;
a1917ab9 6111 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6112 nextchar(pRExC_state);
a0d0e21e 6113 break;
b85d18e9 6114 case 'z':
830247a4 6115 ret = reg_node(pRExC_state, EOS);
b85d18e9 6116 *flagp |= SIMPLE;
830247a4
IZ
6117 RExC_seen_zerolen++; /* Do not optimize RE away */
6118 nextchar(pRExC_state);
fac92740 6119 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6120 break;
4a2d328f 6121 case 'C':
f33976b4
DB
6122 ret = reg_node(pRExC_state, CANY);
6123 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6124 *flagp |= HASWIDTH|SIMPLE;
830247a4 6125 nextchar(pRExC_state);
fac92740 6126 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6127 break;
6128 case 'X':
830247a4 6129 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6130 *flagp |= HASWIDTH;
830247a4 6131 nextchar(pRExC_state);
fac92740 6132 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6133 break;
a0d0e21e 6134 case 'w':
eb160463 6135 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6136 *flagp |= HASWIDTH|SIMPLE;
830247a4 6137 nextchar(pRExC_state);
fac92740 6138 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6139 break;
6140 case 'W':
eb160463 6141 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6142 *flagp |= HASWIDTH|SIMPLE;
830247a4 6143 nextchar(pRExC_state);
fac92740 6144 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6145 break;
6146 case 'b':
830247a4
IZ
6147 RExC_seen_zerolen++;
6148 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6149 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6150 *flagp |= SIMPLE;
830247a4 6151 nextchar(pRExC_state);
fac92740 6152 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6153 break;
6154 case 'B':
830247a4
IZ
6155 RExC_seen_zerolen++;
6156 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6157 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6158 *flagp |= SIMPLE;
830247a4 6159 nextchar(pRExC_state);
fac92740 6160 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6161 break;
6162 case 's':
eb160463 6163 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6164 *flagp |= HASWIDTH|SIMPLE;
830247a4 6165 nextchar(pRExC_state);
fac92740 6166 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6167 break;
6168 case 'S':
eb160463 6169 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6170 *flagp |= HASWIDTH|SIMPLE;
830247a4 6171 nextchar(pRExC_state);
fac92740 6172 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6173 break;
6174 case 'd':
ffc61ed2 6175 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6176 *flagp |= HASWIDTH|SIMPLE;
830247a4 6177 nextchar(pRExC_state);
fac92740 6178 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6179 break;
6180 case 'D':
ffc61ed2 6181 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6182 *flagp |= HASWIDTH|SIMPLE;
830247a4 6183 nextchar(pRExC_state);
fac92740 6184 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6185 break;
a14b48bc
LW
6186 case 'p':
6187 case 'P':
3568d838 6188 {
3dab1dad 6189 char* const oldregxend = RExC_end;
ccb2c380 6190 char* parse_start = RExC_parse - 2;
a14b48bc 6191
830247a4 6192 if (RExC_parse[1] == '{') {
3568d838 6193 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6194 RExC_end = strchr(RExC_parse, '}');
6195 if (!RExC_end) {
3dab1dad 6196 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6197 RExC_parse += 2;
6198 RExC_end = oldregxend;
0da60cf5 6199 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6200 }
830247a4 6201 RExC_end++;
a14b48bc 6202 }
af6f566e 6203 else {
830247a4 6204 RExC_end = RExC_parse + 2;
af6f566e
HS
6205 if (RExC_end > oldregxend)
6206 RExC_end = oldregxend;
6207 }
830247a4 6208 RExC_parse--;
a14b48bc 6209
3dab1dad 6210 ret = regclass(pRExC_state,depth+1);
a14b48bc 6211
830247a4
IZ
6212 RExC_end = oldregxend;
6213 RExC_parse--;
ccb2c380
MP
6214
6215 Set_Node_Offset(ret, parse_start + 2);
6216 Set_Node_Cur_Length(ret);
830247a4 6217 nextchar(pRExC_state);
a14b48bc
LW
6218 *flagp |= HASWIDTH|SIMPLE;
6219 }
6220 break;
fc8cd66c
YO
6221 case 'N':
6222 /* Handle \N{NAME} here and not below because it can be
6223 multicharacter. join_exact() will join them up later on.
6224 Also this makes sure that things like /\N{BLAH}+/ and
6225 \N{BLAH} being multi char Just Happen. dmq*/
6226 ++RExC_parse;
6227 ret= reg_namedseq(pRExC_state, NULL);
6228 break;
0a4db386 6229 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6230 {
6231 char ch= RExC_parse[1];
6232 if (ch != '<' && ch != '\'') {
6233 if (SIZE_ONLY)
6234 vWARN( RExC_parse + 1,
6235 "Possible broken named back reference treated as literal k");
6236 parse_start--;
6237 goto defchar;
6238 } else {
6239 char* name_start = (RExC_parse += 2);
6240 I32 num = 0;
0a4db386
YO
6241 SV *sv_dat = reg_scan_name(pRExC_state,
6242 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6243 ch= (ch == '<') ? '>' : '\'';
6244
81714fb9
YO
6245 if (RExC_parse == name_start || *RExC_parse != ch)
6246 vFAIL2("Sequence \\k%c... not terminated",
6247 (ch == '>' ? '<' : ch));
6248
6249 RExC_sawback = 1;
6250 ret = reganode(pRExC_state,
6251 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6252 num);
6253 *flagp |= HASWIDTH;
6254
6255
6256 if (!SIZE_ONLY) {
81714fb9
YO
6257 num = add_data( pRExC_state, 1, "S" );
6258 ARG_SET(ret,num);
6259 RExC_rx->data->data[num]=(void*)sv_dat;
6260 SvREFCNT_inc(sv_dat);
6261 }
6262 /* override incorrect value set in reganode MJD */
6263 Set_Node_Offset(ret, parse_start+1);
6264 Set_Node_Cur_Length(ret); /* MJD */
6265 nextchar(pRExC_state);
6266
6267 }
6268 break;
6269 }
a0d0e21e
LW
6270 case 'n':
6271 case 'r':
6272 case 't':
6273 case 'f':
6274 case 'e':
6275 case 'a':
6276 case 'x':
6277 case 'c':
6278 case '0':
6279 goto defchar;
c74340f9 6280 case 'R':
a0d0e21e
LW
6281 case '1': case '2': case '3': case '4':
6282 case '5': case '6': case '7': case '8': case '9':
6283 {
c74340f9
YO
6284 I32 num;
6285 bool isrel=(*RExC_parse=='R');
6286 if (isrel)
6287 RExC_parse++;
6288 num = atoi(RExC_parse);
6289 if (isrel) {
6290 num = RExC_cpar - num;
6291 if (num < 1)
6292 vFAIL("Reference to nonexistent or unclosed group");
6293 }
830247a4 6294 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6295 goto defchar;
6296 else {
3dab1dad 6297 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6298 while (isDIGIT(*RExC_parse))
6299 RExC_parse++;
b45f050a 6300
c74340f9
YO
6301 if (!SIZE_ONLY) {
6302 if (num > (I32)RExC_rx->nparens)
6303 vFAIL("Reference to nonexistent group");
6304 /* People make this error all the time apparently.
6305 So we cant fail on it, even though we should
6306
6307 else if (num >= RExC_cpar)
6308 vFAIL("Reference to unclosed group will always match");
6309 */
6310 }
830247a4 6311 RExC_sawback = 1;
eb160463
GS
6312 ret = reganode(pRExC_state,
6313 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6314 num);
a0d0e21e 6315 *flagp |= HASWIDTH;
2af232bd 6316
fac92740 6317 /* override incorrect value set in reganode MJD */
2af232bd 6318 Set_Node_Offset(ret, parse_start+1);
fac92740 6319 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6320 RExC_parse--;
6321 nextchar(pRExC_state);
a0d0e21e
LW
6322 }
6323 }
6324 break;
6325 case '\0':
830247a4 6326 if (RExC_parse >= RExC_end)
b45f050a 6327 FAIL("Trailing \\");
a0d0e21e
LW
6328 /* FALL THROUGH */
6329 default:
a0288114 6330 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6331 back into the quick-grab loop below */
45948336 6332 parse_start--;
a0d0e21e
LW
6333 goto defchar;
6334 }
6335 break;
4633a7c4
LW
6336
6337 case '#':
e2509266 6338 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6339 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6340 RExC_parse++;
830247a4 6341 if (RExC_parse < RExC_end)
4633a7c4
LW
6342 goto tryagain;
6343 }
6344 /* FALL THROUGH */
6345
a0d0e21e 6346 default: {
ba210ebe 6347 register STRLEN len;
58ae7d3f 6348 register UV ender;
a0d0e21e 6349 register char *p;
3dab1dad 6350 char *s;
80aecb99 6351 STRLEN foldlen;
89ebb4a3 6352 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6353
6354 parse_start = RExC_parse - 1;
a0d0e21e 6355
830247a4 6356 RExC_parse++;
a0d0e21e
LW
6357
6358 defchar:
58ae7d3f 6359 ender = 0;
eb160463
GS
6360 ret = reg_node(pRExC_state,
6361 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6362 s = STRING(ret);
830247a4
IZ
6363 for (len = 0, p = RExC_parse - 1;
6364 len < 127 && p < RExC_end;
a0d0e21e
LW
6365 len++)
6366 {
3dab1dad 6367 char * const oldp = p;
5b5a24f7 6368
e2509266 6369 if (RExC_flags & PMf_EXTENDED)
830247a4 6370 p = regwhite(p, RExC_end);
a0d0e21e
LW
6371 switch (*p) {
6372 case '^':
6373 case '$':
6374 case '.':
6375 case '[':
6376 case '(':
6377 case ')':
6378 case '|':
6379 goto loopdone;
6380 case '\\':
6381 switch (*++p) {
6382 case 'A':
1ed8eac0
JF
6383 case 'C':
6384 case 'X':
a0d0e21e
LW
6385 case 'G':
6386 case 'Z':
b85d18e9 6387 case 'z':
a0d0e21e
LW
6388 case 'w':
6389 case 'W':
6390 case 'b':
6391 case 'B':
6392 case 's':
6393 case 'S':
6394 case 'd':
6395 case 'D':
a14b48bc
LW
6396 case 'p':
6397 case 'P':
fc8cd66c 6398 case 'N':
c74340f9 6399 case 'R':
a0d0e21e
LW
6400 --p;
6401 goto loopdone;
6402 case 'n':
6403 ender = '\n';
6404 p++;
a687059c 6405 break;
a0d0e21e
LW
6406 case 'r':
6407 ender = '\r';
6408 p++;
a687059c 6409 break;
a0d0e21e
LW
6410 case 't':
6411 ender = '\t';
6412 p++;
a687059c 6413 break;
a0d0e21e
LW
6414 case 'f':
6415 ender = '\f';
6416 p++;
a687059c 6417 break;
a0d0e21e 6418 case 'e':
c7f1f016 6419 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6420 p++;
a687059c 6421 break;
a0d0e21e 6422 case 'a':
c7f1f016 6423 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6424 p++;
a687059c 6425 break;
a0d0e21e 6426 case 'x':
a0ed51b3 6427 if (*++p == '{') {
1df70142 6428 char* const e = strchr(p, '}');
b81d288d 6429
b45f050a 6430 if (!e) {
830247a4 6431 RExC_parse = p + 1;
b45f050a
JF
6432 vFAIL("Missing right brace on \\x{}");
6433 }
de5f0749 6434 else {
a4c04bdc
NC
6435 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6436 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6437 STRLEN numlen = e - p - 1;
53305cf1 6438 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6439 if (ender > 0xff)
6440 RExC_utf8 = 1;
a0ed51b3
LW
6441 p = e + 1;
6442 }
a0ed51b3
LW
6443 }
6444 else {
a4c04bdc 6445 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6446 STRLEN numlen = 2;
53305cf1 6447 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6448 p += numlen;
6449 }
9e08bc66
TS
6450 if (PL_encoding && ender < 0x100)
6451 goto recode_encoding;
a687059c 6452 break;
a0d0e21e
LW
6453 case 'c':
6454 p++;
bbce6d69 6455 ender = UCHARAT(p++);
6456 ender = toCTRL(ender);
a687059c 6457 break;
a0d0e21e
LW
6458 case '0': case '1': case '2': case '3':case '4':
6459 case '5': case '6': case '7': case '8':case '9':
6460 if (*p == '0' ||
830247a4 6461 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6462 I32 flags = 0;
1df70142 6463 STRLEN numlen = 3;
53305cf1 6464 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6465 p += numlen;
6466 }
6467 else {
6468 --p;
6469 goto loopdone;
a687059c 6470 }
9e08bc66
TS
6471 if (PL_encoding && ender < 0x100)
6472 goto recode_encoding;
6473 break;
6474 recode_encoding:
6475 {
6476 SV* enc = PL_encoding;
6477 ender = reg_recode((const char)(U8)ender, &enc);
6478 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6479 vWARN(p, "Invalid escape in the specified encoding");
6480 RExC_utf8 = 1;
6481 }
a687059c 6482 break;
a0d0e21e 6483 case '\0':
830247a4 6484 if (p >= RExC_end)
b45f050a 6485 FAIL("Trailing \\");
a687059c 6486 /* FALL THROUGH */
a0d0e21e 6487 default:
041457d9 6488 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6489 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6490 goto normal_default;
a0d0e21e
LW
6491 }
6492 break;
a687059c 6493 default:
a0ed51b3 6494 normal_default:
fd400ab9 6495 if (UTF8_IS_START(*p) && UTF) {
1df70142 6496 STRLEN numlen;
5e12f4fb 6497 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6498 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6499 p += numlen;
6500 }
6501 else
6502 ender = *p++;
a0d0e21e 6503 break;
a687059c 6504 }
e2509266 6505 if (RExC_flags & PMf_EXTENDED)
830247a4 6506 p = regwhite(p, RExC_end);
60a8b682
JH
6507 if (UTF && FOLD) {
6508 /* Prime the casefolded buffer. */
ac7e0132 6509 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6510 }
a0d0e21e
LW
6511 if (ISMULT2(p)) { /* Back off on ?+*. */
6512 if (len)
6513 p = oldp;
16ea2a2e 6514 else if (UTF) {
80aecb99 6515 if (FOLD) {
60a8b682 6516 /* Emit all the Unicode characters. */
1df70142 6517 STRLEN numlen;
80aecb99
JH
6518 for (foldbuf = tmpbuf;
6519 foldlen;
6520 foldlen -= numlen) {
6521 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6522 if (numlen > 0) {
71207a34 6523 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6524 s += unilen;
6525 len += unilen;
6526 /* In EBCDIC the numlen
6527 * and unilen can differ. */
9dc45d57 6528 foldbuf += numlen;
47654450
JH
6529 if (numlen >= foldlen)
6530 break;
9dc45d57
JH
6531 }
6532 else
6533 break; /* "Can't happen." */
80aecb99
JH
6534 }
6535 }
6536 else {
71207a34 6537 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6538 if (unilen > 0) {
0ebc6274
JH
6539 s += unilen;
6540 len += unilen;
9dc45d57 6541 }
80aecb99 6542 }
a0ed51b3 6543 }
a0d0e21e
LW
6544 else {
6545 len++;
eb160463 6546 REGC((char)ender, s++);
a0d0e21e
LW
6547 }
6548 break;
a687059c 6549 }
16ea2a2e 6550 if (UTF) {
80aecb99 6551 if (FOLD) {
60a8b682 6552 /* Emit all the Unicode characters. */
1df70142 6553 STRLEN numlen;
80aecb99
JH
6554 for (foldbuf = tmpbuf;
6555 foldlen;
6556 foldlen -= numlen) {
6557 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6558 if (numlen > 0) {
71207a34 6559 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6560 len += unilen;
6561 s += unilen;
6562 /* In EBCDIC the numlen
6563 * and unilen can differ. */
9dc45d57 6564 foldbuf += numlen;
47654450
JH
6565 if (numlen >= foldlen)
6566 break;
9dc45d57
JH
6567 }
6568 else
6569 break;
80aecb99
JH
6570 }
6571 }
6572 else {
71207a34 6573 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6574 if (unilen > 0) {
0ebc6274
JH
6575 s += unilen;
6576 len += unilen;
9dc45d57 6577 }
80aecb99
JH
6578 }
6579 len--;
a0ed51b3
LW
6580 }
6581 else
eb160463 6582 REGC((char)ender, s++);
a0d0e21e
LW
6583 }
6584 loopdone:
830247a4 6585 RExC_parse = p - 1;
fac92740 6586 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6587 nextchar(pRExC_state);
793db0cb
JH
6588 {
6589 /* len is STRLEN which is unsigned, need to copy to signed */
6590 IV iv = len;
6591 if (iv < 0)
6592 vFAIL("Internal disaster");
6593 }
a0d0e21e
LW
6594 if (len > 0)
6595 *flagp |= HASWIDTH;
090f7165 6596 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6597 *flagp |= SIMPLE;
3dab1dad 6598
cd439c50 6599 if (SIZE_ONLY)
830247a4 6600 RExC_size += STR_SZ(len);
3dab1dad
YO
6601 else {
6602 STR_LEN(ret) = len;
830247a4 6603 RExC_emit += STR_SZ(len);
07be1b83 6604 }
3dab1dad 6605 }
a0d0e21e
LW
6606 break;
6607 }
a687059c 6608
a0d0e21e 6609 return(ret);
a687059c
LW
6610}
6611
873ef191 6612STATIC char *
5f66b61c 6613S_regwhite(char *p, const char *e)
5b5a24f7
CS
6614{
6615 while (p < e) {
6616 if (isSPACE(*p))
6617 ++p;
6618 else if (*p == '#') {
6619 do {
6620 p++;
6621 } while (p < e && *p != '\n');
6622 }
6623 else
6624 break;
6625 }
6626 return p;
6627}
6628
b8c5462f
JH
6629/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6630 Character classes ([:foo:]) can also be negated ([:^foo:]).
6631 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6632 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6633 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6634
6635#define POSIXCC_DONE(c) ((c) == ':')
6636#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6637#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6638
b8c5462f 6639STATIC I32
830247a4 6640S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6641{
97aff369 6642 dVAR;
936ed897 6643 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6644
830247a4 6645 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6646 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6647 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6648 const char c = UCHARAT(RExC_parse);
097eb12c 6649 char* const s = RExC_parse++;
b81d288d 6650
9a86a77b 6651 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6652 RExC_parse++;
6653 if (RExC_parse == RExC_end)
620e46c5 6654 /* Grandfather lone [:, [=, [. */
830247a4 6655 RExC_parse = s;
620e46c5 6656 else {
3dab1dad 6657 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6658 assert(*t == c);
6659
9a86a77b 6660 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6661 const char *posixcc = s + 1;
830247a4 6662 RExC_parse++; /* skip over the ending ] */
3dab1dad 6663
b8c5462f 6664 if (*s == ':') {
1df70142
AL
6665 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6666 const I32 skip = t - posixcc;
80916619
NC
6667
6668 /* Initially switch on the length of the name. */
6669 switch (skip) {
6670 case 4:
3dab1dad
YO
6671 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6672 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6673 break;
80916619
NC
6674 case 5:
6675 /* Names all of length 5. */
6676 /* alnum alpha ascii blank cntrl digit graph lower
6677 print punct space upper */
6678 /* Offset 4 gives the best switch position. */
6679 switch (posixcc[4]) {
6680 case 'a':
3dab1dad
YO
6681 if (memEQ(posixcc, "alph", 4)) /* alpha */
6682 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6683 break;
6684 case 'e':
3dab1dad
YO
6685 if (memEQ(posixcc, "spac", 4)) /* space */
6686 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6687 break;
6688 case 'h':
3dab1dad
YO
6689 if (memEQ(posixcc, "grap", 4)) /* graph */
6690 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6691 break;
6692 case 'i':
3dab1dad
YO
6693 if (memEQ(posixcc, "asci", 4)) /* ascii */
6694 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6695 break;
6696 case 'k':
3dab1dad
YO
6697 if (memEQ(posixcc, "blan", 4)) /* blank */
6698 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6699 break;
6700 case 'l':
3dab1dad
YO
6701 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6702 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6703 break;
6704 case 'm':
3dab1dad
YO
6705 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6706 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6707 break;
6708 case 'r':
3dab1dad
YO
6709 if (memEQ(posixcc, "lowe", 4)) /* lower */
6710 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6711 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6712 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6713 break;
6714 case 't':
3dab1dad
YO
6715 if (memEQ(posixcc, "digi", 4)) /* digit */
6716 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6717 else if (memEQ(posixcc, "prin", 4)) /* print */
6718 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6719 else if (memEQ(posixcc, "punc", 4)) /* punct */
6720 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6721 break;
b8c5462f
JH
6722 }
6723 break;
80916619 6724 case 6:
3dab1dad
YO
6725 if (memEQ(posixcc, "xdigit", 6))
6726 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6727 break;
6728 }
80916619
NC
6729
6730 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6731 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6732 t - s - 1, s + 1);
80916619
NC
6733 assert (posixcc[skip] == ':');
6734 assert (posixcc[skip+1] == ']');
b45f050a 6735 } else if (!SIZE_ONLY) {
b8c5462f 6736 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6737
830247a4 6738 /* adjust RExC_parse so the warning shows after
b45f050a 6739 the class closes */
9a86a77b 6740 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6741 RExC_parse++;
b45f050a
JF
6742 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6743 }
b8c5462f
JH
6744 } else {
6745 /* Maternal grandfather:
6746 * "[:" ending in ":" but not in ":]" */
830247a4 6747 RExC_parse = s;
767d463e 6748 }
620e46c5
JH
6749 }
6750 }
6751
b8c5462f
JH
6752 return namedclass;
6753}
6754
6755STATIC void
830247a4 6756S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6757{
97aff369 6758 dVAR;
3dab1dad 6759 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6760 const char *s = RExC_parse;
6761 const char c = *s++;
b8c5462f 6762
3dab1dad 6763 while (isALNUM(*s))
b8c5462f
JH
6764 s++;
6765 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6766 if (ckWARN(WARN_REGEXP))
6767 vWARN3(s+2,
6768 "POSIX syntax [%c %c] belongs inside character classes",
6769 c, c);
b45f050a
JF
6770
6771 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6772 if (POSIXCC_NOTYET(c)) {
830247a4 6773 /* adjust RExC_parse so the error shows after
b45f050a 6774 the class closes */
9a86a77b 6775 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6776 NOOP;
b45f050a
JF
6777 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6778 }
b8c5462f
JH
6779 }
6780 }
620e46c5
JH
6781}
6782
7f6f358c
YO
6783
6784/*
6785 parse a class specification and produce either an ANYOF node that
6786 matches the pattern. If the pattern matches a single char only and
6787 that char is < 256 then we produce an EXACT node instead.
6788*/
76e3520e 6789STATIC regnode *
3dab1dad 6790S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6791{
97aff369 6792 dVAR;
9ef43ace 6793 register UV value = 0;
9a86a77b 6794 register UV nextvalue;
3568d838 6795 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6796 register IV range = 0;
c277df42 6797 register regnode *ret;
ba210ebe 6798 STRLEN numlen;
ffc61ed2 6799 IV namedclass;
cbbf8932 6800 char *rangebegin = NULL;
936ed897 6801 bool need_class = 0;
c445ea15 6802 SV *listsv = NULL;
ffc61ed2 6803 UV n;
9e55ce06 6804 bool optimize_invert = TRUE;
cbbf8932 6805 AV* unicode_alternate = NULL;
1b2d223b
JH
6806#ifdef EBCDIC
6807 UV literal_endpoint = 0;
6808#endif
7f6f358c 6809 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6810
3dab1dad 6811 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6812 case we need to change the emitted regop to an EXACT. */
07be1b83 6813 const char * orig_parse = RExC_parse;
72f13be8 6814 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6815#ifndef DEBUGGING
6816 PERL_UNUSED_ARG(depth);
6817#endif
72f13be8 6818
3dab1dad 6819 DEBUG_PARSE("clas");
7f6f358c
YO
6820
6821 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6822 ret = reganode(pRExC_state, ANYOF, 0);
6823
6824 if (!SIZE_ONLY)
6825 ANYOF_FLAGS(ret) = 0;
6826
9a86a77b 6827 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6828 RExC_naughty++;
6829 RExC_parse++;
6830 if (!SIZE_ONLY)
6831 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6832 }
a0d0e21e 6833
73060fc4 6834 if (SIZE_ONLY) {
830247a4 6835 RExC_size += ANYOF_SKIP;
73060fc4
JH
6836 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6837 }
936ed897 6838 else {
830247a4 6839 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6840 if (FOLD)
6841 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6842 if (LOC)
6843 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6844 ANYOF_BITMAP_ZERO(ret);
396482e1 6845 listsv = newSVpvs("# comment\n");
a0d0e21e 6846 }
b8c5462f 6847
9a86a77b
JH
6848 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6849
b938889d 6850 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6851 checkposixcc(pRExC_state);
b8c5462f 6852
f064b6ad
HS
6853 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6854 if (UCHARAT(RExC_parse) == ']')
6855 goto charclassloop;
ffc61ed2 6856
fc8cd66c 6857parseit:
9a86a77b 6858 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6859
6860 charclassloop:
6861
6862 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6863
73b437c8 6864 if (!range)
830247a4 6865 rangebegin = RExC_parse;
ffc61ed2 6866 if (UTF) {
5e12f4fb 6867 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6868 RExC_end - RExC_parse,
9f7f3913 6869 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6870 RExC_parse += numlen;
6871 }
6872 else
6873 value = UCHARAT(RExC_parse++);
7f6f358c 6874
9a86a77b
JH
6875 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6876 if (value == '[' && POSIXCC(nextvalue))
830247a4 6877 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6878 else if (value == '\\') {
ffc61ed2 6879 if (UTF) {
5e12f4fb 6880 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6881 RExC_end - RExC_parse,
9f7f3913 6882 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6883 RExC_parse += numlen;
6884 }
6885 else
6886 value = UCHARAT(RExC_parse++);
470c3474 6887 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6888 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6889 * be a problem later if we want switch on Unicode.
6890 * A similar issue a little bit later when switching on
6891 * namedclass. --jhi */
ffc61ed2 6892 switch ((I32)value) {
b8c5462f
JH
6893 case 'w': namedclass = ANYOF_ALNUM; break;
6894 case 'W': namedclass = ANYOF_NALNUM; break;
6895 case 's': namedclass = ANYOF_SPACE; break;
6896 case 'S': namedclass = ANYOF_NSPACE; break;
6897 case 'd': namedclass = ANYOF_DIGIT; break;
6898 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6899 case 'N': /* Handle \N{NAME} in class */
6900 {
6901 /* We only pay attention to the first char of
6902 multichar strings being returned. I kinda wonder
6903 if this makes sense as it does change the behaviour
6904 from earlier versions, OTOH that behaviour was broken
6905 as well. */
6906 UV v; /* value is register so we cant & it /grrr */
6907 if (reg_namedseq(pRExC_state, &v)) {
6908 goto parseit;
6909 }
6910 value= v;
6911 }
6912 break;
ffc61ed2
JH
6913 case 'p':
6914 case 'P':
3dab1dad
YO
6915 {
6916 char *e;
af6f566e 6917 if (RExC_parse >= RExC_end)
2a4859cd 6918 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6919 if (*RExC_parse == '{') {
1df70142 6920 const U8 c = (U8)value;
ffc61ed2
JH
6921 e = strchr(RExC_parse++, '}');
6922 if (!e)
0da60cf5 6923 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6924 while (isSPACE(UCHARAT(RExC_parse)))
6925 RExC_parse++;
6926 if (e == RExC_parse)
0da60cf5 6927 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6928 n = e - RExC_parse;
ab13f0c7
JH
6929 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6930 n--;
ffc61ed2
JH
6931 }
6932 else {
6933 e = RExC_parse;
6934 n = 1;
6935 }
6936 if (!SIZE_ONLY) {
ab13f0c7
JH
6937 if (UCHARAT(RExC_parse) == '^') {
6938 RExC_parse++;
6939 n--;
6940 value = value == 'p' ? 'P' : 'p'; /* toggle */
6941 while (isSPACE(UCHARAT(RExC_parse))) {
6942 RExC_parse++;
6943 n--;
6944 }
6945 }
097eb12c
AL
6946 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6947 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6948 }
6949 RExC_parse = e + 1;
6950 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6951 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6952 }
f81125e2 6953 break;
b8c5462f
JH
6954 case 'n': value = '\n'; break;
6955 case 'r': value = '\r'; break;
6956 case 't': value = '\t'; break;
6957 case 'f': value = '\f'; break;
6958 case 'b': value = '\b'; break;
c7f1f016
NIS
6959 case 'e': value = ASCII_TO_NATIVE('\033');break;
6960 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6961 case 'x':
ffc61ed2 6962 if (*RExC_parse == '{') {
a4c04bdc
NC
6963 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6964 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6965 char * const e = strchr(RExC_parse++, '}');
b81d288d 6966 if (!e)
ffc61ed2 6967 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6968
6969 numlen = e - RExC_parse;
6970 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6971 RExC_parse = e + 1;
6972 }
6973 else {
a4c04bdc 6974 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6975 numlen = 2;
6976 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6977 RExC_parse += numlen;
6978 }
9e08bc66
TS
6979 if (PL_encoding && value < 0x100)
6980 goto recode_encoding;
b8c5462f
JH
6981 break;
6982 case 'c':
830247a4 6983 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6984 value = toCTRL(value);
6985 break;
6986 case '0': case '1': case '2': case '3': case '4':
6987 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
6988 {
6989 I32 flags = 0;
6990 numlen = 3;
6991 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6992 RExC_parse += numlen;
6993 if (PL_encoding && value < 0x100)
6994 goto recode_encoding;
6995 break;
6996 }
6997 recode_encoding:
6998 {
6999 SV* enc = PL_encoding;
7000 value = reg_recode((const char)(U8)value, &enc);
7001 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7002 vWARN(RExC_parse,
7003 "Invalid escape in the specified encoding");
7004 break;
7005 }
1028017a 7006 default:
041457d9 7007 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7008 vWARN2(RExC_parse,
7009 "Unrecognized escape \\%c in character class passed through",
7010 (int)value);
1028017a 7011 break;
b8c5462f 7012 }
ffc61ed2 7013 } /* end of \blah */
1b2d223b
JH
7014#ifdef EBCDIC
7015 else
7016 literal_endpoint++;
7017#endif
ffc61ed2
JH
7018
7019 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7020
7021 if (!SIZE_ONLY && !need_class)
936ed897 7022 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7023
936ed897 7024 need_class = 1;
ffc61ed2
JH
7025
7026 /* a bad range like a-\d, a-[:digit:] ? */
7027 if (range) {
73b437c8 7028 if (!SIZE_ONLY) {
afd78fd5 7029 if (ckWARN(WARN_REGEXP)) {
097eb12c 7030 const int w =
afd78fd5
JH
7031 RExC_parse >= rangebegin ?
7032 RExC_parse - rangebegin : 0;
830247a4 7033 vWARN4(RExC_parse,
b45f050a 7034 "False [] range \"%*.*s\"",
097eb12c 7035 w, w, rangebegin);
afd78fd5 7036 }
3568d838
JH
7037 if (prevvalue < 256) {
7038 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7039 ANYOF_BITMAP_SET(ret, '-');
7040 }
7041 else {
7042 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7043 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7044 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7045 }
b8c5462f 7046 }
ffc61ed2
JH
7047
7048 range = 0; /* this was not a true range */
73b437c8 7049 }
ffc61ed2 7050
73b437c8 7051 if (!SIZE_ONLY) {
c49a72a9
NC
7052 const char *what = NULL;
7053 char yesno = 0;
7054
3568d838
JH
7055 if (namedclass > OOB_NAMEDCLASS)
7056 optimize_invert = FALSE;
e2962f66
JH
7057 /* Possible truncation here but in some 64-bit environments
7058 * the compiler gets heartburn about switch on 64-bit values.
7059 * A similar issue a little earlier when switching on value.
98f323fa 7060 * --jhi */
e2962f66 7061 switch ((I32)namedclass) {
73b437c8
JH
7062 case ANYOF_ALNUM:
7063 if (LOC)
936ed897 7064 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7065 else {
7066 for (value = 0; value < 256; value++)
7067 if (isALNUM(value))
936ed897 7068 ANYOF_BITMAP_SET(ret, value);
73b437c8 7069 }
c49a72a9
NC
7070 yesno = '+';
7071 what = "Word";
73b437c8
JH
7072 break;
7073 case ANYOF_NALNUM:
7074 if (LOC)
936ed897 7075 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7076 else {
7077 for (value = 0; value < 256; value++)
7078 if (!isALNUM(value))
936ed897 7079 ANYOF_BITMAP_SET(ret, value);
73b437c8 7080 }
c49a72a9
NC
7081 yesno = '!';
7082 what = "Word";
73b437c8 7083 break;
ffc61ed2 7084 case ANYOF_ALNUMC:
73b437c8 7085 if (LOC)
ffc61ed2 7086 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7087 else {
7088 for (value = 0; value < 256; value++)
ffc61ed2 7089 if (isALNUMC(value))
936ed897 7090 ANYOF_BITMAP_SET(ret, value);
73b437c8 7091 }
c49a72a9
NC
7092 yesno = '+';
7093 what = "Alnum";
73b437c8
JH
7094 break;
7095 case ANYOF_NALNUMC:
7096 if (LOC)
936ed897 7097 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7098 else {
7099 for (value = 0; value < 256; value++)
7100 if (!isALNUMC(value))
936ed897 7101 ANYOF_BITMAP_SET(ret, value);
73b437c8 7102 }
c49a72a9
NC
7103 yesno = '!';
7104 what = "Alnum";
73b437c8
JH
7105 break;
7106 case ANYOF_ALPHA:
7107 if (LOC)
936ed897 7108 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7109 else {
7110 for (value = 0; value < 256; value++)
7111 if (isALPHA(value))
936ed897 7112 ANYOF_BITMAP_SET(ret, value);
73b437c8 7113 }
c49a72a9
NC
7114 yesno = '+';
7115 what = "Alpha";
73b437c8
JH
7116 break;
7117 case ANYOF_NALPHA:
7118 if (LOC)
936ed897 7119 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7120 else {
7121 for (value = 0; value < 256; value++)
7122 if (!isALPHA(value))
936ed897 7123 ANYOF_BITMAP_SET(ret, value);
73b437c8 7124 }
c49a72a9
NC
7125 yesno = '!';
7126 what = "Alpha";
73b437c8
JH
7127 break;
7128 case ANYOF_ASCII:
7129 if (LOC)
936ed897 7130 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7131 else {
c7f1f016 7132#ifndef EBCDIC
1ba5c669
JH
7133 for (value = 0; value < 128; value++)
7134 ANYOF_BITMAP_SET(ret, value);
7135#else /* EBCDIC */
ffbc6a93 7136 for (value = 0; value < 256; value++) {
3a3c4447
JH
7137 if (isASCII(value))
7138 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7139 }
1ba5c669 7140#endif /* EBCDIC */
73b437c8 7141 }
c49a72a9
NC
7142 yesno = '+';
7143 what = "ASCII";
73b437c8
JH
7144 break;
7145 case ANYOF_NASCII:
7146 if (LOC)
936ed897 7147 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7148 else {
c7f1f016 7149#ifndef EBCDIC
1ba5c669
JH
7150 for (value = 128; value < 256; value++)
7151 ANYOF_BITMAP_SET(ret, value);
7152#else /* EBCDIC */
ffbc6a93 7153 for (value = 0; value < 256; value++) {
3a3c4447
JH
7154 if (!isASCII(value))
7155 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7156 }
1ba5c669 7157#endif /* EBCDIC */
73b437c8 7158 }
c49a72a9
NC
7159 yesno = '!';
7160 what = "ASCII";
73b437c8 7161 break;
aaa51d5e
JF
7162 case ANYOF_BLANK:
7163 if (LOC)
7164 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7165 else {
7166 for (value = 0; value < 256; value++)
7167 if (isBLANK(value))
7168 ANYOF_BITMAP_SET(ret, value);
7169 }
c49a72a9
NC
7170 yesno = '+';
7171 what = "Blank";
aaa51d5e
JF
7172 break;
7173 case ANYOF_NBLANK:
7174 if (LOC)
7175 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7176 else {
7177 for (value = 0; value < 256; value++)
7178 if (!isBLANK(value))
7179 ANYOF_BITMAP_SET(ret, value);
7180 }
c49a72a9
NC
7181 yesno = '!';
7182 what = "Blank";
aaa51d5e 7183 break;
73b437c8
JH
7184 case ANYOF_CNTRL:
7185 if (LOC)
936ed897 7186 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7187 else {
7188 for (value = 0; value < 256; value++)
7189 if (isCNTRL(value))
936ed897 7190 ANYOF_BITMAP_SET(ret, value);
73b437c8 7191 }
c49a72a9
NC
7192 yesno = '+';
7193 what = "Cntrl";
73b437c8
JH
7194 break;
7195 case ANYOF_NCNTRL:
7196 if (LOC)
936ed897 7197 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7198 else {
7199 for (value = 0; value < 256; value++)
7200 if (!isCNTRL(value))
936ed897 7201 ANYOF_BITMAP_SET(ret, value);
73b437c8 7202 }
c49a72a9
NC
7203 yesno = '!';
7204 what = "Cntrl";
ffc61ed2
JH
7205 break;
7206 case ANYOF_DIGIT:
7207 if (LOC)
7208 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7209 else {
7210 /* consecutive digits assumed */
7211 for (value = '0'; value <= '9'; value++)
7212 ANYOF_BITMAP_SET(ret, value);
7213 }
c49a72a9
NC
7214 yesno = '+';
7215 what = "Digit";
ffc61ed2
JH
7216 break;
7217 case ANYOF_NDIGIT:
7218 if (LOC)
7219 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7220 else {
7221 /* consecutive digits assumed */
7222 for (value = 0; value < '0'; value++)
7223 ANYOF_BITMAP_SET(ret, value);
7224 for (value = '9' + 1; value < 256; value++)
7225 ANYOF_BITMAP_SET(ret, value);
7226 }
c49a72a9
NC
7227 yesno = '!';
7228 what = "Digit";
73b437c8
JH
7229 break;
7230 case ANYOF_GRAPH:
7231 if (LOC)
936ed897 7232 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7233 else {
7234 for (value = 0; value < 256; value++)
7235 if (isGRAPH(value))
936ed897 7236 ANYOF_BITMAP_SET(ret, value);
73b437c8 7237 }
c49a72a9
NC
7238 yesno = '+';
7239 what = "Graph";
73b437c8
JH
7240 break;
7241 case ANYOF_NGRAPH:
7242 if (LOC)
936ed897 7243 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7244 else {
7245 for (value = 0; value < 256; value++)
7246 if (!isGRAPH(value))
936ed897 7247 ANYOF_BITMAP_SET(ret, value);
73b437c8 7248 }
c49a72a9
NC
7249 yesno = '!';
7250 what = "Graph";
73b437c8
JH
7251 break;
7252 case ANYOF_LOWER:
7253 if (LOC)
936ed897 7254 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7255 else {
7256 for (value = 0; value < 256; value++)
7257 if (isLOWER(value))
936ed897 7258 ANYOF_BITMAP_SET(ret, value);
73b437c8 7259 }
c49a72a9
NC
7260 yesno = '+';
7261 what = "Lower";
73b437c8
JH
7262 break;
7263 case ANYOF_NLOWER:
7264 if (LOC)
936ed897 7265 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7266 else {
7267 for (value = 0; value < 256; value++)
7268 if (!isLOWER(value))
936ed897 7269 ANYOF_BITMAP_SET(ret, value);
73b437c8 7270 }
c49a72a9
NC
7271 yesno = '!';
7272 what = "Lower";
73b437c8
JH
7273 break;
7274 case ANYOF_PRINT:
7275 if (LOC)
936ed897 7276 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7277 else {
7278 for (value = 0; value < 256; value++)
7279 if (isPRINT(value))
936ed897 7280 ANYOF_BITMAP_SET(ret, value);
73b437c8 7281 }
c49a72a9
NC
7282 yesno = '+';
7283 what = "Print";
73b437c8
JH
7284 break;
7285 case ANYOF_NPRINT:
7286 if (LOC)
936ed897 7287 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7288 else {
7289 for (value = 0; value < 256; value++)
7290 if (!isPRINT(value))
936ed897 7291 ANYOF_BITMAP_SET(ret, value);
73b437c8 7292 }
c49a72a9
NC
7293 yesno = '!';
7294 what = "Print";
73b437c8 7295 break;
aaa51d5e
JF
7296 case ANYOF_PSXSPC:
7297 if (LOC)
7298 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7299 else {
7300 for (value = 0; value < 256; value++)
7301 if (isPSXSPC(value))
7302 ANYOF_BITMAP_SET(ret, value);
7303 }
c49a72a9
NC
7304 yesno = '+';
7305 what = "Space";
aaa51d5e
JF
7306 break;
7307 case ANYOF_NPSXSPC:
7308 if (LOC)
7309 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7310 else {
7311 for (value = 0; value < 256; value++)
7312 if (!isPSXSPC(value))
7313 ANYOF_BITMAP_SET(ret, value);
7314 }
c49a72a9
NC
7315 yesno = '!';
7316 what = "Space";
aaa51d5e 7317 break;
73b437c8
JH
7318 case ANYOF_PUNCT:
7319 if (LOC)
936ed897 7320 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7321 else {
7322 for (value = 0; value < 256; value++)
7323 if (isPUNCT(value))
936ed897 7324 ANYOF_BITMAP_SET(ret, value);
73b437c8 7325 }
c49a72a9
NC
7326 yesno = '+';
7327 what = "Punct";
73b437c8
JH
7328 break;
7329 case ANYOF_NPUNCT:
7330 if (LOC)
936ed897 7331 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7332 else {
7333 for (value = 0; value < 256; value++)
7334 if (!isPUNCT(value))
936ed897 7335 ANYOF_BITMAP_SET(ret, value);
73b437c8 7336 }
c49a72a9
NC
7337 yesno = '!';
7338 what = "Punct";
ffc61ed2
JH
7339 break;
7340 case ANYOF_SPACE:
7341 if (LOC)
7342 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7343 else {
7344 for (value = 0; value < 256; value++)
7345 if (isSPACE(value))
7346 ANYOF_BITMAP_SET(ret, value);
7347 }
c49a72a9
NC
7348 yesno = '+';
7349 what = "SpacePerl";
ffc61ed2
JH
7350 break;
7351 case ANYOF_NSPACE:
7352 if (LOC)
7353 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7354 else {
7355 for (value = 0; value < 256; value++)
7356 if (!isSPACE(value))
7357 ANYOF_BITMAP_SET(ret, value);
7358 }
c49a72a9
NC
7359 yesno = '!';
7360 what = "SpacePerl";
73b437c8
JH
7361 break;
7362 case ANYOF_UPPER:
7363 if (LOC)
936ed897 7364 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7365 else {
7366 for (value = 0; value < 256; value++)
7367 if (isUPPER(value))
936ed897 7368 ANYOF_BITMAP_SET(ret, value);
73b437c8 7369 }
c49a72a9
NC
7370 yesno = '+';
7371 what = "Upper";
73b437c8
JH
7372 break;
7373 case ANYOF_NUPPER:
7374 if (LOC)
936ed897 7375 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7376 else {
7377 for (value = 0; value < 256; value++)
7378 if (!isUPPER(value))
936ed897 7379 ANYOF_BITMAP_SET(ret, value);
73b437c8 7380 }
c49a72a9
NC
7381 yesno = '!';
7382 what = "Upper";
73b437c8
JH
7383 break;
7384 case ANYOF_XDIGIT:
7385 if (LOC)
936ed897 7386 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7387 else {
7388 for (value = 0; value < 256; value++)
7389 if (isXDIGIT(value))
936ed897 7390 ANYOF_BITMAP_SET(ret, value);
73b437c8 7391 }
c49a72a9
NC
7392 yesno = '+';
7393 what = "XDigit";
73b437c8
JH
7394 break;
7395 case ANYOF_NXDIGIT:
7396 if (LOC)
936ed897 7397 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7398 else {
7399 for (value = 0; value < 256; value++)
7400 if (!isXDIGIT(value))
936ed897 7401 ANYOF_BITMAP_SET(ret, value);
73b437c8 7402 }
c49a72a9
NC
7403 yesno = '!';
7404 what = "XDigit";
73b437c8 7405 break;
f81125e2
JP
7406 case ANYOF_MAX:
7407 /* this is to handle \p and \P */
7408 break;
73b437c8 7409 default:
b45f050a 7410 vFAIL("Invalid [::] class");
73b437c8 7411 break;
b8c5462f 7412 }
c49a72a9
NC
7413 if (what) {
7414 /* Strings such as "+utf8::isWord\n" */
7415 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7416 }
b8c5462f 7417 if (LOC)
936ed897 7418 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7419 continue;
a0d0e21e 7420 }
ffc61ed2
JH
7421 } /* end of namedclass \blah */
7422
a0d0e21e 7423 if (range) {
eb160463 7424 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7425 const int w = RExC_parse - rangebegin;
7426 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7427 range = 0; /* not a valid range */
73b437c8 7428 }
a0d0e21e
LW
7429 }
7430 else {
3568d838 7431 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7432 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7433 RExC_parse[1] != ']') {
7434 RExC_parse++;
ffc61ed2
JH
7435
7436 /* a bad range like \w-, [:word:]- ? */
7437 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7438 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7439 const int w =
afd78fd5
JH
7440 RExC_parse >= rangebegin ?
7441 RExC_parse - rangebegin : 0;
830247a4 7442 vWARN4(RExC_parse,
b45f050a 7443 "False [] range \"%*.*s\"",
097eb12c 7444 w, w, rangebegin);
afd78fd5 7445 }
73b437c8 7446 if (!SIZE_ONLY)
936ed897 7447 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7448 } else
ffc61ed2
JH
7449 range = 1; /* yeah, it's a range! */
7450 continue; /* but do it the next time */
a0d0e21e 7451 }
a687059c 7452 }
ffc61ed2 7453
93733859 7454 /* now is the next time */
07be1b83 7455 /*stored += (value - prevvalue + 1);*/
ae5c130c 7456 if (!SIZE_ONLY) {
3568d838 7457 if (prevvalue < 256) {
1df70142 7458 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7459 IV i;
3568d838 7460#ifdef EBCDIC
1b2d223b
JH
7461 /* In EBCDIC [\x89-\x91] should include
7462 * the \x8e but [i-j] should not. */
7463 if (literal_endpoint == 2 &&
7464 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7465 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7466 {
3568d838
JH
7467 if (isLOWER(prevvalue)) {
7468 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7469 if (isLOWER(i))
7470 ANYOF_BITMAP_SET(ret, i);
7471 } else {
3568d838 7472 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7473 if (isUPPER(i))
7474 ANYOF_BITMAP_SET(ret, i);
7475 }
8ada0baa 7476 }
ffc61ed2 7477 else
8ada0baa 7478#endif
07be1b83
YO
7479 for (i = prevvalue; i <= ceilvalue; i++) {
7480 if (!ANYOF_BITMAP_TEST(ret,i)) {
7481 stored++;
7482 ANYOF_BITMAP_SET(ret, i);
7483 }
7484 }
3568d838 7485 }
a5961de5 7486 if (value > 255 || UTF) {
1df70142
AL
7487 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7488 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7489 stored+=2; /* can't optimize this class */
ffc61ed2 7490 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7491 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7492 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7493 prevnatvalue, natvalue);
7494 }
7495 else if (prevnatvalue == natvalue) {
7496 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7497 if (FOLD) {
89ebb4a3 7498 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7499 STRLEN foldlen;
1df70142 7500 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7501
e294cc5d
JH
7502#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7503 if (RExC_precomp[0] == ':' &&
7504 RExC_precomp[1] == '[' &&
7505 (f == 0xDF || f == 0x92)) {
7506 f = NATIVE_TO_UNI(f);
7507 }
7508#endif
c840d2a2
JH
7509 /* If folding and foldable and a single
7510 * character, insert also the folded version
7511 * to the charclass. */
9e55ce06 7512 if (f != value) {
e294cc5d
JH
7513#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7514 if ((RExC_precomp[0] == ':' &&
7515 RExC_precomp[1] == '[' &&
7516 (f == 0xA2 &&
7517 (value == 0xFB05 || value == 0xFB06))) ?
7518 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7519 foldlen == (STRLEN)UNISKIP(f) )
7520#else
eb160463 7521 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7522#endif
9e55ce06
JH
7523 Perl_sv_catpvf(aTHX_ listsv,
7524 "%04"UVxf"\n", f);
7525 else {
7526 /* Any multicharacter foldings
7527 * require the following transform:
7528 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7529 * where E folds into "pq" and F folds
7530 * into "rst", all other characters
7531 * fold to single characters. We save
7532 * away these multicharacter foldings,
7533 * to be later saved as part of the
7534 * additional "s" data. */
7535 SV *sv;
7536
7537 if (!unicode_alternate)
7538 unicode_alternate = newAV();
7539 sv = newSVpvn((char*)foldbuf, foldlen);
7540 SvUTF8_on(sv);
7541 av_push(unicode_alternate, sv);
7542 }
7543 }
254ba52a 7544
60a8b682
JH
7545 /* If folding and the value is one of the Greek
7546 * sigmas insert a few more sigmas to make the
7547 * folding rules of the sigmas to work right.
7548 * Note that not all the possible combinations
7549 * are handled here: some of them are handled
9e55ce06
JH
7550 * by the standard folding rules, and some of
7551 * them (literal or EXACTF cases) are handled
7552 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7553 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7554 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7555 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7556 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7557 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7558 }
7559 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7560 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7561 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7562 }
7563 }
ffc61ed2 7564 }
1b2d223b
JH
7565#ifdef EBCDIC
7566 literal_endpoint = 0;
7567#endif
8ada0baa 7568 }
ffc61ed2
JH
7569
7570 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7571 }
ffc61ed2 7572
936ed897 7573 if (need_class) {
4f66b38d 7574 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7575 if (SIZE_ONLY)
830247a4 7576 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7577 else
830247a4 7578 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7579 }
ffc61ed2 7580
7f6f358c
YO
7581
7582 if (SIZE_ONLY)
7583 return ret;
7584 /****** !SIZE_ONLY AFTER HERE *********/
7585
7586 if( stored == 1 && value < 256
7587 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7588 ) {
7589 /* optimize single char class to an EXACT node
7590 but *only* when its not a UTF/high char */
07be1b83
YO
7591 const char * cur_parse= RExC_parse;
7592 RExC_emit = (regnode *)orig_emit;
7593 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7594 ret = reg_node(pRExC_state,
7595 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7596 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7597 *STRING(ret)= (char)value;
7598 STR_LEN(ret)= 1;
7599 RExC_emit += STR_SZ(1);
7600 return ret;
7601 }
ae5c130c 7602 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7603 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7604 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7605 ) {
a0ed51b3 7606 for (value = 0; value < 256; ++value) {
936ed897 7607 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7608 UV fold = PL_fold[value];
ffc61ed2
JH
7609
7610 if (fold != value)
7611 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7612 }
7613 }
936ed897 7614 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7615 }
ffc61ed2 7616
ae5c130c 7617 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7618 if (optimize_invert &&
ffc61ed2
JH
7619 /* If the only flag is inversion. */
7620 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7621 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7622 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7623 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7624 }
7f6f358c 7625 {
097eb12c 7626 AV * const av = newAV();
ffc61ed2 7627 SV *rv;
9e55ce06 7628 /* The 0th element stores the character class description
6a0407ee 7629 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7630 * to initialize the appropriate swash (which gets stored in
7631 * the 1st element), and also useful for dumping the regnode.
7632 * The 2nd element stores the multicharacter foldings,
6a0407ee 7633 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7634 av_store(av, 0, listsv);
7635 av_store(av, 1, NULL);
9e55ce06 7636 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7637 rv = newRV_noinc((SV*)av);
19860706 7638 n = add_data(pRExC_state, 1, "s");
830247a4 7639 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7640 ARG_SET(ret, n);
a0ed51b3 7641 }
a0ed51b3
LW
7642 return ret;
7643}
7644
76e3520e 7645STATIC char*
830247a4 7646S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7647{
097eb12c 7648 char* const retval = RExC_parse++;
a0d0e21e 7649
4633a7c4 7650 for (;;) {
830247a4
IZ
7651 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7652 RExC_parse[2] == '#') {
e994fd66
AE
7653 while (*RExC_parse != ')') {
7654 if (RExC_parse == RExC_end)
7655 FAIL("Sequence (?#... not terminated");
830247a4 7656 RExC_parse++;
e994fd66 7657 }
830247a4 7658 RExC_parse++;
4633a7c4
LW
7659 continue;
7660 }
e2509266 7661 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7662 if (isSPACE(*RExC_parse)) {
7663 RExC_parse++;
748a9306
LW
7664 continue;
7665 }
830247a4 7666 else if (*RExC_parse == '#') {
e994fd66
AE
7667 while (RExC_parse < RExC_end)
7668 if (*RExC_parse++ == '\n') break;
748a9306
LW
7669 continue;
7670 }
748a9306 7671 }
4633a7c4 7672 return retval;
a0d0e21e 7673 }
a687059c
LW
7674}
7675
7676/*
c277df42 7677- reg_node - emit a node
a0d0e21e 7678*/
76e3520e 7679STATIC regnode * /* Location. */
830247a4 7680S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7681{
97aff369 7682 dVAR;
c277df42 7683 register regnode *ptr;
504618e9 7684 regnode * const ret = RExC_emit;
07be1b83 7685 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7686
c277df42 7687 if (SIZE_ONLY) {
830247a4
IZ
7688 SIZE_ALIGN(RExC_size);
7689 RExC_size += 1;
a0d0e21e
LW
7690 return(ret);
7691 }
e2e6a0f1
YO
7692#ifdef DEBUGGING
7693 if (OP(RExC_emit) == 255)
7694 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7695 reg_name[op], OP(RExC_emit));
7696#endif
c277df42 7697 NODE_ALIGN_FILL(ret);
a0d0e21e 7698 ptr = ret;
c277df42 7699 FILL_ADVANCE_NODE(ptr, op);
fac92740 7700 if (RExC_offsets) { /* MJD */
07be1b83 7701 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7702 "reg_node", __LINE__,
7703 reg_name[op],
07be1b83
YO
7704 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7705 ? "Overwriting end of array!\n" : "OK",
7706 (UV)(RExC_emit - RExC_emit_start),
7707 (UV)(RExC_parse - RExC_start),
7708 (UV)RExC_offsets[0]));
ccb2c380 7709 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7710 }
07be1b83 7711
830247a4 7712 RExC_emit = ptr;
a0d0e21e 7713 return(ret);
a687059c
LW
7714}
7715
7716/*
a0d0e21e
LW
7717- reganode - emit a node with an argument
7718*/
76e3520e 7719STATIC regnode * /* Location. */
830247a4 7720S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7721{
97aff369 7722 dVAR;
c277df42 7723 register regnode *ptr;
504618e9 7724 regnode * const ret = RExC_emit;
07be1b83 7725 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7726
c277df42 7727 if (SIZE_ONLY) {
830247a4
IZ
7728 SIZE_ALIGN(RExC_size);
7729 RExC_size += 2;
6bda09f9
YO
7730 /*
7731 We can't do this:
7732
7733 assert(2==regarglen[op]+1);
7734
7735 Anything larger than this has to allocate the extra amount.
7736 If we changed this to be:
7737
7738 RExC_size += (1 + regarglen[op]);
7739
7740 then it wouldn't matter. Its not clear what side effect
7741 might come from that so its not done so far.
7742 -- dmq
7743 */
a0d0e21e
LW
7744 return(ret);
7745 }
e2e6a0f1
YO
7746#ifdef DEBUGGING
7747 if (OP(RExC_emit) == 255)
7748 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7749#endif
c277df42 7750 NODE_ALIGN_FILL(ret);
a0d0e21e 7751 ptr = ret;
c277df42 7752 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7753 if (RExC_offsets) { /* MJD */
07be1b83 7754 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7755 "reganode",
ccb2c380
MP
7756 __LINE__,
7757 reg_name[op],
07be1b83 7758 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7759 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7760 (UV)(RExC_emit - RExC_emit_start),
7761 (UV)(RExC_parse - RExC_start),
7762 (UV)RExC_offsets[0]));
ccb2c380 7763 Set_Cur_Node_Offset;
fac92740
MJD
7764 }
7765
830247a4 7766 RExC_emit = ptr;
a0d0e21e 7767 return(ret);
fe14fcc3
LW
7768}
7769
7770/*
cd439c50 7771- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7772*/
71207a34
AL
7773STATIC STRLEN
7774S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7775{
97aff369 7776 dVAR;
71207a34 7777 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7778}
7779
7780/*
a0d0e21e
LW
7781- reginsert - insert an operator in front of already-emitted operand
7782*
7783* Means relocating the operand.
7784*/
76e3520e 7785STATIC void
6bda09f9 7786S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7787{
97aff369 7788 dVAR;
c277df42
IZ
7789 register regnode *src;
7790 register regnode *dst;
7791 register regnode *place;
504618e9 7792 const int offset = regarglen[(U8)op];
6bda09f9 7793 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7794 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7795/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7796 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7797 if (SIZE_ONLY) {
6bda09f9 7798 RExC_size += size;
a0d0e21e
LW
7799 return;
7800 }
a687059c 7801
830247a4 7802 src = RExC_emit;
6bda09f9 7803 RExC_emit += size;
830247a4 7804 dst = RExC_emit;
40d049e4 7805 if (RExC_open_parens) {
6bda09f9 7806 int paren;
6d99fb9b 7807 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7808 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7809 if ( RExC_open_parens[paren] >= opnd ) {
7810 DEBUG_PARSE_FMT("open"," - %d",size);
7811 RExC_open_parens[paren] += size;
7812 } else {
7813 DEBUG_PARSE_FMT("open"," - %s","ok");
7814 }
7815 if ( RExC_close_parens[paren] >= opnd ) {
7816 DEBUG_PARSE_FMT("close"," - %d",size);
7817 RExC_close_parens[paren] += size;
7818 } else {
7819 DEBUG_PARSE_FMT("close"," - %s","ok");
7820 }
7821 }
6bda09f9 7822 }
40d049e4 7823
fac92740 7824 while (src > opnd) {
c277df42 7825 StructCopy(--src, --dst, regnode);
fac92740 7826 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7827 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7828 "reg_insert",
ccb2c380
MP
7829 __LINE__,
7830 reg_name[op],
07be1b83
YO
7831 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7832 ? "Overwriting end of array!\n" : "OK",
7833 (UV)(src - RExC_emit_start),
7834 (UV)(dst - RExC_emit_start),
7835 (UV)RExC_offsets[0]));
ccb2c380
MP
7836 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7837 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7838 }
7839 }
7840
a0d0e21e
LW
7841
7842 place = opnd; /* Op node, where operand used to be. */
fac92740 7843 if (RExC_offsets) { /* MJD */
07be1b83 7844 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7845 "reginsert",
ccb2c380
MP
7846 __LINE__,
7847 reg_name[op],
07be1b83 7848 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7849 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7850 (UV)(place - RExC_emit_start),
7851 (UV)(RExC_parse - RExC_start),
786e8c11 7852 (UV)RExC_offsets[0]));
ccb2c380 7853 Set_Node_Offset(place, RExC_parse);
45948336 7854 Set_Node_Length(place, 1);
fac92740 7855 }
c277df42
IZ
7856 src = NEXTOPER(place);
7857 FILL_ADVANCE_NODE(place, op);
7858 Zero(src, offset, regnode);
a687059c
LW
7859}
7860
7861/*
c277df42 7862- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7863- SEE ALSO: regtail_study
a0d0e21e 7864*/
097eb12c 7865/* TODO: All three parms should be const */
76e3520e 7866STATIC void
3dab1dad 7867S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7868{
97aff369 7869 dVAR;
c277df42 7870 register regnode *scan;
72f13be8 7871 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7872#ifndef DEBUGGING
7873 PERL_UNUSED_ARG(depth);
7874#endif
a0d0e21e 7875
c277df42 7876 if (SIZE_ONLY)
a0d0e21e
LW
7877 return;
7878
7879 /* Find last node. */
7880 scan = p;
7881 for (;;) {
504618e9 7882 regnode * const temp = regnext(scan);
3dab1dad
YO
7883 DEBUG_PARSE_r({
7884 SV * const mysv=sv_newmortal();
7885 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7886 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7887 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7888 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7889 (temp == NULL ? "->" : ""),
7890 (temp == NULL ? reg_name[OP(val)] : "")
7891 );
3dab1dad
YO
7892 });
7893 if (temp == NULL)
7894 break;
7895 scan = temp;
7896 }
7897
7898 if (reg_off_by_arg[OP(scan)]) {
7899 ARG_SET(scan, val - scan);
7900 }
7901 else {
7902 NEXT_OFF(scan) = val - scan;
7903 }
7904}
7905
07be1b83 7906#ifdef DEBUGGING
3dab1dad
YO
7907/*
7908- regtail_study - set the next-pointer at the end of a node chain of p to val.
7909- Look for optimizable sequences at the same time.
7910- currently only looks for EXACT chains.
07be1b83
YO
7911
7912This is expermental code. The idea is to use this routine to perform
7913in place optimizations on branches and groups as they are constructed,
7914with the long term intention of removing optimization from study_chunk so
7915that it is purely analytical.
7916
7917Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7918to control which is which.
7919
3dab1dad
YO
7920*/
7921/* TODO: All four parms should be const */
07be1b83 7922
3dab1dad
YO
7923STATIC U8
7924S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7925{
7926 dVAR;
7927 register regnode *scan;
07be1b83
YO
7928 U8 exact = PSEUDO;
7929#ifdef EXPERIMENTAL_INPLACESCAN
7930 I32 min = 0;
7931#endif
7932
3dab1dad
YO
7933 GET_RE_DEBUG_FLAGS_DECL;
7934
07be1b83 7935
3dab1dad
YO
7936 if (SIZE_ONLY)
7937 return exact;
7938
7939 /* Find last node. */
7940
7941 scan = p;
7942 for (;;) {
7943 regnode * const temp = regnext(scan);
07be1b83
YO
7944#ifdef EXPERIMENTAL_INPLACESCAN
7945 if (PL_regkind[OP(scan)] == EXACT)
7946 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7947 return EXACT;
7948#endif
3dab1dad
YO
7949 if ( exact ) {
7950 switch (OP(scan)) {
7951 case EXACT:
7952 case EXACTF:
7953 case EXACTFL:
7954 if( exact == PSEUDO )
7955 exact= OP(scan);
07be1b83
YO
7956 else if ( exact != OP(scan) )
7957 exact= 0;
3dab1dad
YO
7958 case NOTHING:
7959 break;
7960 default:
7961 exact= 0;
7962 }
7963 }
7964 DEBUG_PARSE_r({
7965 SV * const mysv=sv_newmortal();
7966 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7967 regprop(RExC_rx, mysv, scan);
eaf3ca90 7968 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7969 SvPV_nolen_const(mysv),
eaf3ca90
YO
7970 REG_NODE_NUM(scan),
7971 reg_name[exact]);
3dab1dad 7972 });
a0d0e21e
LW
7973 if (temp == NULL)
7974 break;
7975 scan = temp;
7976 }
07be1b83
YO
7977 DEBUG_PARSE_r({
7978 SV * const mysv_val=sv_newmortal();
7979 DEBUG_PARSE_MSG("");
7980 regprop(RExC_rx, mysv_val, val);
7981 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7982 SvPV_nolen_const(mysv_val),
7983 REG_NODE_NUM(val),
7984 val - scan
7985 );
7986 });
c277df42
IZ
7987 if (reg_off_by_arg[OP(scan)]) {
7988 ARG_SET(scan, val - scan);
a0ed51b3
LW
7989 }
7990 else {
c277df42
IZ
7991 NEXT_OFF(scan) = val - scan;
7992 }
3dab1dad
YO
7993
7994 return exact;
a687059c 7995}
07be1b83 7996#endif
a687059c
LW
7997
7998/*
a687059c
LW
7999 - regcurly - a little FSA that accepts {\d+,?\d*}
8000 */
79072805 8001STATIC I32
5f66b61c 8002S_regcurly(register const char *s)
a687059c
LW
8003{
8004 if (*s++ != '{')
8005 return FALSE;
f0fcb552 8006 if (!isDIGIT(*s))
a687059c 8007 return FALSE;
f0fcb552 8008 while (isDIGIT(*s))
a687059c
LW
8009 s++;
8010 if (*s == ',')
8011 s++;
f0fcb552 8012 while (isDIGIT(*s))
a687059c
LW
8013 s++;
8014 if (*s != '}')
8015 return FALSE;
8016 return TRUE;
8017}
8018
a687059c
LW
8019
8020/*
fd181c75 8021 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8022 */
8023void
097eb12c 8024Perl_regdump(pTHX_ const regexp *r)
a687059c 8025{
35ff7856 8026#ifdef DEBUGGING
97aff369 8027 dVAR;
c445ea15 8028 SV * const sv = sv_newmortal();
ab3bbdeb 8029 SV *dsv= sv_newmortal();
a687059c 8030
786e8c11 8031 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8032
8033 /* Header fields of interest. */
ab3bbdeb
YO
8034 if (r->anchored_substr) {
8035 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8036 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8037 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8038 "anchored %s%s at %"IVdf" ",
8039 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8040 (IV)r->anchored_offset);
ab3bbdeb
YO
8041 } else if (r->anchored_utf8) {
8042 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8043 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8044 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8045 "anchored utf8 %s%s at %"IVdf" ",
8046 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8047 (IV)r->anchored_offset);
ab3bbdeb
YO
8048 }
8049 if (r->float_substr) {
8050 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8051 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8052 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8053 "floating %s%s at %"IVdf"..%"UVuf" ",
8054 s, RE_SV_TAIL(r->float_substr),
7b0972df 8055 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8056 } else if (r->float_utf8) {
8057 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8058 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8059 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8060 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8061 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8062 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8063 }
33b8afdf 8064 if (r->check_substr || r->check_utf8)
b81d288d 8065 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8066 (const char *)
8067 (r->check_substr == r->float_substr
8068 && r->check_utf8 == r->float_utf8
8069 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8070 if (r->reganch & ROPT_NOSCAN)
8071 PerlIO_printf(Perl_debug_log, " noscan");
8072 if (r->reganch & ROPT_CHECK_ALL)
8073 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8074 if (r->check_substr || r->check_utf8)
c277df42
IZ
8075 PerlIO_printf(Perl_debug_log, ") ");
8076
46fc3d4c 8077 if (r->regstclass) {
32fc9b6a 8078 regprop(r, sv, r->regstclass);
1de06328 8079 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8080 }
774d564b 8081 if (r->reganch & ROPT_ANCH) {
8082 PerlIO_printf(Perl_debug_log, "anchored");
8083 if (r->reganch & ROPT_ANCH_BOL)
8084 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8085 if (r->reganch & ROPT_ANCH_MBOL)
8086 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8087 if (r->reganch & ROPT_ANCH_SBOL)
8088 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8089 if (r->reganch & ROPT_ANCH_GPOS)
8090 PerlIO_printf(Perl_debug_log, "(GPOS)");
8091 PerlIO_putc(Perl_debug_log, ' ');
8092 }
c277df42
IZ
8093 if (r->reganch & ROPT_GPOS_SEEN)
8094 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8095 if (r->reganch & ROPT_SKIP)
760ac839 8096 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8097 if (r->reganch & ROPT_IMPLICIT)
760ac839 8098 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8099 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8100 if (r->reganch & ROPT_EVAL_SEEN)
8101 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8102 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8103#else
96a5add6 8104 PERL_UNUSED_CONTEXT;
65e66c80 8105 PERL_UNUSED_ARG(r);
17c3b450 8106#endif /* DEBUGGING */
a687059c
LW
8107}
8108
8109/*
a0d0e21e
LW
8110- regprop - printable representation of opcode
8111*/
46fc3d4c 8112void
32fc9b6a 8113Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8114{
35ff7856 8115#ifdef DEBUGGING
97aff369 8116 dVAR;
9b155405 8117 register int k;
1de06328 8118 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8119
54dc92de 8120 sv_setpvn(sv, "", 0);
03363afd 8121 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8122 /* It would be nice to FAIL() here, but this may be called from
8123 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8124 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8125 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8126
3dab1dad 8127 k = PL_regkind[OP(o)];
9b155405 8128
2a782b5b 8129 if (k == EXACT) {
396482e1 8130 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8131 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8132 * is a crude hack but it may be the best for now since
8133 * we have no flag "this EXACTish node was UTF-8"
8134 * --jhi */
8135 const char * const s =
ddc5bc0f 8136 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8137 PL_colors[0], PL_colors[1],
8138 PERL_PV_ESCAPE_UNI_DETECT |
8139 PERL_PV_PRETTY_ELIPSES |
8140 PERL_PV_PRETTY_LTGT
8141 );
8142 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8143 } else if (k == TRIE) {
3dab1dad 8144 /* print the details of the trie in dumpuntil instead, as
4f639d21 8145 * prog->data isn't available here */
1de06328
YO
8146 const char op = OP(o);
8147 const I32 n = ARG(o);
8148 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8149 (reg_ac_data *)prog->data->data[n] :
8150 NULL;
8151 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8152 (reg_trie_data*)prog->data->data[n] :
8153 ac->trie;
8154
8155 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8156 DEBUG_TRIE_COMPILE_r(
8157 Perl_sv_catpvf(aTHX_ sv,
8158 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8159 (UV)trie->startstate,
1e2e3d02 8160 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8161 (UV)trie->wordcount,
8162 (UV)trie->minlen,
8163 (UV)trie->maxlen,
8164 (UV)TRIE_CHARCOUNT(trie),
8165 (UV)trie->uniquecharcount
8166 )
8167 );
8168 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8169 int i;
8170 int rangestart = -1;
f46cb337 8171 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8172 Perl_sv_catpvf(aTHX_ sv, "[");
8173 for (i = 0; i <= 256; i++) {
8174 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8175 if (rangestart == -1)
8176 rangestart = i;
8177 } else if (rangestart != -1) {
8178 if (i <= rangestart + 3)
8179 for (; rangestart < i; rangestart++)
8180 put_byte(sv, rangestart);
8181 else {
8182 put_byte(sv, rangestart);
8183 sv_catpvs(sv, "-");
8184 put_byte(sv, i - 1);
8185 }
8186 rangestart = -1;
8187 }
8188 }
8189 Perl_sv_catpvf(aTHX_ sv, "]");
8190 }
8191
a3621e74 8192 } else if (k == CURLY) {
cb434fcc 8193 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8194 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8195 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8196 }
2c2d71f5
JH
8197 else if (k == WHILEM && o->flags) /* Ordinal/of */
8198 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8199 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8200 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8201 else if (k == GOSUB)
6bda09f9 8202 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8203 else if (k == VERB) {
8204 if (!o->flags)
8205 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8206 (SV*)prog->data->data[ ARG( o ) ]);
8207 } else if (k == LOGICAL)
04ebc1ab 8208 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8209 else if (k == ANYOF) {
8210 int i, rangestart = -1;
2d03de9c 8211 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8212
8213 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8214 static const char * const anyofs[] = {
653099ff
GS
8215 "\\w",
8216 "\\W",
8217 "\\s",
8218 "\\S",
8219 "\\d",
8220 "\\D",
8221 "[:alnum:]",
8222 "[:^alnum:]",
8223 "[:alpha:]",
8224 "[:^alpha:]",
8225 "[:ascii:]",
8226 "[:^ascii:]",
8227 "[:ctrl:]",
8228 "[:^ctrl:]",
8229 "[:graph:]",
8230 "[:^graph:]",
8231 "[:lower:]",
8232 "[:^lower:]",
8233 "[:print:]",
8234 "[:^print:]",
8235 "[:punct:]",
8236 "[:^punct:]",
8237 "[:upper:]",
aaa51d5e 8238 "[:^upper:]",
653099ff 8239 "[:xdigit:]",
aaa51d5e
JF
8240 "[:^xdigit:]",
8241 "[:space:]",
8242 "[:^space:]",
8243 "[:blank:]",
8244 "[:^blank:]"
653099ff
GS
8245 };
8246
19860706 8247 if (flags & ANYOF_LOCALE)
396482e1 8248 sv_catpvs(sv, "{loc}");
19860706 8249 if (flags & ANYOF_FOLD)
396482e1 8250 sv_catpvs(sv, "{i}");
653099ff 8251 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8252 if (flags & ANYOF_INVERT)
396482e1 8253 sv_catpvs(sv, "^");
ffc61ed2
JH
8254 for (i = 0; i <= 256; i++) {
8255 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8256 if (rangestart == -1)
8257 rangestart = i;
8258 } else if (rangestart != -1) {
8259 if (i <= rangestart + 3)
8260 for (; rangestart < i; rangestart++)
653099ff 8261 put_byte(sv, rangestart);
ffc61ed2
JH
8262 else {
8263 put_byte(sv, rangestart);
396482e1 8264 sv_catpvs(sv, "-");
ffc61ed2 8265 put_byte(sv, i - 1);
653099ff 8266 }
ffc61ed2 8267 rangestart = -1;
653099ff 8268 }
847a199f 8269 }
ffc61ed2
JH
8270
8271 if (o->flags & ANYOF_CLASS)
bb7a0f54 8272 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8273 if (ANYOF_CLASS_TEST(o,i))
8274 sv_catpv(sv, anyofs[i]);
8275
8276 if (flags & ANYOF_UNICODE)
396482e1 8277 sv_catpvs(sv, "{unicode}");
1aa99e6b 8278 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8279 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8280
8281 {
8282 SV *lv;
32fc9b6a 8283 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8284
ffc61ed2
JH
8285 if (lv) {
8286 if (sw) {
89ebb4a3 8287 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8288
ffc61ed2 8289 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8290 uvchr_to_utf8(s, i);
ffc61ed2 8291
3568d838 8292 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8293 if (rangestart == -1)
8294 rangestart = i;
8295 } else if (rangestart != -1) {
ffc61ed2
JH
8296 if (i <= rangestart + 3)
8297 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8298 const U8 * const e = uvchr_to_utf8(s,rangestart);
8299 U8 *p;
8300 for(p = s; p < e; p++)
ffc61ed2
JH
8301 put_byte(sv, *p);
8302 }
8303 else {
2d03de9c
AL
8304 const U8 *e = uvchr_to_utf8(s,rangestart);
8305 U8 *p;
8306 for (p = s; p < e; p++)
ffc61ed2 8307 put_byte(sv, *p);
396482e1 8308 sv_catpvs(sv, "-");
2d03de9c
AL
8309 e = uvchr_to_utf8(s, i-1);
8310 for (p = s; p < e; p++)
1df70142 8311 put_byte(sv, *p);
ffc61ed2
JH
8312 }
8313 rangestart = -1;
8314 }
19860706 8315 }
ffc61ed2 8316
396482e1 8317 sv_catpvs(sv, "..."); /* et cetera */
19860706 8318 }
fde631ed 8319
ffc61ed2 8320 {
2e0de35c 8321 char *s = savesvpv(lv);
c445ea15 8322 char * const origs = s;
b81d288d 8323
3dab1dad
YO
8324 while (*s && *s != '\n')
8325 s++;
b81d288d 8326
ffc61ed2 8327 if (*s == '\n') {
2d03de9c 8328 const char * const t = ++s;
ffc61ed2
JH
8329
8330 while (*s) {
8331 if (*s == '\n')
8332 *s = ' ';
8333 s++;
8334 }
8335 if (s[-1] == ' ')
8336 s[-1] = 0;
8337
8338 sv_catpv(sv, t);
fde631ed 8339 }
b81d288d 8340
ffc61ed2 8341 Safefree(origs);
fde631ed
JH
8342 }
8343 }
653099ff 8344 }
ffc61ed2 8345
653099ff
GS
8346 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8347 }
9b155405 8348 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8349 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8350#else
96a5add6 8351 PERL_UNUSED_CONTEXT;
65e66c80
SP
8352 PERL_UNUSED_ARG(sv);
8353 PERL_UNUSED_ARG(o);
f9049ba1 8354 PERL_UNUSED_ARG(prog);
17c3b450 8355#endif /* DEBUGGING */
35ff7856 8356}
a687059c 8357
cad2e5aa
JH
8358SV *
8359Perl_re_intuit_string(pTHX_ regexp *prog)
8360{ /* Assume that RE_INTUIT is set */
97aff369 8361 dVAR;
a3621e74 8362 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8363 PERL_UNUSED_CONTEXT;
8364
a3621e74 8365 DEBUG_COMPILE_r(
cfd0369c 8366 {
2d03de9c 8367 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8368 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8369
8370 if (!PL_colorset) reginitcolors();
8371 PerlIO_printf(Perl_debug_log,
a0288114 8372 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8373 PL_colors[4],
8374 prog->check_substr ? "" : "utf8 ",
8375 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8376 s,
8377 PL_colors[1],
8378 (strlen(s) > 60 ? "..." : ""));
8379 } );
8380
33b8afdf 8381 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8382}
8383
84da74a7
YO
8384/*
8385 pregfree - free a regexp
8386
8387 See regdupe below if you change anything here.
8388*/
8389
2b69d0c2 8390void
864dbfa3 8391Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8392{
27da23d5 8393 dVAR;
0df25f3d 8394
fc32ee4a 8395 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8396
7821416a
IZ
8397 if (!r || (--r->refcnt > 0))
8398 return;
ab3bbdeb 8399 DEBUG_COMPILE_r({
0df25f3d
YO
8400 if (!PL_colorset)
8401 reginitcolors();
5c9f2f80
YO
8402 {
8403 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8404 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8405 dsv, r->precomp, r->prelen, 60);
8406 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8407 PL_colors[4],PL_colors[5],s);
8408 }
9e55ce06 8409 });
cad2e5aa 8410
43c5f42d
NC
8411 /* gcov results gave these as non-null 100% of the time, so there's no
8412 optimisation in checking them before calling Safefree */
8413 Safefree(r->precomp);
8414 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8415 RX_MATCH_COPY_FREE(r);
f8c7b90f 8416#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8417 if (r->saved_copy)
8418 SvREFCNT_dec(r->saved_copy);
8419#endif
a193d654
GS
8420 if (r->substrs) {
8421 if (r->anchored_substr)
8422 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8423 if (r->anchored_utf8)
8424 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8425 if (r->float_substr)
8426 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8427 if (r->float_utf8)
8428 SvREFCNT_dec(r->float_utf8);
2779dcf1 8429 Safefree(r->substrs);
a193d654 8430 }
81714fb9
YO
8431 if (r->paren_names)
8432 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8433 if (r->data) {
8434 int n = r->data->count;
f3548bdc
DM
8435 PAD* new_comppad = NULL;
8436 PAD* old_comppad;
4026c95a 8437 PADOFFSET refcnt;
dfad63ad 8438
c277df42 8439 while (--n >= 0) {
261faec3 8440 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8441 switch (r->data->what[n]) {
8442 case 's':
81714fb9 8443 case 'S':
c277df42
IZ
8444 SvREFCNT_dec((SV*)r->data->data[n]);
8445 break;
653099ff
GS
8446 case 'f':
8447 Safefree(r->data->data[n]);
8448 break;
dfad63ad
HS
8449 case 'p':
8450 new_comppad = (AV*)r->data->data[n];
8451 break;
c277df42 8452 case 'o':
dfad63ad 8453 if (new_comppad == NULL)
cea2e8a9 8454 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8455 PAD_SAVE_LOCAL(old_comppad,
8456 /* Watch out for global destruction's random ordering. */
c445ea15 8457 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8458 );
b34c0dd4 8459 OP_REFCNT_LOCK;
4026c95a
SH
8460 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8461 OP_REFCNT_UNLOCK;
8462 if (!refcnt)
9b978d73 8463 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8464
f3548bdc 8465 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8466 SvREFCNT_dec((SV*)new_comppad);
8467 new_comppad = NULL;
c277df42
IZ
8468 break;
8469 case 'n':
9e55ce06 8470 break;
07be1b83 8471 case 'T':
be8e71aa
YO
8472 { /* Aho Corasick add-on structure for a trie node.
8473 Used in stclass optimization only */
07be1b83
YO
8474 U32 refcount;
8475 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8476 OP_REFCNT_LOCK;
8477 refcount = --aho->refcount;
8478 OP_REFCNT_UNLOCK;
8479 if ( !refcount ) {
8480 Safefree(aho->states);
8481 Safefree(aho->fail);
8482 aho->trie=NULL; /* not necessary to free this as it is
8483 handled by the 't' case */
8484 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8485 Safefree(r->regstclass);
07be1b83
YO
8486 }
8487 }
8488 break;
a3621e74 8489 case 't':
07be1b83 8490 {
be8e71aa 8491 /* trie structure. */
07be1b83
YO
8492 U32 refcount;
8493 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8494 OP_REFCNT_LOCK;
8495 refcount = --trie->refcount;
8496 OP_REFCNT_UNLOCK;
8497 if ( !refcount ) {
8498 Safefree(trie->charmap);
8499 if (trie->widecharmap)
8500 SvREFCNT_dec((SV*)trie->widecharmap);
8501 Safefree(trie->states);
8502 Safefree(trie->trans);
8503 if (trie->bitmap)
8504 Safefree(trie->bitmap);
8505 if (trie->wordlen)
8506 Safefree(trie->wordlen);
786e8c11
YO
8507 if (trie->jump)
8508 Safefree(trie->jump);
8509 if (trie->nextword)
8510 Safefree(trie->nextword);
a3621e74 8511#ifdef DEBUGGING
5c9f2f80
YO
8512 if (trie->words)
8513 SvREFCNT_dec((SV*)trie->words);
8514 if (trie->revcharmap)
8515 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8516#endif
07be1b83 8517 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8518 }
07be1b83
YO
8519 }
8520 break;
c277df42 8521 default:
830247a4 8522 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8523 }
8524 }
8525 Safefree(r->data->what);
8526 Safefree(r->data);
a0d0e21e
LW
8527 }
8528 Safefree(r->startp);
8529 Safefree(r->endp);
c74340f9
YO
8530 if (r->swap) {
8531 Safefree(r->swap->startp);
8532 Safefree(r->swap->endp);
8533 Safefree(r->swap);
8534 }
a0d0e21e 8535 Safefree(r);
a687059c 8536}
c277df42 8537
84da74a7
YO
8538#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8539#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8540#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8541#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8542
8543/*
8544 regdupe - duplicate a regexp.
8545
8546 This routine is called by sv.c's re_dup and is expected to clone a
8547 given regexp structure. It is a no-op when not under USE_ITHREADS.
8548 (Originally this *was* re_dup() for change history see sv.c)
8549
8550 See pregfree() above if you change anything here.
8551*/
a3c0e9ca 8552#if defined(USE_ITHREADS)
84da74a7
YO
8553regexp *
8554Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8555{
84da74a7
YO
8556 dVAR;
8557 REGEXP *ret;
8558 int i, len, npar;
8559 struct reg_substr_datum *s;
8560
8561 if (!r)
8562 return (REGEXP *)NULL;
8563
8564 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8565 return ret;
8566
8567 len = r->offsets[0];
8568 npar = r->nparens+1;
8569
8570 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8571 Copy(r->program, ret->program, len+1, regnode);
8572
8573 Newx(ret->startp, npar, I32);
8574 Copy(r->startp, ret->startp, npar, I32);
8575 Newx(ret->endp, npar, I32);
8576 Copy(r->startp, ret->startp, npar, I32);
c74340f9
YO
8577 if(r->swap) {
8578 Newx(ret->swap, 1, regexp_paren_ofs);
8579 /* no need to copy these */
8580 Newx(ret->swap->startp, npar, I32);
8581 Newx(ret->swap->endp, npar, I32);
8582 } else {
8583 ret->swap = NULL;
8584 }
84da74a7
YO
8585
8586 Newx(ret->substrs, 1, struct reg_substr_data);
8587 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8588 s->min_offset = r->substrs->data[i].min_offset;
8589 s->max_offset = r->substrs->data[i].max_offset;
8590 s->end_shift = r->substrs->data[i].end_shift;
8591 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8592 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8593 }
8594
8595 ret->regstclass = NULL;
8596 if (r->data) {
8597 struct reg_data *d;
8598 const int count = r->data->count;
8599 int i;
8600
8601 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8602 char, struct reg_data);
8603 Newx(d->what, count, U8);
8604
8605 d->count = count;
8606 for (i = 0; i < count; i++) {
8607 d->what[i] = r->data->what[i];
8608 switch (d->what[i]) {
e2e6a0f1 8609 /* legal options are one of: sSfpont
84da74a7
YO
8610 see also regcomp.h and pregfree() */
8611 case 's':
81714fb9 8612 case 'S':
84da74a7
YO
8613 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8614 break;
8615 case 'p':
8616 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8617 break;
8618 case 'f':
8619 /* This is cheating. */
8620 Newx(d->data[i], 1, struct regnode_charclass_class);
8621 StructCopy(r->data->data[i], d->data[i],
8622 struct regnode_charclass_class);
8623 ret->regstclass = (regnode*)d->data[i];
8624 break;
8625 case 'o':
8626 /* Compiled op trees are readonly, and can thus be
8627 shared without duplication. */
8628 OP_REFCNT_LOCK;
8629 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8630 OP_REFCNT_UNLOCK;
8631 break;
8632 case 'n':
8633 d->data[i] = r->data->data[i];
8634 break;
8635 case 't':
8636 d->data[i] = r->data->data[i];
8637 OP_REFCNT_LOCK;
8638 ((reg_trie_data*)d->data[i])->refcount++;
8639 OP_REFCNT_UNLOCK;
8640 break;
8641 case 'T':
8642 d->data[i] = r->data->data[i];
8643 OP_REFCNT_LOCK;
8644 ((reg_ac_data*)d->data[i])->refcount++;
8645 OP_REFCNT_UNLOCK;
8646 /* Trie stclasses are readonly and can thus be shared
8647 * without duplication. We free the stclass in pregfree
8648 * when the corresponding reg_ac_data struct is freed.
8649 */
8650 ret->regstclass= r->regstclass;
8651 break;
8652 default:
8653 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8654 }
8655 }
8656
8657 ret->data = d;
8658 }
8659 else
8660 ret->data = NULL;
8661
8662 Newx(ret->offsets, 2*len+1, U32);
8663 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8664
8665 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8666 ret->refcnt = r->refcnt;
8667 ret->minlen = r->minlen;
de8c5301 8668 ret->minlenret = r->minlenret;
84da74a7
YO
8669 ret->prelen = r->prelen;
8670 ret->nparens = r->nparens;
8671 ret->lastparen = r->lastparen;
8672 ret->lastcloseparen = r->lastcloseparen;
8673 ret->reganch = r->reganch;
8674
8675 ret->sublen = r->sublen;
8676
f9f4320a 8677 ret->engine = r->engine;
81714fb9
YO
8678
8679 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8680
84da74a7
YO
8681 if (RX_MATCH_COPIED(ret))
8682 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8683 else
8684 ret->subbeg = NULL;
8685#ifdef PERL_OLD_COPY_ON_WRITE
8686 ret->saved_copy = NULL;
8687#endif
8688
8689 ptr_table_store(PL_ptr_table, r, ret);
8690 return ret;
84da74a7 8691}
a3c0e9ca 8692#endif
84da74a7 8693
de8c5301
YO
8694/*
8695 reg_stringify()
8696
8697 converts a regexp embedded in a MAGIC struct to its stringified form,
8698 caching the converted form in the struct and returns the cached
8699 string.
8700
8701 If lp is nonnull then it is used to return the length of the
8702 resulting string
8703
8704 If flags is nonnull and the returned string contains UTF8 then
8705 (flags & 1) will be true.
8706
8707 If haseval is nonnull then it is used to return whether the pattern
8708 contains evals.
8709
8710 Normally called via macro:
8711
8712 CALLREG_STRINGIFY(mg,0,0);
8713
8714 And internally with
8715
8716 CALLREG_AS_STR(mg,lp,flags,haseval)
8717
8718 See sv_2pv_flags() in sv.c for an example of internal usage.
8719
8720 */
8721
8722char *
8723Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8724 dVAR;
8725 const regexp * const re = (regexp *)mg->mg_obj;
8726
8727 if (!mg->mg_ptr) {
8728 const char *fptr = "msix";
8729 char reflags[6];
8730 char ch;
8731 int left = 0;
8732 int right = 4;
8733 bool need_newline = 0;
8734 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8735
8736 while((ch = *fptr++)) {
8737 if(reganch & 1) {
8738 reflags[left++] = ch;
8739 }
8740 else {
8741 reflags[right--] = ch;
8742 }
8743 reganch >>= 1;
8744 }
8745 if(left != 4) {
8746 reflags[left] = '-';
8747 left = 5;
8748 }
8749
8750 mg->mg_len = re->prelen + 4 + left;
8751 /*
8752 * If /x was used, we have to worry about a regex ending with a
8753 * comment later being embedded within another regex. If so, we don't
8754 * want this regex's "commentization" to leak out to the right part of
8755 * the enclosing regex, we must cap it with a newline.
8756 *
8757 * So, if /x was used, we scan backwards from the end of the regex. If
8758 * we find a '#' before we find a newline, we need to add a newline
8759 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8760 * we don't need to add anything. -jfriedl
8761 */
8762 if (PMf_EXTENDED & re->reganch) {
8763 const char *endptr = re->precomp + re->prelen;
8764 while (endptr >= re->precomp) {
8765 const char c = *(endptr--);
8766 if (c == '\n')
8767 break; /* don't need another */
8768 if (c == '#') {
8769 /* we end while in a comment, so we need a newline */
8770 mg->mg_len++; /* save space for it */
8771 need_newline = 1; /* note to add it */
8772 break;
8773 }
8774 }
8775 }
8776
8777 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8778 mg->mg_ptr[0] = '(';
8779 mg->mg_ptr[1] = '?';
8780 Copy(reflags, mg->mg_ptr+2, left, char);
8781 *(mg->mg_ptr+left+2) = ':';
8782 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8783 if (need_newline)
8784 mg->mg_ptr[mg->mg_len - 2] = '\n';
8785 mg->mg_ptr[mg->mg_len - 1] = ')';
8786 mg->mg_ptr[mg->mg_len] = 0;
8787 }
8788 if (haseval)
8789 *haseval = re->program[0].next_off;
8790 if (flags)
8791 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8792
8793 if (lp)
8794 *lp = mg->mg_len;
8795 return mg->mg_ptr;
8796}
8797
8798
76234dfb 8799#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8800/*
8801 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8802 */
8803regnode *
864dbfa3 8804Perl_regnext(pTHX_ register regnode *p)
c277df42 8805{
97aff369 8806 dVAR;
c277df42
IZ
8807 register I32 offset;
8808
3280af22 8809 if (p == &PL_regdummy)
c277df42
IZ
8810 return(NULL);
8811
8812 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8813 if (offset == 0)
8814 return(NULL);
8815
c277df42 8816 return(p+offset);
c277df42 8817}
76234dfb 8818#endif
c277df42 8819
01f988be 8820STATIC void
cea2e8a9 8821S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8822{
8823 va_list args;
8824 STRLEN l1 = strlen(pat1);
8825 STRLEN l2 = strlen(pat2);
8826 char buf[512];
06bf62c7 8827 SV *msv;
73d840c0 8828 const char *message;
c277df42
IZ
8829
8830 if (l1 > 510)
8831 l1 = 510;
8832 if (l1 + l2 > 510)
8833 l2 = 510 - l1;
8834 Copy(pat1, buf, l1 , char);
8835 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8836 buf[l1 + l2] = '\n';
8837 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8838#ifdef I_STDARG
8839 /* ANSI variant takes additional second argument */
c277df42 8840 va_start(args, pat2);
8736538c
AS
8841#else
8842 va_start(args);
8843#endif
5a844595 8844 msv = vmess(buf, &args);
c277df42 8845 va_end(args);
cfd0369c 8846 message = SvPV_const(msv,l1);
c277df42
IZ
8847 if (l1 > 512)
8848 l1 = 512;
8849 Copy(message, buf, l1 , char);
197cf9b9 8850 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8851 Perl_croak(aTHX_ "%s", buf);
c277df42 8852}
a0ed51b3
LW
8853
8854/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8855
76234dfb 8856#ifndef PERL_IN_XSUB_RE
a0ed51b3 8857void
864dbfa3 8858Perl_save_re_context(pTHX)
b81d288d 8859{
97aff369 8860 dVAR;
1ade1aa1
NC
8861
8862 struct re_save_state *state;
8863
8864 SAVEVPTR(PL_curcop);
8865 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8866
8867 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8868 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8869 SSPUSHINT(SAVEt_RE_STATE);
8870
46ab3289 8871 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8872
a0ed51b3 8873 PL_reg_start_tmp = 0;
a0ed51b3 8874 PL_reg_start_tmpl = 0;
c445ea15 8875 PL_reg_oldsaved = NULL;
a5db57d6 8876 PL_reg_oldsavedlen = 0;
a5db57d6 8877 PL_reg_maxiter = 0;
a5db57d6 8878 PL_reg_leftiter = 0;
c445ea15 8879 PL_reg_poscache = NULL;
a5db57d6 8880 PL_reg_poscache_size = 0;
1ade1aa1
NC
8881#ifdef PERL_OLD_COPY_ON_WRITE
8882 PL_nrs = NULL;
8883#endif
ada6e8a9 8884
c445ea15
AL
8885 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8886 if (PL_curpm) {
8887 const REGEXP * const rx = PM_GETRE(PL_curpm);
8888 if (rx) {
1df70142 8889 U32 i;
ada6e8a9 8890 for (i = 1; i <= rx->nparens; i++) {
1df70142 8891 char digits[TYPE_CHARS(long)];
d9fad198 8892 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8893 GV *const *const gvp
8894 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8895
b37c2d43
AL
8896 if (gvp) {
8897 GV * const gv = *gvp;
8898 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8899 save_scalar(gv);
49f27e4b 8900 }
ada6e8a9
AMS
8901 }
8902 }
8903 }
a0ed51b3 8904}
76234dfb 8905#endif
51371543 8906
51371543 8907static void
acfe0abc 8908clear_re(pTHX_ void *r)
51371543 8909{
97aff369 8910 dVAR;
51371543
GS
8911 ReREFCNT_dec((regexp *)r);
8912}
ffbc6a93 8913
a28509cc
AL
8914#ifdef DEBUGGING
8915
8916STATIC void
8917S_put_byte(pTHX_ SV *sv, int c)
8918{
8919 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8920 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8921 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8922 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8923 else
8924 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8925}
8926
786e8c11 8927
3dab1dad
YO
8928#define CLEAR_OPTSTART \
8929 if (optstart) STMT_START { \
07be1b83 8930 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8931 optstart=NULL; \
8932 } STMT_END
8933
786e8c11 8934#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8935
b5a2f8d8
NC
8936STATIC const regnode *
8937S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8938 const regnode *last, const regnode *plast,
8939 SV* sv, I32 indent, U32 depth)
a28509cc 8940{
97aff369 8941 dVAR;
786e8c11 8942 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8943 register const regnode *next;
3dab1dad
YO
8944 const regnode *optstart= NULL;
8945 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8946
786e8c11
YO
8947#ifdef DEBUG_DUMPUNTIL
8948 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8949 last ? last-start : 0,plast ? plast-start : 0);
8950#endif
8951
8952 if (plast && plast < last)
8953 last= plast;
8954
8955 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8956 /* While that wasn't END last time... */
8957
8958 NODE_ALIGN(node);
8959 op = OP(node);
8960 if (op == CLOSE)
786e8c11 8961 indent--;
b5a2f8d8 8962 next = regnext((regnode *)node);
07be1b83 8963
a28509cc 8964 /* Where, what. */
8e11feef 8965 if (OP(node) == OPTIMIZED) {
e68ec53f 8966 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8967 optstart = node;
3dab1dad 8968 else
8e11feef 8969 goto after_print;
3dab1dad
YO
8970 } else
8971 CLEAR_OPTSTART;
07be1b83 8972
32fc9b6a 8973 regprop(r, sv, node);
a28509cc 8974 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8975 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8976
8977 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8978 if (next == NULL) /* Next ptr. */
8979 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8980 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8981 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8982 else
8983 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8984
1de06328 8985 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8986 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8987 }
8988
a28509cc
AL
8989 after_print:
8990 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8991 assert(next);
8992 {
8993 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8994 ? regnext((regnode *)next)
8995 : next);
be8e71aa
YO
8996 if (last && nnode > last)
8997 nnode = last;
786e8c11 8998 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8999 }
a28509cc
AL
9000 }
9001 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9002 assert(next);
786e8c11 9003 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9004 }
9005 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9006 const regnode *this_trie = node;
1de06328 9007 const char op = OP(node);
a28509cc 9008 const I32 n = ARG(node);
1de06328
YO
9009 const reg_ac_data * const ac = op>=AHOCORASICK ?
9010 (reg_ac_data *)r->data->data[n] :
9011 NULL;
9012 const reg_trie_data * const trie = op<AHOCORASICK ?
9013 (reg_trie_data*)r->data->data[n] :
9014 ac->trie;
786e8c11 9015 const regnode *nextbranch= NULL;
a28509cc 9016 I32 word_idx;
1de06328 9017 sv_setpvn(sv, "", 0);
786e8c11 9018 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9019 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9020
9021 PerlIO_printf(Perl_debug_log, "%*s%s ",
9022 (int)(2*(indent+3)), "",
9023 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9024 PL_colors[0], PL_colors[1],
9025 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9026 PERL_PV_PRETTY_ELIPSES |
7f69552c 9027 PERL_PV_PRETTY_LTGT
786e8c11
YO
9028 )
9029 : "???"
9030 );
9031 if (trie->jump) {
40d049e4 9032 U16 dist= trie->jump[word_idx+1];
7f69552c 9033 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 9034 (dist ? this_trie + dist : next) - start);
786e8c11
YO
9035 if (dist) {
9036 if (!nextbranch)
24b23f37 9037 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9038 DUMPUNTIL(this_trie + dist, nextbranch);
9039 }
786e8c11
YO
9040 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9041 nextbranch= regnext((regnode *)nextbranch);
9042 } else {
9043 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9044 }
786e8c11
YO
9045 }
9046 if (last && next > last)
9047 node= last;
9048 else
9049 node= next;
a28509cc 9050 }
786e8c11
YO
9051 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9052 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9053 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9054 }
9055 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9056 assert(next);
786e8c11 9057 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9058 }
9059 else if ( op == PLUS || op == STAR) {
786e8c11 9060 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9061 }
9062 else if (op == ANYOF) {
9063 /* arglen 1 + class block */
9064 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9065 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9066 node = NEXTOPER(node);
9067 }
9068 else if (PL_regkind[(U8)op] == EXACT) {
9069 /* Literal string, where present. */
9070 node += NODE_SZ_STR(node) - 1;
9071 node = NEXTOPER(node);
9072 }
9073 else {
9074 node = NEXTOPER(node);
9075 node += regarglen[(U8)op];
9076 }
9077 if (op == CURLYX || op == OPEN)
786e8c11 9078 indent++;
a28509cc 9079 else if (op == WHILEM)
786e8c11 9080 indent--;
a28509cc 9081 }
3dab1dad 9082 CLEAR_OPTSTART;
786e8c11
YO
9083#ifdef DEBUG_DUMPUNTIL
9084 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9085#endif
1de06328 9086 return node;
a28509cc
AL
9087}
9088
9089#endif /* DEBUGGING */
9090
241d1a3b
NC
9091/*
9092 * Local variables:
9093 * c-indentation-style: bsd
9094 * c-basic-offset: 4
9095 * indent-tabs-mode: t
9096 * End:
9097 *
37442d52
RGS
9098 * ex: set ts=8 sts=4 sw=4 noet:
9099 */