This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Simplify S_add_data(), given that realloc will NULL acts as malloc().
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
3dab1dad
YO
801 dump_trie(trie)
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
816 dump_trie(trie)
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
819*/
820
821STATIC void
822S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823{
824 U32 state;
ab3bbdeb
YO
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
902*/
903STATIC void
904S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905{
906 U32 state;
ab3bbdeb
YO
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
3dab1dad
YO
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
929 if ( tmp ) {
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
931 colwidth,
ddc5bc0f 932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
936 ) ,
1e2e3d02
YO
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 );
940 if (!(charid % 10))
664e119d
RGS
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
1e2e3d02 943 }
ab3bbdeb
YO
944 }
945 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
946 }
947}
948
949/*
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
958{
959 U32 state;
960 U16 charid;
ab3bbdeb
YO
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
963 GET_RE_DEBUG_FLAGS_DECL;
964
965 /*
966 print out the table precompression so that we can do a visual check
967 that they are identical.
968 */
969
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
971
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 974 if ( tmp ) {
ab3bbdeb
YO
975 PerlIO_printf( Perl_debug_log, "%*s",
976 colwidth,
ddc5bc0f 977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
981 )
982 );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
987
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n" );
993
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
995
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1002 if (v)
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1004 else
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1006 }
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1009 } else {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1012 }
1013 }
07be1b83 1014}
3dab1dad
YO
1015
1016#endif
1017
786e8c11
YO
1018/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
3dab1dad 1028
786e8c11 1029Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1030
786e8c11
YO
1031A trie is an N'ary tree where the branches are determined by digital
1032decomposition of the key. IE, at the root node you look up the 1st character and
1033follow that branch repeat until you find the end of the branches. Nodes can be
1034marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1035
786e8c11 1036 /he|she|his|hers/
72f13be8 1037
786e8c11
YO
1038would convert into the following structure. Numbers represent states, letters
1039following numbers represent valid transitions on the letter from that state, if
1040the number is in square brackets it represents an accepting state, otherwise it
1041will be in parenthesis.
07be1b83 1042
786e8c11
YO
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1044 | |
1045 | (2)
1046 | |
1047 (1) +-i->(6)-+-s->[7]
1048 |
1049 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1050
786e8c11
YO
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1052
1053This shows that when matching against the string 'hers' we will begin at state 1
1054read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057single traverse. We store a mapping from accepting to state to which word was
1058matched, and then when we have multiple possibilities we try to complete the
1059rest of the regex in the order in which they occured in the alternation.
1060
1061The only prior NFA like behaviour that would be changed by the TRIE support is
1062the silent ignoring of duplicate alternations which are of the form:
1063
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1065
1066Thus EVAL blocks follwing a trie may be called a different number of times with
1067and without the optimisation. With the optimisations dupes will be silently
1068ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069the following demonstrates:
1070
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1072
1073which prints out 'word' three times, but
1074
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1076
1077which doesnt print it out at all. This is due to other optimisations kicking in.
1078
1079Example of what happens on a structural level:
1080
1081The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1082
1083 1: CURLYM[1] {1,32767}(18)
1084 5: BRANCH(8)
1085 6: EXACT <ac>(16)
1086 8: BRANCH(11)
1087 9: EXACT <ad>(16)
1088 11: BRANCH(14)
1089 12: EXACT <ab>(16)
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095and should turn into:
1096
1097 1: CURLYM[1] {1,32767}(18)
1098 5: TRIE(16)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1100 <ac>
1101 <ad>
1102 <ab>
1103 16: SUCCEED(0)
1104 17: NOTHING(18)
1105 18: END(0)
1106
1107Cases where tail != last would be like /(?foo|bar)baz/:
1108
1109 1: BRANCH(4)
1110 2: EXACT <foo>(8)
1111 4: BRANCH(7)
1112 5: EXACT <bar>(8)
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118and would end up looking like:
1119
1120 1: TRIE(8)
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1122 <foo>
1123 <bar>
1124 7: TAIL(8)
1125 8: EXACT <baz>(10)
1126 10: END(0)
1127
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1129
1130is the recommended Unicode-aware way of saying
1131
1132 *(d++) = uv;
1133*/
1134
1e2e3d02 1135#define TRIE_STORE_REVCHAR \
786e8c11 1136 STMT_START { \
1e2e3d02 1137 SV *tmp = newSVpvs(""); \
786e8c11 1138 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 } STMT_END
1142
1143#define TRIE_READ_CHAR STMT_START { \
1144 wordlen++; \
1145 if ( UTF ) { \
1146 if ( folder ) { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1149 foldlen -= len; \
1150 scan += len; \
1151 len = 0; \
1152 } else { \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1157 } \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 } \
1161 } else { \
1162 uvc = (U32)*uc; \
1163 len = 1; \
1164 } \
1165} STMT_END
1166
1167
1168
1169#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1173 } \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1177} STMT_END
07be1b83 1178
786e8c11
YO
1179#define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1184} STMT_END
07be1b83 1185
786e8c11
YO
1186#define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1189 \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1192 DEBUG_r({ \
1193 /* store the word for dumping */ \
1194 SV* tmp; \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1197 else \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1201 }); \
1202 \
1203 curword++; \
1204 \
1205 if ( noper_next < tail ) { \
1206 if (!trie->jump) \
1207 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1208 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1209 if (!jumper) \
1210 jumper = noper_next; \
1211 if (!nextbranch) \
1212 nextbranch= regnext(cur); \
1213 } \
1214 \
1215 if ( dupe ) { \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
1221 Newxz( trie->nextword, word_count + 1, U16); \
1222 while ( trie->nextword[dupe] ) \
1223 dupe= trie->nextword[dupe]; \
1224 trie->nextword[dupe]= curword; \
1225 } else { \
1226 /* we haven't inserted this word yet. */ \
1227 trie->states[ state ].wordnum = curword; \
1228 } \
1229} STMT_END
07be1b83 1230
3dab1dad 1231
786e8c11
YO
1232#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1233 ( ( base + charid >= ucharcount \
1234 && base + charid < ubound \
1235 && state == trie->trans[ base - ucharcount + charid ].check \
1236 && trie->trans[ base - ucharcount + charid ].next ) \
1237 ? trie->trans[ base - ucharcount + charid ].next \
1238 : ( state==1 ? special : 0 ) \
1239 )
3dab1dad 1240
786e8c11
YO
1241#define MADE_TRIE 1
1242#define MADE_JUMP_TRIE 2
1243#define MADE_EXACT_TRIE 4
3dab1dad 1244
a3621e74 1245STATIC I32
786e8c11 1246S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1247{
27da23d5 1248 dVAR;
a3621e74
YO
1249 /* first pass, loop through and scan words */
1250 reg_trie_data *trie;
1251 regnode *cur;
9f7f3913 1252 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1253 STRLEN len = 0;
1254 UV uvc = 0;
1255 U16 curword = 0;
1256 U32 next_alloc = 0;
786e8c11
YO
1257 regnode *jumper = NULL;
1258 regnode *nextbranch = NULL;
7f69552c 1259 regnode *convert = NULL;
a3621e74 1260 /* we just use folder as a flag in utf8 */
e1ec3a88 1261 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1262 ? PL_fold
1263 : ( flags == EXACTFL
1264 ? PL_fold_locale
1265 : NULL
1266 )
1267 );
1268
e1ec3a88 1269 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1270 SV *re_trie_maxbuff;
3dab1dad
YO
1271#ifndef DEBUGGING
1272 /* these are only used during construction but are useful during
8e11feef 1273 * debugging so we store them in the struct when debugging.
8e11feef 1274 */
3dab1dad 1275 STRLEN trie_charcount=0;
3dab1dad
YO
1276 AV *trie_revcharmap;
1277#endif
a3621e74 1278 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1279#ifndef DEBUGGING
1280 PERL_UNUSED_ARG(depth);
1281#endif
a3621e74 1282
a02a5408 1283 Newxz( trie, 1, reg_trie_data );
a3621e74 1284 trie->refcount = 1;
3dab1dad 1285 trie->startstate = 1;
786e8c11 1286 trie->wordcount = word_count;
f8fc2ecf 1287 RExC_rxi->data->data[ data_slot ] = (void*)trie;
a02a5408 1288 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1289 if (!(UTF && folder))
1290 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1291 DEBUG_r({
1292 trie->words = newAV();
a3621e74 1293 });
3dab1dad 1294 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1295
0111c4fd 1296 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1297 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1298 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1299 }
3dab1dad
YO
1300 DEBUG_OPTIMISE_r({
1301 PerlIO_printf( Perl_debug_log,
786e8c11 1302 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1303 (int)depth * 2 + 2, "",
1304 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1305 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1306 (int)depth);
3dab1dad 1307 });
7f69552c
YO
1308
1309 /* Find the node we are going to overwrite */
1310 if ( first == startbranch && OP( last ) != BRANCH ) {
1311 /* whole branch chain */
1312 convert = first;
1313 } else {
1314 /* branch sub-chain */
1315 convert = NEXTOPER( first );
1316 }
1317
a3621e74
YO
1318 /* -- First loop and Setup --
1319
1320 We first traverse the branches and scan each word to determine if it
1321 contains widechars, and how many unique chars there are, this is
1322 important as we have to build a table with at least as many columns as we
1323 have unique chars.
1324
1325 We use an array of integers to represent the character codes 0..255
1326 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1327 native representation of the character value as the key and IV's for the
1328 coded index.
1329
1330 *TODO* If we keep track of how many times each character is used we can
1331 remap the columns so that the table compression later on is more
1332 efficient in terms of memory by ensuring most common value is in the
1333 middle and the least common are on the outside. IMO this would be better
1334 than a most to least common mapping as theres a decent chance the most
1335 common letter will share a node with the least common, meaning the node
1336 will not be compressable. With a middle is most common approach the worst
1337 case is when we have the least common nodes twice.
1338
1339 */
1340
a3621e74 1341 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1342 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1343 const U8 *uc = (U8*)STRING( noper );
a28509cc 1344 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1345 STRLEN foldlen = 0;
1346 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1347 const U8 *scan = (U8*)NULL;
07be1b83 1348 U32 wordlen = 0; /* required init */
3dab1dad 1349 STRLEN chars=0;
a3621e74 1350
3dab1dad
YO
1351 if (OP(noper) == NOTHING) {
1352 trie->minlen= 0;
1353 continue;
1354 }
1355 if (trie->bitmap) {
1356 TRIE_BITMAP_SET(trie,*uc);
1357 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1358 }
a3621e74 1359 for ( ; uc < e ; uc += len ) {
3dab1dad 1360 TRIE_CHARCOUNT(trie)++;
a3621e74 1361 TRIE_READ_CHAR;
3dab1dad 1362 chars++;
a3621e74
YO
1363 if ( uvc < 256 ) {
1364 if ( !trie->charmap[ uvc ] ) {
1365 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1366 if ( folder )
1367 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1368 TRIE_STORE_REVCHAR;
a3621e74
YO
1369 }
1370 } else {
1371 SV** svpp;
1372 if ( !trie->widecharmap )
1373 trie->widecharmap = newHV();
1374
1375 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1376
1377 if ( !svpp )
e4584336 1378 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1379
1380 if ( !SvTRUE( *svpp ) ) {
1381 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1382 TRIE_STORE_REVCHAR;
a3621e74
YO
1383 }
1384 }
1385 }
3dab1dad
YO
1386 if( cur == first ) {
1387 trie->minlen=chars;
1388 trie->maxlen=chars;
1389 } else if (chars < trie->minlen) {
1390 trie->minlen=chars;
1391 } else if (chars > trie->maxlen) {
1392 trie->maxlen=chars;
1393 }
1394
a3621e74
YO
1395 } /* end first pass */
1396 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1397 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1398 (int)depth * 2 + 2,"",
85c3142d 1399 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1400 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1401 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1402 );
786e8c11 1403 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1404
1405 /*
1406 We now know what we are dealing with in terms of unique chars and
1407 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1408 representation using a flat table will take. If it's over a reasonable
1409 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1410 conservative but potentially much slower representation using an array
1411 of lists.
1412
1413 At the end we convert both representations into the same compressed
1414 form that will be used in regexec.c for matching with. The latter
1415 is a form that cannot be used to construct with but has memory
1416 properties similar to the list form and access properties similar
1417 to the table form making it both suitable for fast searches and
1418 small enough that its feasable to store for the duration of a program.
1419
1420 See the comment in the code where the compressed table is produced
1421 inplace from the flat tabe representation for an explanation of how
1422 the compression works.
1423
1424 */
1425
1426
3dab1dad 1427 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1428 /*
1429 Second Pass -- Array Of Lists Representation
1430
1431 Each state will be represented by a list of charid:state records
1432 (reg_trie_trans_le) the first such element holds the CUR and LEN
1433 points of the allocated array. (See defines above).
1434
1435 We build the initial structure using the lists, and then convert
1436 it into the compressed table form which allows faster lookups
1437 (but cant be modified once converted).
a3621e74
YO
1438 */
1439
a3621e74
YO
1440 STRLEN transcount = 1;
1441
1e2e3d02
YO
1442 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1443 "%*sCompiling trie using list compiler\n",
1444 (int)depth * 2 + 2, ""));
1445
3dab1dad 1446 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1447 TRIE_LIST_NEW(1);
1448 next_alloc = 2;
1449
1450 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1451
c445ea15
AL
1452 regnode * const noper = NEXTOPER( cur );
1453 U8 *uc = (U8*)STRING( noper );
1454 const U8 * const e = uc + STR_LEN( noper );
1455 U32 state = 1; /* required init */
1456 U16 charid = 0; /* sanity init */
1457 U8 *scan = (U8*)NULL; /* sanity init */
1458 STRLEN foldlen = 0; /* required init */
07be1b83 1459 U32 wordlen = 0; /* required init */
c445ea15
AL
1460 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1461
3dab1dad 1462 if (OP(noper) != NOTHING) {
786e8c11 1463 for ( ; uc < e ; uc += len ) {
c445ea15 1464
786e8c11 1465 TRIE_READ_CHAR;
c445ea15 1466
786e8c11
YO
1467 if ( uvc < 256 ) {
1468 charid = trie->charmap[ uvc ];
c445ea15 1469 } else {
786e8c11
YO
1470 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1471 if ( !svpp ) {
1472 charid = 0;
1473 } else {
1474 charid=(U16)SvIV( *svpp );
1475 }
c445ea15 1476 }
786e8c11
YO
1477 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1478 if ( charid ) {
a3621e74 1479
786e8c11
YO
1480 U16 check;
1481 U32 newstate = 0;
a3621e74 1482
786e8c11
YO
1483 charid--;
1484 if ( !trie->states[ state ].trans.list ) {
1485 TRIE_LIST_NEW( state );
c445ea15 1486 }
786e8c11
YO
1487 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1488 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1489 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1490 break;
1491 }
1492 }
1493 if ( ! newstate ) {
1494 newstate = next_alloc++;
1495 TRIE_LIST_PUSH( state, charid, newstate );
1496 transcount++;
1497 }
1498 state = newstate;
1499 } else {
1500 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1501 }
a28509cc 1502 }
c445ea15 1503 }
3dab1dad 1504 TRIE_HANDLE_WORD(state);
a3621e74
YO
1505
1506 } /* end second pass */
1507
1e2e3d02
YO
1508 /* next alloc is the NEXT state to be allocated */
1509 trie->statecount = next_alloc;
a3621e74
YO
1510 Renew( trie->states, next_alloc, reg_trie_state );
1511
3dab1dad
YO
1512 /* and now dump it out before we compress it */
1513 DEBUG_TRIE_COMPILE_MORE_r(
1514 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1515 );
a3621e74 1516
a02a5408 1517 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1518 {
1519 U32 state;
a3621e74
YO
1520 U32 tp = 0;
1521 U32 zp = 0;
1522
1523
1524 for( state=1 ; state < next_alloc ; state ++ ) {
1525 U32 base=0;
1526
1527 /*
1528 DEBUG_TRIE_COMPILE_MORE_r(
1529 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1530 );
1531 */
1532
1533 if (trie->states[state].trans.list) {
1534 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1535 U16 maxid=minid;
a28509cc 1536 U16 idx;
a3621e74
YO
1537
1538 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1539 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1540 if ( forid < minid ) {
1541 minid=forid;
1542 } else if ( forid > maxid ) {
1543 maxid=forid;
1544 }
a3621e74
YO
1545 }
1546 if ( transcount < tp + maxid - minid + 1) {
1547 transcount *= 2;
1548 Renew( trie->trans, transcount, reg_trie_trans );
1549 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1550 }
1551 base = trie->uniquecharcount + tp - minid;
1552 if ( maxid == minid ) {
1553 U32 set = 0;
1554 for ( ; zp < tp ; zp++ ) {
1555 if ( ! trie->trans[ zp ].next ) {
1556 base = trie->uniquecharcount + zp - minid;
1557 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1558 trie->trans[ zp ].check = state;
1559 set = 1;
1560 break;
1561 }
1562 }
1563 if ( !set ) {
1564 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ tp ].check = state;
1566 tp++;
1567 zp = tp;
1568 }
1569 } else {
1570 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1571 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1572 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1573 trie->trans[ tid ].check = state;
1574 }
1575 tp += ( maxid - minid + 1 );
1576 }
1577 Safefree(trie->states[ state ].trans.list);
1578 }
1579 /*
1580 DEBUG_TRIE_COMPILE_MORE_r(
1581 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1582 );
1583 */
1584 trie->states[ state ].trans.base=base;
1585 }
cc601c31 1586 trie->lasttrans = tp + 1;
a3621e74
YO
1587 }
1588 } else {
1589 /*
1590 Second Pass -- Flat Table Representation.
1591
1592 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1593 We know that we will need Charcount+1 trans at most to store the data
1594 (one row per char at worst case) So we preallocate both structures
1595 assuming worst case.
1596
1597 We then construct the trie using only the .next slots of the entry
1598 structs.
1599
1600 We use the .check field of the first entry of the node temporarily to
1601 make compression both faster and easier by keeping track of how many non
1602 zero fields are in the node.
1603
1604 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1605 transition.
1606
1607 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1608 number representing the first entry of the node, and state as a
1609 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1610 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1611 are 2 entrys per node. eg:
1612
1613 A B A B
1614 1. 2 4 1. 3 7
1615 2. 0 3 3. 0 5
1616 3. 0 0 5. 0 0
1617 4. 0 0 7. 0 0
1618
1619 The table is internally in the right hand, idx form. However as we also
1620 have to deal with the states array which is indexed by nodenum we have to
1621 use TRIE_NODENUM() to convert.
1622
1623 */
1e2e3d02
YO
1624 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1625 "%*sCompiling trie using table compiler\n",
1626 (int)depth * 2 + 2, ""));
3dab1dad
YO
1627
1628 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1629 reg_trie_trans );
3dab1dad 1630 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1631 next_alloc = trie->uniquecharcount + 1;
1632
3dab1dad 1633
a3621e74
YO
1634 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1635
c445ea15 1636 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1637 const U8 *uc = (U8*)STRING( noper );
1638 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1639
1640 U32 state = 1; /* required init */
1641
1642 U16 charid = 0; /* sanity init */
1643 U32 accept_state = 0; /* sanity init */
1644 U8 *scan = (U8*)NULL; /* sanity init */
1645
1646 STRLEN foldlen = 0; /* required init */
07be1b83 1647 U32 wordlen = 0; /* required init */
a3621e74
YO
1648 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1649
3dab1dad 1650 if ( OP(noper) != NOTHING ) {
786e8c11 1651 for ( ; uc < e ; uc += len ) {
a3621e74 1652
786e8c11 1653 TRIE_READ_CHAR;
a3621e74 1654
786e8c11
YO
1655 if ( uvc < 256 ) {
1656 charid = trie->charmap[ uvc ];
1657 } else {
1658 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1659 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1660 }
786e8c11
YO
1661 if ( charid ) {
1662 charid--;
1663 if ( !trie->trans[ state + charid ].next ) {
1664 trie->trans[ state + charid ].next = next_alloc;
1665 trie->trans[ state ].check++;
1666 next_alloc += trie->uniquecharcount;
1667 }
1668 state = trie->trans[ state + charid ].next;
1669 } else {
1670 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1671 }
1672 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1673 }
a3621e74 1674 }
3dab1dad
YO
1675 accept_state = TRIE_NODENUM( state );
1676 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1677
1678 } /* end second pass */
1679
3dab1dad
YO
1680 /* and now dump it out before we compress it */
1681 DEBUG_TRIE_COMPILE_MORE_r(
1682 dump_trie_interim_table(trie,next_alloc,depth+1)
1683 );
a3621e74 1684
a3621e74
YO
1685 {
1686 /*
1687 * Inplace compress the table.*
1688
1689 For sparse data sets the table constructed by the trie algorithm will
1690 be mostly 0/FAIL transitions or to put it another way mostly empty.
1691 (Note that leaf nodes will not contain any transitions.)
1692
1693 This algorithm compresses the tables by eliminating most such
1694 transitions, at the cost of a modest bit of extra work during lookup:
1695
1696 - Each states[] entry contains a .base field which indicates the
1697 index in the state[] array wheres its transition data is stored.
1698
1699 - If .base is 0 there are no valid transitions from that node.
1700
1701 - If .base is nonzero then charid is added to it to find an entry in
1702 the trans array.
1703
1704 -If trans[states[state].base+charid].check!=state then the
1705 transition is taken to be a 0/Fail transition. Thus if there are fail
1706 transitions at the front of the node then the .base offset will point
1707 somewhere inside the previous nodes data (or maybe even into a node
1708 even earlier), but the .check field determines if the transition is
1709 valid.
1710
786e8c11 1711 XXX - wrong maybe?
a3621e74
YO
1712 The following process inplace converts the table to the compressed
1713 table: We first do not compress the root node 1,and mark its all its
1714 .check pointers as 1 and set its .base pointer as 1 as well. This
1715 allows to do a DFA construction from the compressed table later, and
1716 ensures that any .base pointers we calculate later are greater than
1717 0.
1718
1719 - We set 'pos' to indicate the first entry of the second node.
1720
1721 - We then iterate over the columns of the node, finding the first and
1722 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1723 and set the .check pointers accordingly, and advance pos
1724 appropriately and repreat for the next node. Note that when we copy
1725 the next pointers we have to convert them from the original
1726 NODEIDX form to NODENUM form as the former is not valid post
1727 compression.
1728
1729 - If a node has no transitions used we mark its base as 0 and do not
1730 advance the pos pointer.
1731
1732 - If a node only has one transition we use a second pointer into the
1733 structure to fill in allocated fail transitions from other states.
1734 This pointer is independent of the main pointer and scans forward
1735 looking for null transitions that are allocated to a state. When it
1736 finds one it writes the single transition into the "hole". If the
786e8c11 1737 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1738
1739 - Once compressed we can Renew/realloc the structures to release the
1740 excess space.
1741
1742 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1743 specifically Fig 3.47 and the associated pseudocode.
1744
1745 demq
1746 */
a3b680e6 1747 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1748 U32 state, charid;
a3621e74 1749 U32 pos = 0, zp=0;
1e2e3d02 1750 trie->statecount = laststate;
a3621e74
YO
1751
1752 for ( state = 1 ; state < laststate ; state++ ) {
1753 U8 flag = 0;
a28509cc
AL
1754 const U32 stateidx = TRIE_NODEIDX( state );
1755 const U32 o_used = trie->trans[ stateidx ].check;
1756 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1757 trie->trans[ stateidx ].check = 0;
1758
1759 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1760 if ( flag || trie->trans[ stateidx + charid ].next ) {
1761 if ( trie->trans[ stateidx + charid ].next ) {
1762 if (o_used == 1) {
1763 for ( ; zp < pos ; zp++ ) {
1764 if ( ! trie->trans[ zp ].next ) {
1765 break;
1766 }
1767 }
1768 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1769 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ zp ].check = state;
1771 if ( ++zp > pos ) pos = zp;
1772 break;
1773 }
1774 used--;
1775 }
1776 if ( !flag ) {
1777 flag = 1;
1778 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1779 }
1780 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1781 trie->trans[ pos ].check = state;
1782 pos++;
1783 }
1784 }
1785 }
cc601c31 1786 trie->lasttrans = pos + 1;
1e2e3d02 1787 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1788 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1789 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1790 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1791 (int)depth * 2 + 2,"",
1792 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1793 (IV)next_alloc,
1794 (IV)pos,
a3621e74
YO
1795 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1796 );
1797
1798 } /* end table compress */
1799 }
1e2e3d02
YO
1800 DEBUG_TRIE_COMPILE_MORE_r(
1801 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1802 (int)depth * 2 + 2, "",
1803 (UV)trie->statecount,
1804 (UV)trie->lasttrans)
1805 );
cc601c31
YO
1806 /* resize the trans array to remove unused space */
1807 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1808
3dab1dad
YO
1809 /* and now dump out the compressed format */
1810 DEBUG_TRIE_COMPILE_r(
1811 dump_trie(trie,depth+1)
1812 );
07be1b83 1813
3dab1dad 1814 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1815 U8 nodetype =(U8)(flags & 0xFF);
1816 char *str=NULL;
786e8c11 1817
07be1b83 1818#ifdef DEBUGGING
e62cc96a 1819 regnode *optimize = NULL;
b57a0404
JH
1820 U32 mjd_offset = 0;
1821 U32 mjd_nodelen = 0;
07be1b83 1822#endif
a3621e74 1823 /*
3dab1dad
YO
1824 This means we convert either the first branch or the first Exact,
1825 depending on whether the thing following (in 'last') is a branch
1826 or not and whther first is the startbranch (ie is it a sub part of
1827 the alternation or is it the whole thing.)
1828 Assuming its a sub part we conver the EXACT otherwise we convert
1829 the whole branch sequence, including the first.
a3621e74 1830 */
3dab1dad 1831 /* Find the node we are going to overwrite */
7f69552c 1832 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1833 /* branch sub-chain */
3dab1dad 1834 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1835 DEBUG_r({
1836 mjd_offset= Node_Offset((convert));
1837 mjd_nodelen= Node_Length((convert));
1838 });
7f69552c
YO
1839 /* whole branch chain */
1840 } else {
1841 DEBUG_r({
1842 const regnode *nop = NEXTOPER( convert );
1843 mjd_offset= Node_Offset((nop));
1844 mjd_nodelen= Node_Length((nop));
1845 });
07be1b83 1846 }
7f69552c 1847
07be1b83
YO
1848 DEBUG_OPTIMISE_r(
1849 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1850 (int)depth * 2 + 2, "",
786e8c11 1851 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1852 );
a3621e74 1853
3dab1dad
YO
1854 /* But first we check to see if there is a common prefix we can
1855 split out as an EXACT and put in front of the TRIE node. */
1856 trie->startstate= 1;
786e8c11 1857 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1858 U32 state;
1e2e3d02 1859 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1860 U32 ofs = 0;
8e11feef
RGS
1861 I32 idx = -1;
1862 U32 count = 0;
1863 const U32 base = trie->states[ state ].trans.base;
a3621e74 1864
3dab1dad 1865 if ( trie->states[state].wordnum )
8e11feef 1866 count = 1;
a3621e74 1867
8e11feef 1868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1869 if ( ( base + ofs >= trie->uniquecharcount ) &&
1870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1872 {
3dab1dad 1873 if ( ++count > 1 ) {
8e11feef 1874 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1875 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876 if ( state == 1 ) break;
3dab1dad
YO
1877 if ( count == 2 ) {
1878 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1879 DEBUG_OPTIMISE_r(
8e11feef
RGS
1880 PerlIO_printf(Perl_debug_log,
1881 "%*sNew Start State=%"UVuf" Class: [",
1882 (int)depth * 2 + 2, "",
786e8c11 1883 (UV)state));
be8e71aa
YO
1884 if (idx >= 0) {
1885 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1886 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887
3dab1dad 1888 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1889 if ( folder )
1890 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1891 DEBUG_OPTIMISE_r(
07be1b83 1892 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1893 );
8e11feef
RGS
1894 }
1895 }
1896 TRIE_BITMAP_SET(trie,*ch);
1897 if ( folder )
1898 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1899 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1900 }
1901 idx = ofs;
1902 }
3dab1dad
YO
1903 }
1904 if ( count == 1 ) {
1905 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1906 char *ch = SvPV_nolen( *tmp );
1907 DEBUG_OPTIMISE_r({
1908 SV *sv=sv_newmortal();
8e11feef
RGS
1909 PerlIO_printf( Perl_debug_log,
1910 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1911 (int)depth * 2 + 2, "",
de734bd5
A
1912 (UV)state, (UV)idx,
1913 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1914 PL_colors[0], PL_colors[1],
1915 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1916 PERL_PV_ESCAPE_FIRSTCHAR
1917 )
1918 );
1919 });
3dab1dad
YO
1920 if ( state==1 ) {
1921 OP( convert ) = nodetype;
1922 str=STRING(convert);
1923 STR_LEN(convert)=0;
1924 }
de734bd5
A
1925 while (*ch) {
1926 *str++ = *ch++;
1927 STR_LEN(convert)++;
1928 }
1929
8e11feef 1930 } else {
f9049ba1 1931#ifdef DEBUGGING
8e11feef
RGS
1932 if (state>1)
1933 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1934#endif
8e11feef
RGS
1935 break;
1936 }
1937 }
3dab1dad 1938 if (str) {
8e11feef 1939 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1940 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1941 trie->startstate = state;
07be1b83
YO
1942 trie->minlen -= (state - 1);
1943 trie->maxlen -= (state - 1);
1944 DEBUG_r({
1945 regnode *fix = convert;
de734bd5 1946 U32 word = trie->wordcount;
07be1b83
YO
1947 mjd_nodelen++;
1948 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1949 while( ++fix < n ) {
1950 Set_Node_Offset_Length(fix, 0, 0);
1951 }
de734bd5
A
1952 while (word--) {
1953 SV ** const tmp = av_fetch( trie->words, word, 0 );
1954 if (tmp) {
1955 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1957 else
1958 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1959 }
1960 }
07be1b83 1961 });
8e11feef
RGS
1962 if (trie->maxlen) {
1963 convert = n;
1964 } else {
3dab1dad 1965 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1966 DEBUG_r(optimize= n);
3dab1dad
YO
1967 }
1968 }
1969 }
a5ca303d
YO
1970 if (!jumper)
1971 jumper = last;
3dab1dad 1972 if ( trie->maxlen ) {
8e11feef
RGS
1973 NEXT_OFF( convert ) = (U16)(tail - convert);
1974 ARG_SET( convert, data_slot );
786e8c11
YO
1975 /* Store the offset to the first unabsorbed branch in
1976 jump[0], which is otherwise unused by the jump logic.
1977 We use this when dumping a trie and during optimisation. */
1978 if (trie->jump)
7f69552c 1979 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1980
786e8c11
YO
1981 /* XXXX */
1982 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1983 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1984 {
1985 OP( convert ) = TRIEC;
1986 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1987 Safefree(trie->bitmap);
1988 trie->bitmap= NULL;
1989 } else
1990 OP( convert ) = TRIE;
a3621e74 1991
3dab1dad
YO
1992 /* store the type in the flags */
1993 convert->flags = nodetype;
a5ca303d
YO
1994 DEBUG_r({
1995 optimize = convert
1996 + NODE_STEP_REGNODE
1997 + regarglen[ OP( convert ) ];
1998 });
1999 /* XXX We really should free up the resource in trie now,
2000 as we won't use them - (which resources?) dmq */
3dab1dad 2001 }
a3621e74 2002 /* needed for dumping*/
e62cc96a 2003 DEBUG_r(if (optimize) {
07be1b83 2004 regnode *opt = convert;
e62cc96a 2005 while ( ++opt < optimize) {
07be1b83
YO
2006 Set_Node_Offset_Length(opt,0,0);
2007 }
786e8c11
YO
2008 /*
2009 Try to clean up some of the debris left after the
2010 optimisation.
a3621e74 2011 */
786e8c11 2012 while( optimize < jumper ) {
07be1b83 2013 mjd_nodelen += Node_Length((optimize));
a3621e74 2014 OP( optimize ) = OPTIMIZED;
07be1b83 2015 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2016 optimize++;
2017 }
07be1b83 2018 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2019 });
2020 } /* end node insert */
07be1b83 2021#ifndef DEBUGGING
6e8b4190 2022 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2023#endif
786e8c11
YO
2024 return trie->jump
2025 ? MADE_JUMP_TRIE
2026 : trie->startstate>1
2027 ? MADE_EXACT_TRIE
2028 : MADE_TRIE;
2029}
2030
2031STATIC void
2032S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2033{
2034/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2035
2036 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2037 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2038 ISBN 0-201-10088-6
2039
2040 We find the fail state for each state in the trie, this state is the longest proper
2041 suffix of the current states 'word' that is also a proper prefix of another word in our
2042 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2043 the DFA not to have to restart after its tried and failed a word at a given point, it
2044 simply continues as though it had been matching the other word in the first place.
2045 Consider
2046 'abcdgu'=~/abcdefg|cdgu/
2047 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2048 fail, which would bring use to the state representing 'd' in the second word where we would
2049 try 'g' and succeed, prodceding to match 'cdgu'.
2050 */
2051 /* add a fail transition */
f8fc2ecf 2052 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[ARG(source)];
786e8c11
YO
2053 U32 *q;
2054 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2055 const U32 numstates = trie->statecount;
786e8c11
YO
2056 const U32 ubound = trie->lasttrans + ucharcount;
2057 U32 q_read = 0;
2058 U32 q_write = 0;
2059 U32 charid;
2060 U32 base = trie->states[ 1 ].trans.base;
2061 U32 *fail;
2062 reg_ac_data *aho;
2063 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2064 GET_RE_DEBUG_FLAGS_DECL;
2065#ifndef DEBUGGING
2066 PERL_UNUSED_ARG(depth);
2067#endif
2068
2069
2070 ARG_SET( stclass, data_slot );
2071 Newxz( aho, 1, reg_ac_data );
f8fc2ecf 2072 RExC_rxi->data->data[ data_slot ] = (void*)aho;
786e8c11
YO
2073 aho->trie=trie;
2074 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2075 numstates * sizeof(reg_trie_state));
786e8c11
YO
2076 Newxz( q, numstates, U32);
2077 Newxz( aho->fail, numstates, U32 );
2078 aho->refcount = 1;
2079 fail = aho->fail;
2080 /* initialize fail[0..1] to be 1 so that we always have
2081 a valid final fail state */
2082 fail[ 0 ] = fail[ 1 ] = 1;
2083
2084 for ( charid = 0; charid < ucharcount ; charid++ ) {
2085 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2086 if ( newstate ) {
2087 q[ q_write ] = newstate;
2088 /* set to point at the root */
2089 fail[ q[ q_write++ ] ]=1;
2090 }
2091 }
2092 while ( q_read < q_write) {
2093 const U32 cur = q[ q_read++ % numstates ];
2094 base = trie->states[ cur ].trans.base;
2095
2096 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2097 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2098 if (ch_state) {
2099 U32 fail_state = cur;
2100 U32 fail_base;
2101 do {
2102 fail_state = fail[ fail_state ];
2103 fail_base = aho->states[ fail_state ].trans.base;
2104 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2105
2106 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2107 fail[ ch_state ] = fail_state;
2108 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2109 {
2110 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2111 }
2112 q[ q_write++ % numstates] = ch_state;
2113 }
2114 }
2115 }
2116 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2117 when we fail in state 1, this allows us to use the
2118 charclass scan to find a valid start char. This is based on the principle
2119 that theres a good chance the string being searched contains lots of stuff
2120 that cant be a start char.
2121 */
2122 fail[ 0 ] = fail[ 1 ] = 0;
2123 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2124 PerlIO_printf(Perl_debug_log,
2125 "%*sStclass Failtable (%"UVuf" states): 0",
2126 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2127 );
786e8c11
YO
2128 for( q_read=1; q_read<numstates; q_read++ ) {
2129 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2130 }
2131 PerlIO_printf(Perl_debug_log, "\n");
2132 });
2133 Safefree(q);
2134 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2135}
2136
786e8c11 2137
a3621e74 2138/*
5d1c421c
JH
2139 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2140 * These need to be revisited when a newer toolchain becomes available.
2141 */
2142#if defined(__sparc64__) && defined(__GNUC__)
2143# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2144# undef SPARC64_GCC_WORKAROUND
2145# define SPARC64_GCC_WORKAROUND 1
2146# endif
2147#endif
2148
07be1b83 2149#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2150 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2151 SV * const mysv=sv_newmortal(); \
2152 regnode *Next = regnext(scan); \
2153 regprop(RExC_rx, mysv, scan); \
7f69552c 2154 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2155 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2156 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2157 }});
07be1b83 2158
1de06328
YO
2159
2160
2161
2162
07be1b83
YO
2163#define JOIN_EXACT(scan,min,flags) \
2164 if (PL_regkind[OP(scan)] == EXACT) \
2165 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2166
be8e71aa 2167STATIC U32
07be1b83
YO
2168S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2169 /* Merge several consecutive EXACTish nodes into one. */
2170 regnode *n = regnext(scan);
2171 U32 stringok = 1;
2172 regnode *next = scan + NODE_SZ_STR(scan);
2173 U32 merged = 0;
2174 U32 stopnow = 0;
2175#ifdef DEBUGGING
2176 regnode *stop = scan;
72f13be8 2177 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2178#else
d47053eb
RGS
2179 PERL_UNUSED_ARG(depth);
2180#endif
2181#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2182 PERL_UNUSED_ARG(flags);
2183 PERL_UNUSED_ARG(val);
07be1b83 2184#endif
07be1b83
YO
2185 DEBUG_PEEP("join",scan,depth);
2186
2187 /* Skip NOTHING, merge EXACT*. */
2188 while (n &&
2189 ( PL_regkind[OP(n)] == NOTHING ||
2190 (stringok && (OP(n) == OP(scan))))
2191 && NEXT_OFF(n)
2192 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2193
2194 if (OP(n) == TAIL || n > next)
2195 stringok = 0;
2196 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2197 DEBUG_PEEP("skip:",n,depth);
2198 NEXT_OFF(scan) += NEXT_OFF(n);
2199 next = n + NODE_STEP_REGNODE;
2200#ifdef DEBUGGING
2201 if (stringok)
2202 stop = n;
2203#endif
2204 n = regnext(n);
2205 }
2206 else if (stringok) {
786e8c11 2207 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2208 regnode * const nnext = regnext(n);
2209
2210 DEBUG_PEEP("merg",n,depth);
2211
2212 merged++;
2213 if (oldl + STR_LEN(n) > U8_MAX)
2214 break;
2215 NEXT_OFF(scan) += NEXT_OFF(n);
2216 STR_LEN(scan) += STR_LEN(n);
2217 next = n + NODE_SZ_STR(n);
2218 /* Now we can overwrite *n : */
2219 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2220#ifdef DEBUGGING
2221 stop = next - 1;
2222#endif
2223 n = nnext;
2224 if (stopnow) break;
2225 }
2226
d47053eb
RGS
2227#ifdef EXPERIMENTAL_INPLACESCAN
2228 if (flags && !NEXT_OFF(n)) {
2229 DEBUG_PEEP("atch", val, depth);
2230 if (reg_off_by_arg[OP(n)]) {
2231 ARG_SET(n, val - n);
2232 }
2233 else {
2234 NEXT_OFF(n) = val - n;
2235 }
2236 stopnow = 1;
2237 }
07be1b83
YO
2238#endif
2239 }
2240
2241 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2242 /*
2243 Two problematic code points in Unicode casefolding of EXACT nodes:
2244
2245 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2246 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2247
2248 which casefold to
2249
2250 Unicode UTF-8
2251
2252 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2253 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2254
2255 This means that in case-insensitive matching (or "loose matching",
2256 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2257 length of the above casefolded versions) can match a target string
2258 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2259 This would rather mess up the minimum length computation.
2260
2261 What we'll do is to look for the tail four bytes, and then peek
2262 at the preceding two bytes to see whether we need to decrease
2263 the minimum length by four (six minus two).
2264
2265 Thanks to the design of UTF-8, there cannot be false matches:
2266 A sequence of valid UTF-8 bytes cannot be a subsequence of
2267 another valid sequence of UTF-8 bytes.
2268
2269 */
2270 char * const s0 = STRING(scan), *s, *t;
2271 char * const s1 = s0 + STR_LEN(scan) - 1;
2272 char * const s2 = s1 - 4;
e294cc5d
JH
2273#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2274 const char t0[] = "\xaf\x49\xaf\x42";
2275#else
07be1b83 2276 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2277#endif
07be1b83
YO
2278 const char * const t1 = t0 + 3;
2279
2280 for (s = s0 + 2;
2281 s < s2 && (t = ninstr(s, s1, t0, t1));
2282 s = t + 4) {
e294cc5d
JH
2283#ifdef EBCDIC
2284 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2285 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2286#else
07be1b83
YO
2287 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2288 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2289#endif
07be1b83
YO
2290 *min -= 4;
2291 }
2292 }
2293
2294#ifdef DEBUGGING
2295 /* Allow dumping */
2296 n = scan + NODE_SZ_STR(scan);
2297 while (n <= stop) {
2298 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2299 OP(n) = OPTIMIZED;
2300 NEXT_OFF(n) = 0;
2301 }
2302 n++;
2303 }
2304#endif
2305 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2306 return stopnow;
2307}
2308
653099ff
GS
2309/* REx optimizer. Converts nodes into quickier variants "in place".
2310 Finds fixed substrings. */
2311
a0288114 2312/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2313 to the position after last scanned or to NULL. */
2314
40d049e4
YO
2315#define INIT_AND_WITHP \
2316 assert(!and_withp); \
2317 Newx(and_withp,1,struct regnode_charclass_class); \
2318 SAVEFREEPV(and_withp)
07be1b83 2319
b515a41d
YO
2320/* this is a chain of data about sub patterns we are processing that
2321 need to be handled seperately/specially in study_chunk. Its so
2322 we can simulate recursion without losing state. */
2323struct scan_frame;
2324typedef struct scan_frame {
2325 regnode *last; /* last node to process in this frame */
2326 regnode *next; /* next node to process when last is reached */
2327 struct scan_frame *prev; /*previous frame*/
2328 I32 stop; /* what stopparen do we use */
2329} scan_frame;
2330
76e3520e 2331STATIC I32
40d049e4 2332S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2333 I32 *minlenp, I32 *deltap,
40d049e4
YO
2334 regnode *last,
2335 scan_data_t *data,
2336 I32 stopparen,
2337 U8* recursed,
2338 struct regnode_charclass_class *and_withp,
2339 U32 flags, U32 depth)
c277df42
IZ
2340 /* scanp: Start here (read-write). */
2341 /* deltap: Write maxlen-minlen here. */
2342 /* last: Stop before this one. */
40d049e4
YO
2343 /* data: string data about the pattern */
2344 /* stopparen: treat close N as END */
2345 /* recursed: which subroutines have we recursed into */
2346 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2347{
97aff369 2348 dVAR;
c277df42
IZ
2349 I32 min = 0, pars = 0, code;
2350 regnode *scan = *scanp, *next;
2351 I32 delta = 0;
2352 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2353 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2354 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2355 scan_data_t data_fake;
a3621e74 2356 SV *re_trie_maxbuff = NULL;
786e8c11 2357 regnode *first_non_open = scan;
e2e6a0f1 2358 I32 stopmin = I32_MAX;
8aa23a47
YO
2359 scan_frame *frame = NULL;
2360
a3621e74 2361 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2362
13a24bad 2363#ifdef DEBUGGING
40d049e4 2364 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2365#endif
40d049e4 2366
786e8c11 2367 if ( depth == 0 ) {
40d049e4 2368 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2369 first_non_open=regnext(first_non_open);
2370 }
2371
b81d288d 2372
8aa23a47
YO
2373 fake_study_recurse:
2374 while ( scan && OP(scan) != END && scan < last ){
2375 /* Peephole optimizer: */
2376 DEBUG_STUDYDATA(data,depth);
2377 DEBUG_PEEP("Peep",scan,depth);
2378 JOIN_EXACT(scan,&min,0);
2379
2380 /* Follow the next-chain of the current node and optimize
2381 away all the NOTHINGs from it. */
2382 if (OP(scan) != CURLYX) {
2383 const int max = (reg_off_by_arg[OP(scan)]
2384 ? I32_MAX
2385 /* I32 may be smaller than U16 on CRAYs! */
2386 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2387 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2388 int noff;
2389 regnode *n = scan;
2390
2391 /* Skip NOTHING and LONGJMP. */
2392 while ((n = regnext(n))
2393 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2394 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2395 && off + noff < max)
2396 off += noff;
2397 if (reg_off_by_arg[OP(scan)])
2398 ARG(scan) = off;
2399 else
2400 NEXT_OFF(scan) = off;
2401 }
a3621e74 2402
c277df42 2403
8aa23a47
YO
2404
2405 /* The principal pseudo-switch. Cannot be a switch, since we
2406 look into several different things. */
2407 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2408 || OP(scan) == IFTHEN) {
2409 next = regnext(scan);
2410 code = OP(scan);
2411 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2412
2413 if (OP(next) == code || code == IFTHEN) {
2414 /* NOTE - There is similar code to this block below for handling
2415 TRIE nodes on a re-study. If you change stuff here check there
2416 too. */
2417 I32 max1 = 0, min1 = I32_MAX, num = 0;
2418 struct regnode_charclass_class accum;
2419 regnode * const startbranch=scan;
2420
2421 if (flags & SCF_DO_SUBSTR)
2422 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2423 if (flags & SCF_DO_STCLASS)
2424 cl_init_zero(pRExC_state, &accum);
2425
2426 while (OP(scan) == code) {
2427 I32 deltanext, minnext, f = 0, fake;
2428 struct regnode_charclass_class this_class;
2429
2430 num++;
2431 data_fake.flags = 0;
2432 if (data) {
2433 data_fake.whilem_c = data->whilem_c;
2434 data_fake.last_closep = data->last_closep;
2435 }
2436 else
2437 data_fake.last_closep = &fake;
58e23c8d
YO
2438
2439 data_fake.pos_delta = delta;
8aa23a47
YO
2440 next = regnext(scan);
2441 scan = NEXTOPER(scan);
2442 if (code != BRANCH)
c277df42 2443 scan = NEXTOPER(scan);
8aa23a47
YO
2444 if (flags & SCF_DO_STCLASS) {
2445 cl_init(pRExC_state, &this_class);
2446 data_fake.start_class = &this_class;
2447 f = SCF_DO_STCLASS_AND;
58e23c8d 2448 }
8aa23a47
YO
2449 if (flags & SCF_WHILEM_VISITED_POS)
2450 f |= SCF_WHILEM_VISITED_POS;
2451
2452 /* we suppose the run is continuous, last=next...*/
2453 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2454 next, &data_fake,
2455 stopparen, recursed, NULL, f,depth+1);
2456 if (min1 > minnext)
2457 min1 = minnext;
2458 if (max1 < minnext + deltanext)
2459 max1 = minnext + deltanext;
2460 if (deltanext == I32_MAX)
2461 is_inf = is_inf_internal = 1;
2462 scan = next;
2463 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2464 pars++;
2465 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2466 if ( stopmin > minnext)
2467 stopmin = min + min1;
2468 flags &= ~SCF_DO_SUBSTR;
2469 if (data)
2470 data->flags |= SCF_SEEN_ACCEPT;
2471 }
2472 if (data) {
2473 if (data_fake.flags & SF_HAS_EVAL)
2474 data->flags |= SF_HAS_EVAL;
2475 data->whilem_c = data_fake.whilem_c;
3dab1dad 2476 }
8aa23a47
YO
2477 if (flags & SCF_DO_STCLASS)
2478 cl_or(pRExC_state, &accum, &this_class);
2479 }
2480 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2481 min1 = 0;
2482 if (flags & SCF_DO_SUBSTR) {
2483 data->pos_min += min1;
2484 data->pos_delta += max1 - min1;
2485 if (max1 != min1 || is_inf)
2486 data->longest = &(data->longest_float);
2487 }
2488 min += min1;
2489 delta += max1 - min1;
2490 if (flags & SCF_DO_STCLASS_OR) {
2491 cl_or(pRExC_state, data->start_class, &accum);
2492 if (min1) {
2493 cl_and(data->start_class, and_withp);
2494 flags &= ~SCF_DO_STCLASS;
653099ff 2495 }
8aa23a47
YO
2496 }
2497 else if (flags & SCF_DO_STCLASS_AND) {
2498 if (min1) {
2499 cl_and(data->start_class, &accum);
2500 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2501 }
8aa23a47
YO
2502 else {
2503 /* Switch to OR mode: cache the old value of
2504 * data->start_class */
2505 INIT_AND_WITHP;
2506 StructCopy(data->start_class, and_withp,
2507 struct regnode_charclass_class);
2508 flags &= ~SCF_DO_STCLASS_AND;
2509 StructCopy(&accum, data->start_class,
2510 struct regnode_charclass_class);
2511 flags |= SCF_DO_STCLASS_OR;
2512 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2513 }
8aa23a47 2514 }
a3621e74 2515
8aa23a47
YO
2516 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2517 /* demq.
a3621e74 2518
8aa23a47
YO
2519 Assuming this was/is a branch we are dealing with: 'scan' now
2520 points at the item that follows the branch sequence, whatever
2521 it is. We now start at the beginning of the sequence and look
2522 for subsequences of
a3621e74 2523
8aa23a47
YO
2524 BRANCH->EXACT=>x1
2525 BRANCH->EXACT=>x2
2526 tail
a3621e74 2527
8aa23a47 2528 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2529
8aa23a47
YO
2530 If we can find such a subseqence we need to turn the first
2531 element into a trie and then add the subsequent branch exact
2532 strings to the trie.
a3621e74 2533
8aa23a47 2534 We have two cases
a3621e74 2535
8aa23a47 2536 1. patterns where the whole set of branch can be converted.
a3621e74 2537
8aa23a47 2538 2. patterns where only a subset can be converted.
a3621e74 2539
8aa23a47
YO
2540 In case 1 we can replace the whole set with a single regop
2541 for the trie. In case 2 we need to keep the start and end
2542 branchs so
a3621e74 2543
8aa23a47
YO
2544 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2545 becomes BRANCH TRIE; BRANCH X;
786e8c11 2546
8aa23a47
YO
2547 There is an additional case, that being where there is a
2548 common prefix, which gets split out into an EXACT like node
2549 preceding the TRIE node.
a3621e74 2550
8aa23a47
YO
2551 If x(1..n)==tail then we can do a simple trie, if not we make
2552 a "jump" trie, such that when we match the appropriate word
2553 we "jump" to the appopriate tail node. Essentailly we turn
2554 a nested if into a case structure of sorts.
b515a41d 2555
8aa23a47
YO
2556 */
2557
2558 int made=0;
2559 if (!re_trie_maxbuff) {
2560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2561 if (!SvIOK(re_trie_maxbuff))
2562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2563 }
2564 if ( SvIV(re_trie_maxbuff)>=0 ) {
2565 regnode *cur;
2566 regnode *first = (regnode *)NULL;
2567 regnode *last = (regnode *)NULL;
2568 regnode *tail = scan;
2569 U8 optype = 0;
2570 U32 count=0;
a3621e74
YO
2571
2572#ifdef DEBUGGING
8aa23a47 2573 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2574#endif
8aa23a47
YO
2575 /* var tail is used because there may be a TAIL
2576 regop in the way. Ie, the exacts will point to the
2577 thing following the TAIL, but the last branch will
2578 point at the TAIL. So we advance tail. If we
2579 have nested (?:) we may have to move through several
2580 tails.
2581 */
2582
2583 while ( OP( tail ) == TAIL ) {
2584 /* this is the TAIL generated by (?:) */
2585 tail = regnext( tail );
2586 }
a3621e74 2587
8aa23a47
YO
2588
2589 DEBUG_OPTIMISE_r({
2590 regprop(RExC_rx, mysv, tail );
2591 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2592 (int)depth * 2 + 2, "",
2593 "Looking for TRIE'able sequences. Tail node is: ",
2594 SvPV_nolen_const( mysv )
2595 );
2596 });
2597
2598 /*
2599
2600 step through the branches, cur represents each
2601 branch, noper is the first thing to be matched
2602 as part of that branch and noper_next is the
2603 regnext() of that node. if noper is an EXACT
2604 and noper_next is the same as scan (our current
2605 position in the regex) then the EXACT branch is
2606 a possible optimization target. Once we have
2607 two or more consequetive such branches we can
2608 create a trie of the EXACT's contents and stich
2609 it in place. If the sequence represents all of
2610 the branches we eliminate the whole thing and
2611 replace it with a single TRIE. If it is a
2612 subsequence then we need to stitch it in. This
2613 means the first branch has to remain, and needs
2614 to be repointed at the item on the branch chain
2615 following the last branch optimized. This could
2616 be either a BRANCH, in which case the
2617 subsequence is internal, or it could be the
2618 item following the branch sequence in which
2619 case the subsequence is at the end.
2620
2621 */
2622
2623 /* dont use tail as the end marker for this traverse */
2624 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2625 regnode * const noper = NEXTOPER( cur );
b515a41d 2626#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2627 regnode * const noper_next = regnext( noper );
b515a41d
YO
2628#endif
2629
8aa23a47
YO
2630 DEBUG_OPTIMISE_r({
2631 regprop(RExC_rx, mysv, cur);
2632 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2633 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2634
2635 regprop(RExC_rx, mysv, noper);
2636 PerlIO_printf( Perl_debug_log, " -> %s",
2637 SvPV_nolen_const(mysv));
2638
2639 if ( noper_next ) {
2640 regprop(RExC_rx, mysv, noper_next );
2641 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2642 SvPV_nolen_const(mysv));
2643 }
2644 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2645 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2646 });
2647 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2648 : PL_regkind[ OP( noper ) ] == EXACT )
2649 || OP(noper) == NOTHING )
786e8c11 2650#ifdef NOJUMPTRIE
8aa23a47 2651 && noper_next == tail
786e8c11 2652#endif
8aa23a47
YO
2653 && count < U16_MAX)
2654 {
2655 count++;
2656 if ( !first || optype == NOTHING ) {
2657 if (!first) first = cur;
2658 optype = OP( noper );
2659 } else {
2660 last = cur;
2661 }
2662 } else {
2663 if ( last ) {
2664 make_trie( pRExC_state,
2665 startbranch, first, cur, tail, count,
2666 optype, depth+1 );
2667 }
2668 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2669#ifdef NOJUMPTRIE
8aa23a47 2670 && noper_next == tail
786e8c11 2671#endif
8aa23a47
YO
2672 ){
2673 count = 1;
2674 first = cur;
2675 optype = OP( noper );
2676 } else {
2677 count = 0;
2678 first = NULL;
2679 optype = 0;
2680 }
2681 last = NULL;
2682 }
2683 }
2684 DEBUG_OPTIMISE_r({
2685 regprop(RExC_rx, mysv, cur);
2686 PerlIO_printf( Perl_debug_log,
2687 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2688 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2689
2690 });
2691 if ( last ) {
2692 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2693#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2694 if ( ((made == MADE_EXACT_TRIE &&
2695 startbranch == first)
2696 || ( first_non_open == first )) &&
2697 depth==0 ) {
2698 flags |= SCF_TRIE_RESTUDY;
2699 if ( startbranch == first
2700 && scan == tail )
2701 {
2702 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2703 }
2704 }
3dab1dad 2705#endif
8aa23a47
YO
2706 }
2707 }
2708
2709 } /* do trie */
2710
653099ff 2711 }
8aa23a47
YO
2712 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2713 scan = NEXTOPER(NEXTOPER(scan));
2714 } else /* single branch is optimized. */
2715 scan = NEXTOPER(scan);
2716 continue;
2717 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2718 scan_frame *newframe = NULL;
2719 I32 paren;
2720 regnode *start;
2721 regnode *end;
2722
2723 if (OP(scan) != SUSPEND) {
2724 /* set the pointer */
2725 if (OP(scan) == GOSUB) {
2726 paren = ARG(scan);
2727 RExC_recurse[ARG2L(scan)] = scan;
2728 start = RExC_open_parens[paren-1];
2729 end = RExC_close_parens[paren-1];
2730 } else {
2731 paren = 0;
f8fc2ecf 2732 start = RExC_rxi->program + 1;
8aa23a47
YO
2733 end = RExC_opend;
2734 }
2735 if (!recursed) {
2736 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2737 SAVEFREEPV(recursed);
2738 }
2739 if (!PAREN_TEST(recursed,paren+1)) {
2740 PAREN_SET(recursed,paren+1);
2741 Newx(newframe,1,scan_frame);
2742 } else {
2743 if (flags & SCF_DO_SUBSTR) {
2744 scan_commit(pRExC_state,data,minlenp);
2745 data->longest = &(data->longest_float);
2746 }
2747 is_inf = is_inf_internal = 1;
2748 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2749 cl_anything(pRExC_state, data->start_class);
2750 flags &= ~SCF_DO_STCLASS;
2751 }
2752 } else {
2753 Newx(newframe,1,scan_frame);
2754 paren = stopparen;
2755 start = scan+2;
2756 end = regnext(scan);
2757 }
2758 if (newframe) {
2759 assert(start);
2760 assert(end);
2761 SAVEFREEPV(newframe);
2762 newframe->next = regnext(scan);
2763 newframe->last = last;
2764 newframe->stop = stopparen;
2765 newframe->prev = frame;
2766
2767 frame = newframe;
2768 scan = start;
2769 stopparen = paren;
2770 last = end;
2771
2772 continue;
2773 }
2774 }
2775 else if (OP(scan) == EXACT) {
2776 I32 l = STR_LEN(scan);
2777 UV uc;
2778 if (UTF) {
2779 const U8 * const s = (U8*)STRING(scan);
2780 l = utf8_length(s, s + l);
2781 uc = utf8_to_uvchr(s, NULL);
2782 } else {
2783 uc = *((U8*)STRING(scan));
2784 }
2785 min += l;
2786 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2787 /* The code below prefers earlier match for fixed
2788 offset, later match for variable offset. */
2789 if (data->last_end == -1) { /* Update the start info. */
2790 data->last_start_min = data->pos_min;
2791 data->last_start_max = is_inf
2792 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2793 }
8aa23a47
YO
2794 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2795 if (UTF)
2796 SvUTF8_on(data->last_found);
2797 {
2798 SV * const sv = data->last_found;
2799 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2800 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2801 if (mg && mg->mg_len >= 0)
2802 mg->mg_len += utf8_length((U8*)STRING(scan),
2803 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2804 }
8aa23a47
YO
2805 data->last_end = data->pos_min + l;
2806 data->pos_min += l; /* As in the first entry. */
2807 data->flags &= ~SF_BEFORE_EOL;
2808 }
2809 if (flags & SCF_DO_STCLASS_AND) {
2810 /* Check whether it is compatible with what we know already! */
2811 int compat = 1;
2812
2813 if (uc >= 0x100 ||
2814 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2815 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2816 && (!(data->start_class->flags & ANYOF_FOLD)
2817 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2818 )
2819 compat = 0;
2820 ANYOF_CLASS_ZERO(data->start_class);
2821 ANYOF_BITMAP_ZERO(data->start_class);
2822 if (compat)
2823 ANYOF_BITMAP_SET(data->start_class, uc);
2824 data->start_class->flags &= ~ANYOF_EOS;
2825 if (uc < 0x100)
2826 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2827 }
2828 else if (flags & SCF_DO_STCLASS_OR) {
2829 /* false positive possible if the class is case-folded */
2830 if (uc < 0x100)
2831 ANYOF_BITMAP_SET(data->start_class, uc);
2832 else
2833 data->start_class->flags |= ANYOF_UNICODE_ALL;
2834 data->start_class->flags &= ~ANYOF_EOS;
2835 cl_and(data->start_class, and_withp);
2836 }
2837 flags &= ~SCF_DO_STCLASS;
2838 }
2839 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2840 I32 l = STR_LEN(scan);
2841 UV uc = *((U8*)STRING(scan));
2842
2843 /* Search for fixed substrings supports EXACT only. */
2844 if (flags & SCF_DO_SUBSTR) {
2845 assert(data);
2846 scan_commit(pRExC_state, data, minlenp);
2847 }
2848 if (UTF) {
2849 const U8 * const s = (U8 *)STRING(scan);
2850 l = utf8_length(s, s + l);
2851 uc = utf8_to_uvchr(s, NULL);
2852 }
2853 min += l;
2854 if (flags & SCF_DO_SUBSTR)
2855 data->pos_min += l;
2856 if (flags & SCF_DO_STCLASS_AND) {
2857 /* Check whether it is compatible with what we know already! */
2858 int compat = 1;
2859
2860 if (uc >= 0x100 ||
2861 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2862 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2863 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2864 compat = 0;
2865 ANYOF_CLASS_ZERO(data->start_class);
2866 ANYOF_BITMAP_ZERO(data->start_class);
2867 if (compat) {
2868 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2869 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2870 data->start_class->flags |= ANYOF_FOLD;
2871 if (OP(scan) == EXACTFL)
2872 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2873 }
8aa23a47
YO
2874 }
2875 else if (flags & SCF_DO_STCLASS_OR) {
2876 if (data->start_class->flags & ANYOF_FOLD) {
2877 /* false positive possible if the class is case-folded.
2878 Assume that the locale settings are the same... */
1aa99e6b
IH
2879 if (uc < 0x100)
2880 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2881 data->start_class->flags &= ~ANYOF_EOS;
2882 }
8aa23a47 2883 cl_and(data->start_class, and_withp);
653099ff 2884 }
8aa23a47
YO
2885 flags &= ~SCF_DO_STCLASS;
2886 }
2887 else if (strchr((const char*)PL_varies,OP(scan))) {
2888 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2889 I32 f = flags, pos_before = 0;
2890 regnode * const oscan = scan;
2891 struct regnode_charclass_class this_class;
2892 struct regnode_charclass_class *oclass = NULL;
2893 I32 next_is_eval = 0;
2894
2895 switch (PL_regkind[OP(scan)]) {
2896 case WHILEM: /* End of (?:...)* . */
2897 scan = NEXTOPER(scan);
2898 goto finish;
2899 case PLUS:
2900 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2901 next = NEXTOPER(scan);
2902 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2903 mincount = 1;
2904 maxcount = REG_INFTY;
2905 next = regnext(scan);
2906 scan = NEXTOPER(scan);
2907 goto do_curly;
2908 }
2909 }
2910 if (flags & SCF_DO_SUBSTR)
2911 data->pos_min++;
2912 min++;
2913 /* Fall through. */
2914 case STAR:
2915 if (flags & SCF_DO_STCLASS) {
2916 mincount = 0;
2917 maxcount = REG_INFTY;
2918 next = regnext(scan);
2919 scan = NEXTOPER(scan);
2920 goto do_curly;
2921 }
2922 is_inf = is_inf_internal = 1;
2923 scan = regnext(scan);
c277df42 2924 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2925 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2926 data->longest = &(data->longest_float);
c277df42 2927 }
8aa23a47
YO
2928 goto optimize_curly_tail;
2929 case CURLY:
2930 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2931 && (scan->flags == stopparen))
2932 {
2933 mincount = 1;
2934 maxcount = 1;
2935 } else {
2936 mincount = ARG1(scan);
2937 maxcount = ARG2(scan);
653099ff 2938 }
8aa23a47
YO
2939 next = regnext(scan);
2940 if (OP(scan) == CURLYX) {
2941 I32 lp = (data ? *(data->last_closep) : 0);
2942 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2943 }
8aa23a47
YO
2944 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2945 next_is_eval = (OP(scan) == EVAL);
2946 do_curly:
2947 if (flags & SCF_DO_SUBSTR) {
2948 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2949 pos_before = data->pos_min;
b45f050a 2950 }
8aa23a47
YO
2951 if (data) {
2952 fl = data->flags;
2953 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2954 if (is_inf)
2955 data->flags |= SF_IS_INF;
2956 }
2957 if (flags & SCF_DO_STCLASS) {
2958 cl_init(pRExC_state, &this_class);
2959 oclass = data->start_class;
2960 data->start_class = &this_class;
2961 f |= SCF_DO_STCLASS_AND;
2962 f &= ~SCF_DO_STCLASS_OR;
2963 }
2964 /* These are the cases when once a subexpression
2965 fails at a particular position, it cannot succeed
2966 even after backtracking at the enclosing scope.
2967
2968 XXXX what if minimal match and we are at the
2969 initial run of {n,m}? */
2970 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2971 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2972
8aa23a47
YO
2973 /* This will finish on WHILEM, setting scan, or on NULL: */
2974 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2975 last, data, stopparen, recursed, NULL,
2976 (mincount == 0
2977 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2978
8aa23a47
YO
2979 if (flags & SCF_DO_STCLASS)
2980 data->start_class = oclass;
2981 if (mincount == 0 || minnext == 0) {
2982 if (flags & SCF_DO_STCLASS_OR) {
2983 cl_or(pRExC_state, data->start_class, &this_class);
2984 }
2985 else if (flags & SCF_DO_STCLASS_AND) {
2986 /* Switch to OR mode: cache the old value of
2987 * data->start_class */
2988 INIT_AND_WITHP;
2989 StructCopy(data->start_class, and_withp,
2990 struct regnode_charclass_class);
2991 flags &= ~SCF_DO_STCLASS_AND;
2992 StructCopy(&this_class, data->start_class,
2993 struct regnode_charclass_class);
2994 flags |= SCF_DO_STCLASS_OR;
2995 data->start_class->flags |= ANYOF_EOS;
2996 }
2997 } else { /* Non-zero len */
2998 if (flags & SCF_DO_STCLASS_OR) {
2999 cl_or(pRExC_state, data->start_class, &this_class);
3000 cl_and(data->start_class, and_withp);
3001 }
3002 else if (flags & SCF_DO_STCLASS_AND)
3003 cl_and(data->start_class, &this_class);
3004 flags &= ~SCF_DO_STCLASS;
3005 }
3006 if (!scan) /* It was not CURLYX, but CURLY. */
3007 scan = next;
3008 if ( /* ? quantifier ok, except for (?{ ... }) */
3009 (next_is_eval || !(mincount == 0 && maxcount == 1))
3010 && (minnext == 0) && (deltanext == 0)
3011 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3012 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3013 && ckWARN(WARN_REGEXP))
3014 {
3015 vWARN(RExC_parse,
3016 "Quantifier unexpected on zero-length expression");
3017 }
3018
3019 min += minnext * mincount;
3020 is_inf_internal |= ((maxcount == REG_INFTY
3021 && (minnext + deltanext) > 0)
3022 || deltanext == I32_MAX);
3023 is_inf |= is_inf_internal;
3024 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3025
3026 /* Try powerful optimization CURLYX => CURLYN. */
3027 if ( OP(oscan) == CURLYX && data
3028 && data->flags & SF_IN_PAR
3029 && !(data->flags & SF_HAS_EVAL)
3030 && !deltanext && minnext == 1 ) {
3031 /* Try to optimize to CURLYN. */
3032 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3033 regnode * const nxt1 = nxt;
497b47a8 3034#ifdef DEBUGGING
8aa23a47 3035 regnode *nxt2;
497b47a8 3036#endif
c277df42 3037
8aa23a47
YO
3038 /* Skip open. */
3039 nxt = regnext(nxt);
3040 if (!strchr((const char*)PL_simple,OP(nxt))
3041 && !(PL_regkind[OP(nxt)] == EXACT
3042 && STR_LEN(nxt) == 1))
3043 goto nogo;
497b47a8 3044#ifdef DEBUGGING
8aa23a47 3045 nxt2 = nxt;
497b47a8 3046#endif
8aa23a47
YO
3047 nxt = regnext(nxt);
3048 if (OP(nxt) != CLOSE)
3049 goto nogo;
3050 if (RExC_open_parens) {
3051 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3052 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3053 }
3054 /* Now we know that nxt2 is the only contents: */
3055 oscan->flags = (U8)ARG(nxt);
3056 OP(oscan) = CURLYN;
3057 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3058
c277df42 3059#ifdef DEBUGGING
8aa23a47
YO
3060 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3061 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3062 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3063 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3064 OP(nxt + 1) = OPTIMIZED; /* was count. */
3065 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3066#endif
8aa23a47
YO
3067 }
3068 nogo:
3069
3070 /* Try optimization CURLYX => CURLYM. */
3071 if ( OP(oscan) == CURLYX && data
3072 && !(data->flags & SF_HAS_PAR)
3073 && !(data->flags & SF_HAS_EVAL)
3074 && !deltanext /* atom is fixed width */
3075 && minnext != 0 /* CURLYM can't handle zero width */
3076 ) {
3077 /* XXXX How to optimize if data == 0? */
3078 /* Optimize to a simpler form. */
3079 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3080 regnode *nxt2;
3081
3082 OP(oscan) = CURLYM;
3083 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3084 && (OP(nxt2) != WHILEM))
3085 nxt = nxt2;
3086 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3087 /* Need to optimize away parenths. */
3088 if (data->flags & SF_IN_PAR) {
3089 /* Set the parenth number. */
3090 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3091
3092 if (OP(nxt) != CLOSE)
3093 FAIL("Panic opt close");
3094 oscan->flags = (U8)ARG(nxt);
3095 if (RExC_open_parens) {
3096 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3097 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3098 }
8aa23a47
YO
3099 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3100 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3101
c277df42 3102#ifdef DEBUGGING
8aa23a47
YO
3103 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3104 OP(nxt + 1) = OPTIMIZED; /* was count. */
3105 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3106 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3107#endif
c277df42 3108#if 0
8aa23a47
YO
3109 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3110 regnode *nnxt = regnext(nxt1);
3111
3112 if (nnxt == nxt) {
3113 if (reg_off_by_arg[OP(nxt1)])
3114 ARG_SET(nxt1, nxt2 - nxt1);
3115 else if (nxt2 - nxt1 < U16_MAX)
3116 NEXT_OFF(nxt1) = nxt2 - nxt1;
3117 else
3118 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3119 }
8aa23a47 3120 nxt1 = nnxt;
c277df42 3121 }
5d1c421c 3122#endif
8aa23a47
YO
3123 /* Optimize again: */
3124 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3125 NULL, stopparen, recursed, NULL, 0,depth+1);
3126 }
3127 else
3128 oscan->flags = 0;
3129 }
3130 else if ((OP(oscan) == CURLYX)
3131 && (flags & SCF_WHILEM_VISITED_POS)
3132 /* See the comment on a similar expression above.
3133 However, this time it not a subexpression
3134 we care about, but the expression itself. */
3135 && (maxcount == REG_INFTY)
3136 && data && ++data->whilem_c < 16) {
3137 /* This stays as CURLYX, we can put the count/of pair. */
3138 /* Find WHILEM (as in regexec.c) */
3139 regnode *nxt = oscan + NEXT_OFF(oscan);
3140
3141 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3142 nxt += ARG(nxt);
3143 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3144 | (RExC_whilem_seen << 4)); /* On WHILEM */
3145 }
3146 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3147 pars++;
3148 if (flags & SCF_DO_SUBSTR) {
3149 SV *last_str = NULL;
3150 int counted = mincount != 0;
a0ed51b3 3151
8aa23a47
YO
3152 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3153#if defined(SPARC64_GCC_WORKAROUND)
3154 I32 b = 0;
3155 STRLEN l = 0;
3156 const char *s = NULL;
3157 I32 old = 0;
b515a41d 3158
8aa23a47
YO
3159 if (pos_before >= data->last_start_min)
3160 b = pos_before;
3161 else
3162 b = data->last_start_min;
b515a41d 3163
8aa23a47
YO
3164 l = 0;
3165 s = SvPV_const(data->last_found, l);
3166 old = b - data->last_start_min;
3167
3168#else
3169 I32 b = pos_before >= data->last_start_min
3170 ? pos_before : data->last_start_min;
3171 STRLEN l;
3172 const char * const s = SvPV_const(data->last_found, l);
3173 I32 old = b - data->last_start_min;
3174#endif
3175
3176 if (UTF)
3177 old = utf8_hop((U8*)s, old) - (U8*)s;
3178
3179 l -= old;
3180 /* Get the added string: */
3181 last_str = newSVpvn(s + old, l);
3182 if (UTF)
3183 SvUTF8_on(last_str);
3184 if (deltanext == 0 && pos_before == b) {
3185 /* What was added is a constant string */
3186 if (mincount > 1) {
3187 SvGROW(last_str, (mincount * l) + 1);
3188 repeatcpy(SvPVX(last_str) + l,
3189 SvPVX_const(last_str), l, mincount - 1);
3190 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3191 /* Add additional parts. */
3192 SvCUR_set(data->last_found,
3193 SvCUR(data->last_found) - l);
3194 sv_catsv(data->last_found, last_str);
3195 {
3196 SV * sv = data->last_found;
3197 MAGIC *mg =
3198 SvUTF8(sv) && SvMAGICAL(sv) ?
3199 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3200 if (mg && mg->mg_len >= 0)
3201 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3202 }
8aa23a47 3203 data->last_end += l * (mincount - 1);
b515a41d 3204 }
8aa23a47
YO
3205 } else {
3206 /* start offset must point into the last copy */
3207 data->last_start_min += minnext * (mincount - 1);
3208 data->last_start_max += is_inf ? I32_MAX
3209 : (maxcount - 1) * (minnext + data->pos_delta);
3210 }
c277df42 3211 }
8aa23a47
YO
3212 /* It is counted once already... */
3213 data->pos_min += minnext * (mincount - counted);
3214 data->pos_delta += - counted * deltanext +
3215 (minnext + deltanext) * maxcount - minnext * mincount;
3216 if (mincount != maxcount) {
3217 /* Cannot extend fixed substrings found inside
3218 the group. */
3219 scan_commit(pRExC_state,data,minlenp);
3220 if (mincount && last_str) {
3221 SV * const sv = data->last_found;
3222 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3223 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3224
3225 if (mg)
3226 mg->mg_len = -1;
3227 sv_setsv(sv, last_str);
3228 data->last_end = data->pos_min;
3229 data->last_start_min =
3230 data->pos_min - CHR_SVLEN(last_str);
3231 data->last_start_max = is_inf
3232 ? I32_MAX
3233 : data->pos_min + data->pos_delta
3234 - CHR_SVLEN(last_str);
3235 }
3236 data->longest = &(data->longest_float);
3237 }
3238 SvREFCNT_dec(last_str);
c277df42 3239 }
8aa23a47
YO
3240 if (data && (fl & SF_HAS_EVAL))
3241 data->flags |= SF_HAS_EVAL;
3242 optimize_curly_tail:
3243 if (OP(oscan) != CURLYX) {
3244 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3245 && NEXT_OFF(next))
3246 NEXT_OFF(oscan) += NEXT_OFF(next);
3247 }
3248 continue;
3249 default: /* REF and CLUMP only? */
3250 if (flags & SCF_DO_SUBSTR) {
3251 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3252 data->longest = &(data->longest_float);
3253 }
3254 is_inf = is_inf_internal = 1;
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_anything(pRExC_state, data->start_class);
3257 flags &= ~SCF_DO_STCLASS;
3258 break;
c277df42 3259 }
8aa23a47
YO
3260 }
3261 else if (strchr((const char*)PL_simple,OP(scan))) {
3262 int value = 0;
653099ff 3263
8aa23a47
YO
3264 if (flags & SCF_DO_SUBSTR) {
3265 scan_commit(pRExC_state,data,minlenp);
3266 data->pos_min++;
3267 }
3268 min++;
3269 if (flags & SCF_DO_STCLASS) {
3270 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3271
8aa23a47
YO
3272 /* Some of the logic below assumes that switching
3273 locale on will only add false positives. */
3274 switch (PL_regkind[OP(scan)]) {
3275 case SANY:
3276 default:
3277 do_default:
3278 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3279 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3280 cl_anything(pRExC_state, data->start_class);
3281 break;
3282 case REG_ANY:
3283 if (OP(scan) == SANY)
3284 goto do_default;
3285 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3286 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3287 || (data->start_class->flags & ANYOF_CLASS));
3288 cl_anything(pRExC_state, data->start_class);
653099ff 3289 }
8aa23a47
YO
3290 if (flags & SCF_DO_STCLASS_AND || !value)
3291 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3292 break;
3293 case ANYOF:
3294 if (flags & SCF_DO_STCLASS_AND)
3295 cl_and(data->start_class,
3296 (struct regnode_charclass_class*)scan);
653099ff 3297 else
8aa23a47
YO
3298 cl_or(pRExC_state, data->start_class,
3299 (struct regnode_charclass_class*)scan);
3300 break;
3301 case ALNUM:
3302 if (flags & SCF_DO_STCLASS_AND) {
3303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3305 for (value = 0; value < 256; value++)
3306 if (!isALNUM(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 }
653099ff 3309 }
8aa23a47
YO
3310 else {
3311 if (data->start_class->flags & ANYOF_LOCALE)
3312 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3313 else {
3314 for (value = 0; value < 256; value++)
3315 if (isALNUM(value))
3316 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3317 }
8aa23a47
YO
3318 }
3319 break;
3320 case ALNUML:
3321 if (flags & SCF_DO_STCLASS_AND) {
3322 if (data->start_class->flags & ANYOF_LOCALE)
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3324 }
3325 else {
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3327 data->start_class->flags |= ANYOF_LOCALE;
3328 }
3329 break;
3330 case NALNUM:
3331 if (flags & SCF_DO_STCLASS_AND) {
3332 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3333 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3334 for (value = 0; value < 256; value++)
3335 if (isALNUM(value))
3336 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3337 }
3338 }
8aa23a47
YO
3339 else {
3340 if (data->start_class->flags & ANYOF_LOCALE)
3341 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3342 else {
3343 for (value = 0; value < 256; value++)
3344 if (!isALNUM(value))
3345 ANYOF_BITMAP_SET(data->start_class, value);
3346 }
653099ff 3347 }
8aa23a47
YO
3348 break;
3349 case NALNUML:
3350 if (flags & SCF_DO_STCLASS_AND) {
3351 if (data->start_class->flags & ANYOF_LOCALE)
3352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3353 }
8aa23a47
YO
3354 else {
3355 data->start_class->flags |= ANYOF_LOCALE;
3356 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3357 }
3358 break;
3359 case SPACE:
3360 if (flags & SCF_DO_STCLASS_AND) {
3361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3363 for (value = 0; value < 256; value++)
3364 if (!isSPACE(value))
3365 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3366 }
3367 }
8aa23a47
YO
3368 else {
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3371 else {
3372 for (value = 0; value < 256; value++)
3373 if (isSPACE(value))
3374 ANYOF_BITMAP_SET(data->start_class, value);
3375 }
653099ff 3376 }
8aa23a47
YO
3377 break;
3378 case SPACEL:
3379 if (flags & SCF_DO_STCLASS_AND) {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3382 }
3383 else {
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3386 }
3387 break;
3388 case NSPACE:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3392 for (value = 0; value < 256; value++)
3393 if (isSPACE(value))
3394 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3395 }
8aa23a47
YO
3396 }
3397 else {
3398 if (data->start_class->flags & ANYOF_LOCALE)
3399 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3400 else {
3401 for (value = 0; value < 256; value++)
3402 if (!isSPACE(value))
3403 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3404 }
3405 }
8aa23a47
YO
3406 break;
3407 case NSPACEL:
3408 if (flags & SCF_DO_STCLASS_AND) {
3409 if (data->start_class->flags & ANYOF_LOCALE) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3411 for (value = 0; value < 256; value++)
3412 if (!isSPACE(value))
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3414 }
653099ff 3415 }
8aa23a47
YO
3416 else {
3417 data->start_class->flags |= ANYOF_LOCALE;
3418 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3419 }
3420 break;
3421 case DIGIT:
3422 if (flags & SCF_DO_STCLASS_AND) {
3423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3424 for (value = 0; value < 256; value++)
3425 if (!isDIGIT(value))
3426 ANYOF_BITMAP_CLEAR(data->start_class, value);
3427 }
3428 else {
3429 if (data->start_class->flags & ANYOF_LOCALE)
3430 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3431 else {
3432 for (value = 0; value < 256; value++)
3433 if (isDIGIT(value))
3434 ANYOF_BITMAP_SET(data->start_class, value);
3435 }
3436 }
3437 break;
3438 case NDIGIT:
3439 if (flags & SCF_DO_STCLASS_AND) {
3440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3441 for (value = 0; value < 256; value++)
3442 if (isDIGIT(value))
3443 ANYOF_BITMAP_CLEAR(data->start_class, value);
3444 }
3445 else {
3446 if (data->start_class->flags & ANYOF_LOCALE)
3447 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3448 else {
3449 for (value = 0; value < 256; value++)
3450 if (!isDIGIT(value))
3451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3452 }
3453 }
8aa23a47
YO
3454 break;
3455 }
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3459 }
3460 }
3461 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3462 data->flags |= (OP(scan) == MEOL
3463 ? SF_BEFORE_MEOL
3464 : SF_BEFORE_SEOL);
3465 }
3466 else if ( PL_regkind[OP(scan)] == BRANCHJ
3467 /* Lookbehind, or need to calculate parens/evals/stclass: */
3468 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3469 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3470 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3471 || OP(scan) == UNLESSM )
3472 {
3473 /* Negative Lookahead/lookbehind
3474 In this case we can't do fixed string optimisation.
3475 */
1de06328 3476
8aa23a47
YO
3477 I32 deltanext, minnext, fake = 0;
3478 regnode *nscan;
3479 struct regnode_charclass_class intrnl;
3480 int f = 0;
1de06328 3481
8aa23a47
YO
3482 data_fake.flags = 0;
3483 if (data) {
3484 data_fake.whilem_c = data->whilem_c;
3485 data_fake.last_closep = data->last_closep;
c277df42 3486 }
8aa23a47
YO
3487 else
3488 data_fake.last_closep = &fake;
58e23c8d 3489 data_fake.pos_delta = delta;
8aa23a47
YO
3490 if ( flags & SCF_DO_STCLASS && !scan->flags
3491 && OP(scan) == IFMATCH ) { /* Lookahead */
3492 cl_init(pRExC_state, &intrnl);
3493 data_fake.start_class = &intrnl;
3494 f |= SCF_DO_STCLASS_AND;
3495 }
3496 if (flags & SCF_WHILEM_VISITED_POS)
3497 f |= SCF_WHILEM_VISITED_POS;
3498 next = regnext(scan);
3499 nscan = NEXTOPER(NEXTOPER(scan));
3500 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3501 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3502 if (scan->flags) {
3503 if (deltanext) {
58e23c8d 3504 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3505 }
3506 else if (minnext > (I32)U8_MAX) {
58e23c8d 3507 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3508 }
3509 scan->flags = (U8)minnext;
3510 }
3511 if (data) {
3512 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3513 pars++;
3514 if (data_fake.flags & SF_HAS_EVAL)
3515 data->flags |= SF_HAS_EVAL;
3516 data->whilem_c = data_fake.whilem_c;
3517 }
3518 if (f & SCF_DO_STCLASS_AND) {
3519 const int was = (data->start_class->flags & ANYOF_EOS);
3520
3521 cl_and(data->start_class, &intrnl);
3522 if (was)
3523 data->start_class->flags |= ANYOF_EOS;
3524 }
cb434fcc 3525 }
8aa23a47
YO
3526#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3527 else {
3528 /* Positive Lookahead/lookbehind
3529 In this case we can do fixed string optimisation,
3530 but we must be careful about it. Note in the case of
3531 lookbehind the positions will be offset by the minimum
3532 length of the pattern, something we won't know about
3533 until after the recurse.
3534 */
3535 I32 deltanext, fake = 0;
3536 regnode *nscan;
3537 struct regnode_charclass_class intrnl;
3538 int f = 0;
3539 /* We use SAVEFREEPV so that when the full compile
3540 is finished perl will clean up the allocated
3541 minlens when its all done. This was we don't
3542 have to worry about freeing them when we know
3543 they wont be used, which would be a pain.
3544 */
3545 I32 *minnextp;
3546 Newx( minnextp, 1, I32 );
3547 SAVEFREEPV(minnextp);
3548
3549 if (data) {
3550 StructCopy(data, &data_fake, scan_data_t);
3551 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3552 f |= SCF_DO_SUBSTR;
3553 if (scan->flags)
3554 scan_commit(pRExC_state, &data_fake,minlenp);
3555 data_fake.last_found=newSVsv(data->last_found);
3556 }
3557 }
3558 else
3559 data_fake.last_closep = &fake;
3560 data_fake.flags = 0;
58e23c8d 3561 data_fake.pos_delta = delta;
8aa23a47
YO
3562 if (is_inf)
3563 data_fake.flags |= SF_IS_INF;
3564 if ( flags & SCF_DO_STCLASS && !scan->flags
3565 && OP(scan) == IFMATCH ) { /* Lookahead */
3566 cl_init(pRExC_state, &intrnl);
3567 data_fake.start_class = &intrnl;
3568 f |= SCF_DO_STCLASS_AND;
3569 }
3570 if (flags & SCF_WHILEM_VISITED_POS)
3571 f |= SCF_WHILEM_VISITED_POS;
3572 next = regnext(scan);
3573 nscan = NEXTOPER(NEXTOPER(scan));
3574
3575 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3576 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3577 if (scan->flags) {
3578 if (deltanext) {
58e23c8d 3579 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3580 }
3581 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3582 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3583 }
3584 scan->flags = (U8)*minnextp;
3585 }
3586
3587 *minnextp += min;
3588
3589 if (f & SCF_DO_STCLASS_AND) {
3590 const int was = (data->start_class->flags & ANYOF_EOS);
3591
3592 cl_and(data->start_class, &intrnl);
3593 if (was)
3594 data->start_class->flags |= ANYOF_EOS;
3595 }
3596 if (data) {
3597 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3598 pars++;
3599 if (data_fake.flags & SF_HAS_EVAL)
3600 data->flags |= SF_HAS_EVAL;
3601 data->whilem_c = data_fake.whilem_c;
3602 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3603 if (RExC_rx->minlen<*minnextp)
3604 RExC_rx->minlen=*minnextp;
3605 scan_commit(pRExC_state, &data_fake, minnextp);
3606 SvREFCNT_dec(data_fake.last_found);
3607
3608 if ( data_fake.minlen_fixed != minlenp )
3609 {
3610 data->offset_fixed= data_fake.offset_fixed;
3611 data->minlen_fixed= data_fake.minlen_fixed;
3612 data->lookbehind_fixed+= scan->flags;
3613 }
3614 if ( data_fake.minlen_float != minlenp )
3615 {
3616 data->minlen_float= data_fake.minlen_float;
3617 data->offset_float_min=data_fake.offset_float_min;
3618 data->offset_float_max=data_fake.offset_float_max;
3619 data->lookbehind_float+= scan->flags;
3620 }
3621 }
3622 }
3623
3624
40d049e4 3625 }
8aa23a47
YO
3626#endif
3627 }
3628 else if (OP(scan) == OPEN) {
3629 if (stopparen != (I32)ARG(scan))
3630 pars++;
3631 }
3632 else if (OP(scan) == CLOSE) {
3633 if (stopparen == (I32)ARG(scan)) {
3634 break;
3635 }
3636 if ((I32)ARG(scan) == is_par) {
3637 next = regnext(scan);
b515a41d 3638
8aa23a47
YO
3639 if ( next && (OP(next) != WHILEM) && next < last)
3640 is_par = 0; /* Disable optimization */
40d049e4 3641 }
8aa23a47
YO
3642 if (data)
3643 *(data->last_closep) = ARG(scan);
3644 }
3645 else if (OP(scan) == EVAL) {
c277df42
IZ
3646 if (data)
3647 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3648 }
3649 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3650 if (flags & SCF_DO_SUBSTR) {
3651 scan_commit(pRExC_state,data,minlenp);
3652 flags &= ~SCF_DO_SUBSTR;
40d049e4 3653 }
8aa23a47
YO
3654 if (data && OP(scan)==ACCEPT) {
3655 data->flags |= SCF_SEEN_ACCEPT;
3656 if (stopmin > min)
3657 stopmin = min;
e2e6a0f1 3658 }
8aa23a47
YO
3659 }
3660 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3661 {
0f5d15d6 3662 if (flags & SCF_DO_SUBSTR) {
1de06328 3663 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3664 data->longest = &(data->longest_float);
3665 }
3666 is_inf = is_inf_internal = 1;
653099ff 3667 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3668 cl_anything(pRExC_state, data->start_class);
96776eda 3669 flags &= ~SCF_DO_STCLASS;
8aa23a47 3670 }
58e23c8d 3671 else if (OP(scan) == GPOS) {
bbe252da 3672 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3673 !(delta || is_inf || (data && data->pos_delta)))
3674 {
bbe252da
YO
3675 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3676 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3677 if (RExC_rx->gofs < (U32)min)
3678 RExC_rx->gofs = min;
3679 } else {
bbe252da 3680 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3681 RExC_rx->gofs = 0;
3682 }
3683 }
786e8c11 3684#ifdef TRIE_STUDY_OPT
40d049e4 3685#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3686 else if (PL_regkind[OP(scan)] == TRIE) {
3687 /* NOTE - There is similar code to this block above for handling
3688 BRANCH nodes on the initial study. If you change stuff here
3689 check there too. */
3690 regnode *trie_node= scan;
3691 regnode *tail= regnext(scan);
f8fc2ecf 3692 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3693 I32 max1 = 0, min1 = I32_MAX;
3694 struct regnode_charclass_class accum;
3695
3696 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3697 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3698 if (flags & SCF_DO_STCLASS)
3699 cl_init_zero(pRExC_state, &accum);
3700
3701 if (!trie->jump) {
3702 min1= trie->minlen;
3703 max1= trie->maxlen;
3704 } else {
3705 const regnode *nextbranch= NULL;
3706 U32 word;
3707
3708 for ( word=1 ; word <= trie->wordcount ; word++)
3709 {
3710 I32 deltanext=0, minnext=0, f = 0, fake;
3711 struct regnode_charclass_class this_class;
3712
3713 data_fake.flags = 0;
3714 if (data) {
3715 data_fake.whilem_c = data->whilem_c;
3716 data_fake.last_closep = data->last_closep;
3717 }
3718 else
3719 data_fake.last_closep = &fake;
58e23c8d 3720 data_fake.pos_delta = delta;
8aa23a47
YO
3721 if (flags & SCF_DO_STCLASS) {
3722 cl_init(pRExC_state, &this_class);
3723 data_fake.start_class = &this_class;
3724 f = SCF_DO_STCLASS_AND;
3725 }
3726 if (flags & SCF_WHILEM_VISITED_POS)
3727 f |= SCF_WHILEM_VISITED_POS;
3728
3729 if (trie->jump[word]) {
3730 if (!nextbranch)
3731 nextbranch = trie_node + trie->jump[0];
3732 scan= trie_node + trie->jump[word];
3733 /* We go from the jump point to the branch that follows
3734 it. Note this means we need the vestigal unused branches
3735 even though they arent otherwise used.
3736 */
3737 minnext = study_chunk(pRExC_state, &scan, minlenp,
3738 &deltanext, (regnode *)nextbranch, &data_fake,
3739 stopparen, recursed, NULL, f,depth+1);
3740 }
3741 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3742 nextbranch= regnext((regnode*)nextbranch);
3743
3744 if (min1 > (I32)(minnext + trie->minlen))
3745 min1 = minnext + trie->minlen;
3746 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3747 max1 = minnext + deltanext + trie->maxlen;
3748 if (deltanext == I32_MAX)
3749 is_inf = is_inf_internal = 1;
3750
3751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3752 pars++;
3753 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3754 if ( stopmin > min + min1)
3755 stopmin = min + min1;
3756 flags &= ~SCF_DO_SUBSTR;
3757 if (data)
3758 data->flags |= SCF_SEEN_ACCEPT;
3759 }
3760 if (data) {
3761 if (data_fake.flags & SF_HAS_EVAL)
3762 data->flags |= SF_HAS_EVAL;
3763 data->whilem_c = data_fake.whilem_c;
3764 }
3765 if (flags & SCF_DO_STCLASS)
3766 cl_or(pRExC_state, &accum, &this_class);
3767 }
3768 }
3769 if (flags & SCF_DO_SUBSTR) {
3770 data->pos_min += min1;
3771 data->pos_delta += max1 - min1;
3772 if (max1 != min1 || is_inf)
3773 data->longest = &(data->longest_float);
3774 }
3775 min += min1;
3776 delta += max1 - min1;
3777 if (flags & SCF_DO_STCLASS_OR) {
3778 cl_or(pRExC_state, data->start_class, &accum);
3779 if (min1) {
3780 cl_and(data->start_class, and_withp);
3781 flags &= ~SCF_DO_STCLASS;
3782 }
3783 }
3784 else if (flags & SCF_DO_STCLASS_AND) {
3785 if (min1) {
3786 cl_and(data->start_class, &accum);
3787 flags &= ~SCF_DO_STCLASS;
3788 }
3789 else {
3790 /* Switch to OR mode: cache the old value of
3791 * data->start_class */
3792 INIT_AND_WITHP;
3793 StructCopy(data->start_class, and_withp,
3794 struct regnode_charclass_class);
3795 flags &= ~SCF_DO_STCLASS_AND;
3796 StructCopy(&accum, data->start_class,
3797 struct regnode_charclass_class);
3798 flags |= SCF_DO_STCLASS_OR;
3799 data->start_class->flags |= ANYOF_EOS;
3800 }
3801 }
3802 scan= tail;
3803 continue;
3804 }
786e8c11 3805#else
8aa23a47 3806 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3807 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3808 U8*bang=NULL;
3809
3810 min += trie->minlen;
3811 delta += (trie->maxlen - trie->minlen);
3812 flags &= ~SCF_DO_STCLASS; /* xxx */
3813 if (flags & SCF_DO_SUBSTR) {
3814 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3815 data->pos_min += trie->minlen;
3816 data->pos_delta += (trie->maxlen - trie->minlen);
3817 if (trie->maxlen != trie->minlen)
3818 data->longest = &(data->longest_float);
3819 }
3820 if (trie->jump) /* no more substrings -- for now /grr*/
3821 flags &= ~SCF_DO_SUBSTR;
b515a41d 3822 }
8aa23a47
YO
3823#endif /* old or new */
3824#endif /* TRIE_STUDY_OPT */
3825 /* Else: zero-length, ignore. */
3826 scan = regnext(scan);
3827 }
3828 if (frame) {
3829 last = frame->last;
3830 scan = frame->next;
3831 stopparen = frame->stop;
3832 frame = frame->prev;
3833 goto fake_study_recurse;
c277df42
IZ
3834 }
3835
3836 finish:
8aa23a47
YO
3837 assert(!frame);
3838
c277df42 3839 *scanp = scan;
aca2d497 3840 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3841 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3842 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3843 if (is_par > (I32)U8_MAX)
c277df42
IZ
3844 is_par = 0;
3845 if (is_par && pars==1 && data) {
3846 data->flags |= SF_IN_PAR;
3847 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3848 }
3849 else if (pars && data) {
c277df42
IZ
3850 data->flags |= SF_HAS_PAR;
3851 data->flags &= ~SF_IN_PAR;
3852 }
653099ff 3853 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3854 cl_and(data->start_class, and_withp);
786e8c11
YO
3855 if (flags & SCF_TRIE_RESTUDY)
3856 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3857
3858 DEBUG_STUDYDATA(data,depth);
3859
e2e6a0f1 3860 return min < stopmin ? min : stopmin;
c277df42
IZ
3861}
3862
2eccd3b2
NC
3863STATIC U32
3864S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3865{
4a4e7719
NC
3866 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3867
3868 Renewc(RExC_rxi->data,
3869 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3870 char, struct reg_data);
3871 if(count)
f8fc2ecf 3872 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3873 else
f8fc2ecf 3874 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3875 RExC_rxi->data->count = count + n;
3876 Copy(s, RExC_rxi->data->what + count, n, U8);
3877 return count;
c277df42
IZ
3878}
3879
76234dfb 3880#ifndef PERL_IN_XSUB_RE
d88dccdf 3881void
864dbfa3 3882Perl_reginitcolors(pTHX)
d88dccdf 3883{
97aff369 3884 dVAR;
1df70142 3885 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3886 if (s) {
1df70142
AL
3887 char *t = savepv(s);
3888 int i = 0;
3889 PL_colors[0] = t;
d88dccdf 3890 while (++i < 6) {
1df70142
AL
3891 t = strchr(t, '\t');
3892 if (t) {
3893 *t = '\0';
3894 PL_colors[i] = ++t;
d88dccdf
IZ
3895 }
3896 else
1df70142 3897 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3898 }
3899 } else {
1df70142 3900 int i = 0;
b81d288d 3901 while (i < 6)
06b5626a 3902 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3903 }
3904 PL_colorset = 1;
3905}
76234dfb 3906#endif
8615cb43 3907
07be1b83 3908
786e8c11
YO
3909#ifdef TRIE_STUDY_OPT
3910#define CHECK_RESTUDY_GOTO \
3911 if ( \
3912 (data.flags & SCF_TRIE_RESTUDY) \
3913 && ! restudied++ \
3914 ) goto reStudy
3915#else
3916#define CHECK_RESTUDY_GOTO
3917#endif
f9f4320a 3918
a687059c 3919/*
e50aee73 3920 - pregcomp - compile a regular expression into internal code
a687059c
LW
3921 *
3922 * We can't allocate space until we know how big the compiled form will be,
3923 * but we can't compile it (and thus know how big it is) until we've got a
3924 * place to put the code. So we cheat: we compile it twice, once with code
3925 * generation turned off and size counting turned on, and once "for real".
3926 * This also means that we don't allocate space until we are sure that the
3927 * thing really will compile successfully, and we never have to move the
3928 * code and thus invalidate pointers into it. (Note that it has to be in
3929 * one piece because free() must be able to free it all.) [NB: not true in perl]
3930 *
3931 * Beware that the optimization-preparation code in here knows about some
3932 * of the structure of the compiled regexp. [I'll say.]
3933 */
b9b4dddf
YO
3934
3935
3936
f9f4320a 3937#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3938#define RE_ENGINE_PTR &PL_core_reg_engine
3939#else
f9f4320a
YO
3940extern const struct regexp_engine my_reg_engine;
3941#define RE_ENGINE_PTR &my_reg_engine
3942#endif
b9b4dddf
YO
3943/* these make a few things look better, to avoid indentation */
3944#define BEGIN_BLOCK {
f9f4320a
YO
3945#define END_BLOCK }
3946
a687059c 3947regexp *
864dbfa3 3948Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3949{
97aff369 3950 dVAR;
f9f4320a
YO
3951 GET_RE_DEBUG_FLAGS_DECL;
3952 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3953#ifndef PERL_IN_XSUB_RE
3954 BEGIN_BLOCK
f9f4320a
YO
3955 /* Dispatch a request to compile a regexp to correct
3956 regexp engine. */
3957 HV * const table = GvHV(PL_hintgv);
3958 if (table) {
3959 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3960 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3961 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3962 DEBUG_COMPILE_r({
8d8756e7 3963 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3964 SvIV(*ptr));
3965 });
f2f78491 3966 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3967 }
b9b4dddf
YO
3968 }
3969 END_BLOCK
3970#endif
3971 BEGIN_BLOCK
a0d0e21e 3972 register regexp *r;
f8fc2ecf 3973 register regexp_internal *ri;
c277df42 3974 regnode *scan;
c277df42 3975 regnode *first;
a0d0e21e 3976 I32 flags;
a0d0e21e
LW
3977 I32 minlen = 0;
3978 I32 sawplus = 0;
3979 I32 sawopen = 0;
2c2d71f5 3980 scan_data_t data;
830247a4 3981 RExC_state_t RExC_state;
be8e71aa 3982 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3983#ifdef TRIE_STUDY_OPT
3984 int restudied= 0;
3985 RExC_state_t copyRExC_state;
3986#endif
a0d0e21e 3987 if (exp == NULL)
c277df42 3988 FAIL("NULL regexp argument");
a0d0e21e 3989
a5961de5 3990 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3991
5cfc7842 3992 RExC_precomp = exp;
a3621e74 3993 DEBUG_COMPILE_r({
ab3bbdeb
YO
3994 SV *dsv= sv_newmortal();
3995 RE_PV_QUOTED_DECL(s, RExC_utf8,
3996 dsv, RExC_precomp, (xend - exp), 60);
3997 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3998 PL_colors[4],PL_colors[5],s);
a5961de5 3999 });
e2509266 4000 RExC_flags = pm->op_pmflags;
830247a4 4001 RExC_sawback = 0;
bbce6d69 4002
830247a4
IZ
4003 RExC_seen = 0;
4004 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4005 RExC_seen_evals = 0;
4006 RExC_extralen = 0;
c277df42 4007
bbce6d69 4008 /* First pass: determine size, legality. */
830247a4 4009 RExC_parse = exp;
fac92740 4010 RExC_start = exp;
830247a4
IZ
4011 RExC_end = xend;
4012 RExC_naughty = 0;
4013 RExC_npar = 1;
c74340f9 4014 RExC_cpar = 1;
e2e6a0f1 4015 RExC_nestroot = 0;
830247a4
IZ
4016 RExC_size = 0L;
4017 RExC_emit = &PL_regdummy;
4018 RExC_whilem_seen = 0;
fc8cd66c 4019 RExC_charnames = NULL;
40d049e4
YO
4020 RExC_open_parens = NULL;
4021 RExC_close_parens = NULL;
4022 RExC_opend = NULL;
81714fb9 4023 RExC_paren_names = NULL;
40d049e4
YO
4024 RExC_recurse = NULL;
4025 RExC_recurse_count = 0;
81714fb9 4026
85ddcde9
JH
4027#if 0 /* REGC() is (currently) a NOP at the first pass.
4028 * Clever compilers notice this and complain. --jhi */
830247a4 4029 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4030#endif
3dab1dad
YO
4031 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4032 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4033 RExC_precomp = NULL;
a0d0e21e
LW
4034 return(NULL);
4035 }
07be1b83 4036 DEBUG_PARSE_r({
81714fb9
YO
4037 PerlIO_printf(Perl_debug_log,
4038 "Required size %"IVdf" nodes\n"
4039 "Starting second pass (creation)\n",
4040 (IV)RExC_size);
07be1b83
YO
4041 RExC_lastnum=0;
4042 RExC_lastparse=NULL;
4043 });
c277df42
IZ
4044 /* Small enough for pointer-storage convention?
4045 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4046 if (RExC_size >= 0x10000L && RExC_extralen)
4047 RExC_size += RExC_extralen;
c277df42 4048 else
830247a4
IZ
4049 RExC_extralen = 0;
4050 if (RExC_whilem_seen > 15)
4051 RExC_whilem_seen = 15;
a0d0e21e 4052
e2e6a0f1
YO
4053#ifdef DEBUGGING
4054 /* Make room for a sentinel value at the end of the program */
4055 RExC_size++;
4056#endif
4057
f9f4320a
YO
4058 /* Allocate space and zero-initialize. Note, the two step process
4059 of zeroing when in debug mode, thus anything assigned has to
4060 happen after that */
f8fc2ecf
YO
4061 Newxz(r, 1, regexp);
4062 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4063 char, regexp_internal);
4064 if ( r == NULL || ri == NULL )
b45f050a 4065 FAIL("Regexp out of space");
0f79a09d
GS
4066#ifdef DEBUGGING
4067 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4068 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4069#else
f8fc2ecf
YO
4070 /* bulk initialize base fields with 0. */
4071 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4072#endif
58e23c8d
YO
4073
4074 /* non-zero initialization begins here */
f8fc2ecf 4075 RXi_SET( r, ri );
f9f4320a 4076 r->engine= RE_ENGINE_PTR;
c277df42 4077 r->refcnt = 1;
bbce6d69 4078 r->prelen = xend - exp;
5cfc7842 4079 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4080 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4081 r->intflags = 0;
830247a4 4082 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4083
6bda09f9 4084 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4085 Newxz(RExC_open_parens, RExC_npar,regnode *);
4086 SAVEFREEPV(RExC_open_parens);
4087 Newxz(RExC_close_parens,RExC_npar,regnode *);
4088 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4089 }
4090
4091 /* Useful during FAIL. */
f8fc2ecf
YO
4092 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4093 if (ri->offsets) {
4094 ri->offsets[0] = RExC_size;
fac92740 4095 }
a3621e74 4096 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4097 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4098 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4099 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4100
830247a4 4101 RExC_rx = r;
f8fc2ecf 4102 RExC_rxi = ri;
bbce6d69 4103
4104 /* Second pass: emit code. */
e2509266 4105 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4106 RExC_parse = exp;
4107 RExC_end = xend;
4108 RExC_naughty = 0;
4109 RExC_npar = 1;
c74340f9 4110 RExC_cpar = 1;
f8fc2ecf
YO
4111 RExC_emit_start = ri->program;
4112 RExC_emit = ri->program;
e2e6a0f1
YO
4113#ifdef DEBUGGING
4114 /* put a sentinal on the end of the program so we can check for
4115 overwrites */
f8fc2ecf 4116 ri->program[RExC_size].type = 255;
e2e6a0f1 4117#endif
2cd61cdb 4118 /* Store the count of eval-groups for security checks: */
786e8c11 4119 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4120 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4121 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4122 return(NULL);
6bda09f9 4123
07be1b83
YO
4124 /* XXXX To minimize changes to RE engine we always allocate
4125 3-units-long substrs field. */
4126 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4127 if (RExC_recurse_count) {
4128 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4129 SAVEFREEPV(RExC_recurse);
4130 }
a0d0e21e 4131
07be1b83 4132reStudy:
1de06328 4133 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4134 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4135
07be1b83
YO
4136#ifdef TRIE_STUDY_OPT
4137 if ( restudied ) {
5d458dd8 4138 U32 seen=RExC_seen;
07be1b83 4139 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4140
4141 RExC_state = copyRExC_state;
4142 if (seen & REG_TOP_LEVEL_BRANCHES)
4143 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4144 else
4145 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4146 if (data.last_found) {
07be1b83 4147 SvREFCNT_dec(data.longest_fixed);
07be1b83 4148 SvREFCNT_dec(data.longest_float);
07be1b83 4149 SvREFCNT_dec(data.last_found);
1de06328 4150 }
40d049e4 4151 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4152 } else {
40d049e4 4153 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4154 copyRExC_state = RExC_state;
07be1b83 4155 }
40d049e4
YO
4156#else
4157 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4158#endif
fc8cd66c 4159
a0d0e21e 4160 /* Dig out information for optimizations. */
bbe252da 4161 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4162 pm->op_pmflags = RExC_flags;
a0ed51b3 4163 if (UTF)
bbe252da 4164 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4165 ri->regstclass = NULL;
830247a4 4166 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4167 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4168 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4169
1de06328
YO
4170 /* testing for BRANCH here tells us whether there is "must appear"
4171 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4172 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4173 I32 fake;
c5254dd6 4174 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4175 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4176 int stclass_flag;
07be1b83 4177 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4178
4179 first = scan;
c277df42 4180 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4181 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4182 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4183 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4184 /* for now we can't handle lookbehind IFMATCH*/
4185 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4186 (OP(first) == PLUS) ||
4187 (OP(first) == MINMOD) ||
653099ff 4188 /* An {n,m} with n>0 */
07be1b83
YO
4189 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4190 {
786e8c11 4191
a0d0e21e
LW
4192 if (OP(first) == PLUS)
4193 sawplus = 1;
4194 else
3dab1dad 4195 first += regarglen[OP(first)];
07be1b83
YO
4196 if (OP(first) == IFMATCH) {
4197 first = NEXTOPER(first);
4198 first += EXTRA_STEP_2ARGS;
7c167cea 4199 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4200 first = NEXTOPER(first);
a687059c
LW
4201 }
4202
a0d0e21e
LW
4203 /* Starting-point info. */
4204 again:
786e8c11 4205 DEBUG_PEEP("first:",first,0);
07be1b83 4206 /* Ignore EXACT as we deal with it later. */
3dab1dad 4207 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4208 if (OP(first) == EXACT)
6f207bd3 4209 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4210 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4211 ri->regstclass = first;
b3c9acc1 4212 }
07be1b83 4213#ifdef TRIE_STCLASS
786e8c11 4214 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4215 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4216 {
786e8c11 4217 regnode *trie_op;
07be1b83 4218 /* this can happen only on restudy */
786e8c11
YO
4219 if ( OP(first) == TRIE ) {
4220 struct regnode_1 *trieop;
4221 Newxz(trieop,1,struct regnode_1);
4222 StructCopy(first,trieop,struct regnode_1);
4223 trie_op=(regnode *)trieop;
4224 } else {
4225 struct regnode_charclass *trieop;
4226 Newxz(trieop,1,struct regnode_charclass);
4227 StructCopy(first,trieop,struct regnode_charclass);
4228 trie_op=(regnode *)trieop;
4229 }
1de06328 4230 OP(trie_op)+=2;
786e8c11 4231 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4232 ri->regstclass = trie_op;
07be1b83
YO
4233 }
4234#endif
bfed75c6 4235 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4236 ri->regstclass = first;
3dab1dad
YO
4237 else if (PL_regkind[OP(first)] == BOUND ||
4238 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4239 ri->regstclass = first;
3dab1dad 4240 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4241 r->extflags |= (OP(first) == MBOL
4242 ? RXf_ANCH_MBOL
cad2e5aa 4243 : (OP(first) == SBOL
bbe252da
YO
4244 ? RXf_ANCH_SBOL
4245 : RXf_ANCH_BOL));
a0d0e21e 4246 first = NEXTOPER(first);
774d564b 4247 goto again;
4248 }
4249 else if (OP(first) == GPOS) {
bbe252da 4250 r->extflags |= RXf_ANCH_GPOS;
774d564b 4251 first = NEXTOPER(first);
4252 goto again;
a0d0e21e 4253 }
cf2a2b69
YO
4254 else if ((!sawopen || !RExC_sawback) &&
4255 (OP(first) == STAR &&
3dab1dad 4256 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4257 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4258 {
4259 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4260 const int type =
4261 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4262 ? RXf_ANCH_MBOL
4263 : RXf_ANCH_SBOL;
4264 r->extflags |= type;
4265 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4266 first = NEXTOPER(first);
774d564b 4267 goto again;
a0d0e21e 4268 }
b81d288d 4269 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4270 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4271 /* x+ must match at the 1st pos of run of x's */
bbe252da 4272 r->intflags |= PREGf_SKIP;
a0d0e21e 4273
c277df42 4274 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4275#ifdef TRIE_STUDY_OPT
81714fb9 4276 DEBUG_PARSE_r(
be8e71aa
YO
4277 if (!restudied)
4278 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4279 (IV)(first - scan + 1))
4280 );
4281#else
81714fb9 4282 DEBUG_PARSE_r(
be8e71aa
YO
4283 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4284 (IV)(first - scan + 1))
4285 );
4286#endif
4287
4288
a0d0e21e
LW
4289 /*
4290 * If there's something expensive in the r.e., find the
4291 * longest literal string that must appear and make it the
4292 * regmust. Resolve ties in favor of later strings, since
4293 * the regstart check works with the beginning of the r.e.
4294 * and avoiding duplication strengthens checking. Not a
4295 * strong reason, but sufficient in the absence of others.
4296 * [Now we resolve ties in favor of the earlier string if
c277df42 4297 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4298 * earlier string may buy us something the later one won't.]
4299 */
de8c5301 4300
396482e1
GA
4301 data.longest_fixed = newSVpvs("");
4302 data.longest_float = newSVpvs("");
4303 data.last_found = newSVpvs("");
c277df42
IZ
4304 data.longest = &(data.longest_fixed);
4305 first = scan;
f8fc2ecf 4306 if (!ri->regstclass) {
830247a4 4307 cl_init(pRExC_state, &ch_class);
653099ff
GS
4308 data.start_class = &ch_class;
4309 stclass_flag = SCF_DO_STCLASS_AND;
4310 } else /* XXXX Check for BOUND? */
4311 stclass_flag = 0;
cb434fcc 4312 data.last_closep = &last_close;
de8c5301 4313
1de06328 4314 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4315 &data, -1, NULL, NULL,
4316 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4317
07be1b83 4318
786e8c11
YO
4319 CHECK_RESTUDY_GOTO;
4320
4321
830247a4 4322 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4323 && data.last_start_min == 0 && data.last_end > 0
830247a4 4324 && !RExC_seen_zerolen
bbe252da
YO
4325 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4326 r->extflags |= RXf_CHECK_ALL;
1de06328 4327 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4328 SvREFCNT_dec(data.last_found);
4329
1de06328
YO
4330 /* Note that code very similar to this but for anchored string
4331 follows immediately below, changes may need to be made to both.
4332 Be careful.
4333 */
a0ed51b3 4334 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4335 if (longest_float_length
c277df42
IZ
4336 || (data.flags & SF_FL_BEFORE_EOL
4337 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4338 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4339 {
1182767e 4340 I32 t,ml;
cf93c79d 4341
1de06328 4342 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4343 && data.offset_fixed == data.offset_float_min
4344 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4345 goto remove_float; /* As in (a)+. */
4346
1de06328
YO
4347 /* copy the information about the longest float from the reg_scan_data
4348 over to the program. */
33b8afdf
JH
4349 if (SvUTF8(data.longest_float)) {
4350 r->float_utf8 = data.longest_float;
c445ea15 4351 r->float_substr = NULL;
33b8afdf
JH
4352 } else {
4353 r->float_substr = data.longest_float;
c445ea15 4354 r->float_utf8 = NULL;
33b8afdf 4355 }
1de06328
YO
4356 /* float_end_shift is how many chars that must be matched that
4357 follow this item. We calculate it ahead of time as once the
4358 lookbehind offset is added in we lose the ability to correctly
4359 calculate it.*/
4360 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4361 : (I32)longest_float_length;
1de06328
YO
4362 r->float_end_shift = ml - data.offset_float_min
4363 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4364 + data.lookbehind_float;
4365 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4366 r->float_max_offset = data.offset_float_max;
1182767e 4367 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4368 r->float_max_offset -= data.lookbehind_float;
4369
cf93c79d
IZ
4370 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4371 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4372 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4373 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4374 }
4375 else {
aca2d497 4376 remove_float:
c445ea15 4377 r->float_substr = r->float_utf8 = NULL;
c277df42 4378 SvREFCNT_dec(data.longest_float);
c5254dd6 4379 longest_float_length = 0;
a0d0e21e 4380 }
c277df42 4381
1de06328
YO
4382 /* Note that code very similar to this but for floating string
4383 is immediately above, changes may need to be made to both.
4384 Be careful.
4385 */
a0ed51b3 4386 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4387 if (longest_fixed_length
c277df42
IZ
4388 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4389 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4390 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4391 {
1182767e 4392 I32 t,ml;
cf93c79d 4393
1de06328
YO
4394 /* copy the information about the longest fixed
4395 from the reg_scan_data over to the program. */
33b8afdf
JH
4396 if (SvUTF8(data.longest_fixed)) {
4397 r->anchored_utf8 = data.longest_fixed;
c445ea15 4398 r->anchored_substr = NULL;
33b8afdf
JH
4399 } else {
4400 r->anchored_substr = data.longest_fixed;
c445ea15 4401 r->anchored_utf8 = NULL;
33b8afdf 4402 }
1de06328
YO
4403 /* fixed_end_shift is how many chars that must be matched that
4404 follow this item. We calculate it ahead of time as once the
4405 lookbehind offset is added in we lose the ability to correctly
4406 calculate it.*/
4407 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4408 : (I32)longest_fixed_length;
1de06328
YO
4409 r->anchored_end_shift = ml - data.offset_fixed
4410 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4411 + data.lookbehind_fixed;
4412 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4413
cf93c79d
IZ
4414 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4415 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4416 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4417 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4418 }
4419 else {
c445ea15 4420 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4421 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4422 longest_fixed_length = 0;
a0d0e21e 4423 }
f8fc2ecf
YO
4424 if (ri->regstclass
4425 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4426 ri->regstclass = NULL;
33b8afdf
JH
4427 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4428 && stclass_flag
653099ff 4429 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4430 && !cl_is_anything(data.start_class))
4431 {
2eccd3b2 4432 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4433
f8fc2ecf 4434 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4435 struct regnode_charclass_class);
4436 StructCopy(data.start_class,
f8fc2ecf 4437 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4438 struct regnode_charclass_class);
f8fc2ecf 4439 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4440 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4441 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4442 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4443 PerlIO_printf(Perl_debug_log,
a0288114 4444 "synthetic stclass \"%s\".\n",
3f7c398e 4445 SvPVX_const(sv));});
653099ff 4446 }
c277df42
IZ
4447
4448 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4449 if (longest_fixed_length > longest_float_length) {
1de06328 4450 r->check_end_shift = r->anchored_end_shift;
c277df42 4451 r->check_substr = r->anchored_substr;
33b8afdf 4452 r->check_utf8 = r->anchored_utf8;
c277df42 4453 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4454 if (r->extflags & RXf_ANCH_SINGLE)
4455 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4456 }
4457 else {
1de06328 4458 r->check_end_shift = r->float_end_shift;
c277df42 4459 r->check_substr = r->float_substr;
33b8afdf 4460 r->check_utf8 = r->float_utf8;
1de06328
YO
4461 r->check_offset_min = r->float_min_offset;
4462 r->check_offset_max = r->float_max_offset;
a0d0e21e 4463 }
30382c73
IZ
4464 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4465 This should be changed ASAP! */
bbe252da
YO
4466 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4467 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4468 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4469 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4470 }
1de06328
YO
4471 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4472 if ( (STRLEN)minlen < longest_float_length )
4473 minlen= longest_float_length;
4474 if ( (STRLEN)minlen < longest_fixed_length )
4475 minlen= longest_fixed_length;
4476 */
a0ed51b3
LW
4477 }
4478 else {
c277df42
IZ
4479 /* Several toplevels. Best we can is to set minlen. */
4480 I32 fake;
653099ff 4481 struct regnode_charclass_class ch_class;
cb434fcc 4482 I32 last_close = 0;
c277df42 4483
5d458dd8 4484 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4485
f8fc2ecf 4486 scan = ri->program + 1;
830247a4 4487 cl_init(pRExC_state, &ch_class);
653099ff 4488 data.start_class = &ch_class;
cb434fcc 4489 data.last_closep = &last_close;
07be1b83 4490
de8c5301 4491
1de06328 4492 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4493 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4494
786e8c11 4495 CHECK_RESTUDY_GOTO;
07be1b83 4496
33b8afdf 4497 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4498 = r->float_substr = r->float_utf8 = NULL;
653099ff 4499 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4500 && !cl_is_anything(data.start_class))
4501 {
2eccd3b2 4502 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4503
f8fc2ecf 4504 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4505 struct regnode_charclass_class);
4506 StructCopy(data.start_class,
f8fc2ecf 4507 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4508 struct regnode_charclass_class);
f8fc2ecf 4509 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4510 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4511 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4512 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4513 PerlIO_printf(Perl_debug_log,
a0288114 4514 "synthetic stclass \"%s\".\n",
3f7c398e 4515 SvPVX_const(sv));});
653099ff 4516 }
a0d0e21e
LW
4517 }
4518
1de06328
YO
4519 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4520 the "real" pattern. */
cf9788e3
RGS
4521 DEBUG_OPTIMISE_r({
4522 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4523 (IV)minlen, (IV)r->minlen);
cf9788e3 4524 });
de8c5301 4525 r->minlenret = minlen;
1de06328
YO
4526 if (r->minlen < minlen)
4527 r->minlen = minlen;
4528
b81d288d 4529 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4530 r->extflags |= RXf_GPOS_SEEN;
830247a4 4531 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4532 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4533 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4534 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4535 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4536 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4537 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4538 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4539 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4540 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4541 if (RExC_paren_names)
4542 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4543 else
4544 r->paren_names = NULL;
4545
40d049e4
YO
4546 if (RExC_recurse_count) {
4547 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4548 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4549 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4550 }
4551 }
a02a5408
JC
4552 Newxz(r->startp, RExC_npar, I32);
4553 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4554 /* assume we don't need to swap parens around before we match */
4555
be8e71aa
YO
4556 DEBUG_DUMP_r({
4557 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4558 regdump(r);
4559 });
f8fc2ecf
YO
4560 DEBUG_OFFSETS_r(if (ri->offsets) {
4561 const U32 len = ri->offsets[0];
8e9a8a48
YO
4562 U32 i;
4563 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4564 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4565 for (i = 1; i <= len; i++) {
f8fc2ecf 4566 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4567 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4568 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4569 }
4570 PerlIO_printf(Perl_debug_log, "\n");
4571 });
a0d0e21e 4572 return(r);
f9f4320a 4573 END_BLOCK
a687059c
LW
4574}
4575
f9f4320a
YO
4576#undef CORE_ONLY_BLOCK
4577#undef END_BLOCK
4578#undef RE_ENGINE_PTR
3dab1dad 4579
9af228c6 4580#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4581SV*
4582Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4583{
4584 I32 parno = 0; /* no match */
4585 if (PL_curpm) {
4586 const REGEXP * const rx = PM_GETRE(PL_curpm);
4587 if (rx && rx->paren_names) {
4588 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4589 if (he_str) {
4590 IV i;
4591 SV* sv_dat=HeVAL(he_str);
4592 I32 *nums=(I32*)SvPVX(sv_dat);
4593 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4594 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4595 rx->endp[nums[i]] != -1)
4596 {
4597 parno = nums[i];
4598 break;
4599 }
4600 }
4601 }
4602 }
4603 }
4604 if ( !parno ) {
4605 return 0;
4606 } else {
4607 GV *gv_paren;
4608 SV *sv= sv_newmortal();
4609 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4610 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4611 return GvSVn(gv_paren);
4612 }
4613}
9af228c6 4614#endif
0a4db386 4615
894be9b7 4616/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4617 * If flags is REG_RSN_RETURN_NULL returns null.
4618 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4619 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4620 * to the parsed name as looked up in the RExC_paren_names hash.
4621 * If there is an error throws a vFAIL().. type exception.
894be9b7 4622 */
0a4db386
YO
4623
4624#define REG_RSN_RETURN_NULL 0
4625#define REG_RSN_RETURN_NAME 1
4626#define REG_RSN_RETURN_DATA 2
4627
894be9b7
YO
4628STATIC SV*
4629S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4630 char *name_start = RExC_parse;
0a4db386 4631 if ( UTF ) {
894be9b7 4632 STRLEN numlen;
0a4db386
YO
4633 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4634 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4635 {
4636 RExC_parse += numlen;
4637 }
4638 } else {
4639 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4640 RExC_parse++;
4641 }
0a4db386
YO
4642 if ( flags ) {
4643 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4644 (int)(RExC_parse - name_start)));
894be9b7 4645 if (UTF)
0a4db386
YO
4646 SvUTF8_on(sv_name);
4647 if ( flags == REG_RSN_RETURN_NAME)
4648 return sv_name;
4649 else if (flags==REG_RSN_RETURN_DATA) {
4650 HE *he_str = NULL;
4651 SV *sv_dat = NULL;
4652 if ( ! sv_name ) /* should not happen*/
4653 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4654 if (RExC_paren_names)
4655 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4656 if ( he_str )
4657 sv_dat = HeVAL(he_str);
4658 if ( ! sv_dat )
4659 vFAIL("Reference to nonexistent named group");
4660 return sv_dat;
4661 }
4662 else {
4663 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4664 }
4665 /* NOT REACHED */
894be9b7 4666 }
0a4db386 4667 return NULL;
894be9b7
YO
4668}
4669
3dab1dad
YO
4670#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4671 int rem=(int)(RExC_end - RExC_parse); \
4672 int cut; \
4673 int num; \
4674 int iscut=0; \
4675 if (rem>10) { \
4676 rem=10; \
4677 iscut=1; \
4678 } \
4679 cut=10-rem; \
4680 if (RExC_lastparse!=RExC_parse) \
4681 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4682 rem, RExC_parse, \
4683 cut + 4, \
4684 iscut ? "..." : "<" \
4685 ); \
4686 else \
4687 PerlIO_printf(Perl_debug_log,"%16s",""); \
4688 \
4689 if (SIZE_ONLY) \
4690 num=RExC_size; \
4691 else \
4692 num=REG_NODE_NUM(RExC_emit); \
4693 if (RExC_lastnum!=num) \
0a4db386 4694 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4695 else \
0a4db386 4696 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4697 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4698 (int)((depth*2)), "", \
3dab1dad
YO
4699 (funcname) \
4700 ); \
4701 RExC_lastnum=num; \
4702 RExC_lastparse=RExC_parse; \
4703})
4704
07be1b83
YO
4705
4706
3dab1dad
YO
4707#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4708 DEBUG_PARSE_MSG((funcname)); \
4709 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4710})
6bda09f9
YO
4711#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4712 DEBUG_PARSE_MSG((funcname)); \
4713 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4714})
a687059c
LW
4715/*
4716 - reg - regular expression, i.e. main body or parenthesized thing
4717 *
4718 * Caller must absorb opening parenthesis.
4719 *
4720 * Combining parenthesis handling with the base level of regular expression
4721 * is a trifle forced, but the need to tie the tails of the branches to what
4722 * follows makes it hard to avoid.
4723 */
07be1b83
YO
4724#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4725#ifdef DEBUGGING
4726#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4727#else
4728#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4729#endif
3dab1dad 4730
e2e6a0f1
YO
4731/* this idea is borrowed from STR_WITH_LEN in handy.h */
4732#define CHECK_WORD(s,v,l) \
4733 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4734
76e3520e 4735STATIC regnode *
3dab1dad 4736S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4737 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4738{
27da23d5 4739 dVAR;
c277df42
IZ
4740 register regnode *ret; /* Will be the head of the group. */
4741 register regnode *br;
4742 register regnode *lastbr;
cbbf8932 4743 register regnode *ender = NULL;
a0d0e21e 4744 register I32 parno = 0;
cbbf8932
AL
4745 I32 flags;
4746 const I32 oregflags = RExC_flags;
6136c704
AL
4747 bool have_branch = 0;
4748 bool is_open = 0;
9d1d55b5
JP
4749
4750 /* for (?g), (?gc), and (?o) warnings; warning
4751 about (?c) will warn about (?g) -- japhy */
4752
6136c704
AL
4753#define WASTED_O 0x01
4754#define WASTED_G 0x02
4755#define WASTED_C 0x04
4756#define WASTED_GC (0x02|0x04)
cbbf8932 4757 I32 wastedflags = 0x00;
9d1d55b5 4758
fac92740 4759 char * parse_start = RExC_parse; /* MJD */
a28509cc 4760 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4761
3dab1dad
YO
4762 GET_RE_DEBUG_FLAGS_DECL;
4763 DEBUG_PARSE("reg ");
4764
4765
821b33a5 4766 *flagp = 0; /* Tentatively. */
a0d0e21e 4767
9d1d55b5 4768
a0d0e21e
LW
4769 /* Make an OPEN node, if parenthesized. */
4770 if (paren) {
e2e6a0f1
YO
4771 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4772 char *start_verb = RExC_parse;
4773 STRLEN verb_len = 0;
4774 char *start_arg = NULL;
4775 unsigned char op = 0;
4776 int argok = 1;
4777 int internal_argval = 0; /* internal_argval is only useful if !argok */
4778 while ( *RExC_parse && *RExC_parse != ')' ) {
4779 if ( *RExC_parse == ':' ) {
4780 start_arg = RExC_parse + 1;
4781 break;
4782 }
4783 RExC_parse++;
4784 }
4785 ++start_verb;
4786 verb_len = RExC_parse - start_verb;
4787 if ( start_arg ) {
4788 RExC_parse++;
4789 while ( *RExC_parse && *RExC_parse != ')' )
4790 RExC_parse++;
4791 if ( *RExC_parse != ')' )
4792 vFAIL("Unterminated verb pattern argument");
4793 if ( RExC_parse == start_arg )
4794 start_arg = NULL;
4795 } else {
4796 if ( *RExC_parse != ')' )
4797 vFAIL("Unterminated verb pattern");
4798 }
5d458dd8 4799
e2e6a0f1
YO
4800 switch ( *start_verb ) {
4801 case 'A': /* (*ACCEPT) */
4802 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4803 op = ACCEPT;
4804 internal_argval = RExC_nestroot;
4805 }
4806 break;
4807 case 'C': /* (*COMMIT) */
4808 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4809 op = COMMIT;
e2e6a0f1
YO
4810 break;
4811 case 'F': /* (*FAIL) */
4812 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4813 op = OPFAIL;
4814 argok = 0;
4815 }
4816 break;
5d458dd8
YO
4817 case ':': /* (*:NAME) */
4818 case 'M': /* (*MARK:NAME) */
4819 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4820 op = MARKPOINT;
5d458dd8
YO
4821 argok = -1;
4822 }
4823 break;
4824 case 'P': /* (*PRUNE) */
4825 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4826 op = PRUNE;
e2e6a0f1 4827 break;
5d458dd8
YO
4828 case 'S': /* (*SKIP) */
4829 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4830 op = SKIP;
4831 break;
4832 case 'T': /* (*THEN) */
4833 /* [19:06] <TimToady> :: is then */
4834 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4835 op = CUTGROUP;
4836 RExC_seen |= REG_SEEN_CUTGROUP;
4837 }
e2e6a0f1
YO
4838 break;
4839 }
4840 if ( ! op ) {
4841 RExC_parse++;
4842 vFAIL3("Unknown verb pattern '%.*s'",
4843 verb_len, start_verb);
4844 }
4845 if ( argok ) {
4846 if ( start_arg && internal_argval ) {
4847 vFAIL3("Verb pattern '%.*s' may not have an argument",
4848 verb_len, start_verb);
4849 } else if ( argok < 0 && !start_arg ) {
4850 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4851 verb_len, start_verb);
4852 } else {
4853 ret = reganode(pRExC_state, op, internal_argval);
4854 if ( ! internal_argval && ! SIZE_ONLY ) {
4855 if (start_arg) {
4856 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4857 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4858 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4859 ret->flags = 0;
4860 } else {
4861 ret->flags = 1;
4862 }
4863 }
4864 }
4865 if (!internal_argval)
4866 RExC_seen |= REG_SEEN_VERBARG;
4867 } else if ( start_arg ) {
4868 vFAIL3("Verb pattern '%.*s' may not have an argument",
4869 verb_len, start_verb);
4870 } else {
4871 ret = reg_node(pRExC_state, op);
4872 }
4873 nextchar(pRExC_state);
4874 return ret;
4875 } else
fac92740 4876 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4877 U32 posflags = 0, negflags = 0;
4878 U32 *flagsp = &posflags;
6136c704 4879 bool is_logical = 0;
a28509cc 4880 const char * const seqstart = RExC_parse;
ca9dfc88 4881
830247a4
IZ
4882 RExC_parse++;
4883 paren = *RExC_parse++;
c277df42 4884 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4885 switch (paren) {
894be9b7 4886
fac92740 4887 case '<': /* (?<...) */
b81d288d 4888 if (*RExC_parse == '!')
c277df42 4889 paren = ',';
0a4db386
YO
4890 else if (*RExC_parse != '=')
4891 { /* (?<...>) */
81714fb9 4892 char *name_start;
894be9b7 4893 SV *svname;
81714fb9
YO
4894 paren= '>';
4895 case '\'': /* (?'...') */
4896 name_start= RExC_parse;
0a4db386
YO
4897 svname = reg_scan_name(pRExC_state,
4898 SIZE_ONLY ? /* reverse test from the others */
4899 REG_RSN_RETURN_NAME :
4900 REG_RSN_RETURN_NULL);
81714fb9
YO
4901 if (RExC_parse == name_start)
4902 goto unknown;
4903 if (*RExC_parse != paren)
4904 vFAIL2("Sequence (?%c... not terminated",
4905 paren=='>' ? '<' : paren);
4906 if (SIZE_ONLY) {
e62cc96a
YO
4907 HE *he_str;
4908 SV *sv_dat = NULL;
894be9b7
YO
4909 if (!svname) /* shouldnt happen */
4910 Perl_croak(aTHX_
4911 "panic: reg_scan_name returned NULL");
81714fb9
YO
4912 if (!RExC_paren_names) {
4913 RExC_paren_names= newHV();
4914 sv_2mortal((SV*)RExC_paren_names);
4915 }
4916 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4917 if ( he_str )
81714fb9 4918 sv_dat = HeVAL(he_str);
e62cc96a 4919 if ( ! sv_dat ) {
81714fb9 4920 /* croak baby croak */
e62cc96a
YO
4921 Perl_croak(aTHX_
4922 "panic: paren_name hash element allocation failed");
4923 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4924 IV count=SvIV(sv_dat);
4925 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4926 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4927 pv[count]=RExC_npar;
4928 SvIVX(sv_dat)++;
4929 } else {
4930 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4931 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4932 SvIOK_on(sv_dat);
4933 SvIVX(sv_dat)= 1;
e62cc96a
YO
4934 }
4935
81714fb9
YO
4936 /*sv_dump(sv_dat);*/
4937 }
4938 nextchar(pRExC_state);
4939 paren = 1;
4940 goto capturing_parens;
4941 }
4942 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4943 RExC_parse++;
fac92740
MJD
4944 case '=': /* (?=...) */
4945 case '!': /* (?!...) */
830247a4 4946 RExC_seen_zerolen++;
e2e6a0f1
YO
4947 if (*RExC_parse == ')') {
4948 ret=reg_node(pRExC_state, OPFAIL);
4949 nextchar(pRExC_state);
4950 return ret;
4951 }
fac92740
MJD
4952 case ':': /* (?:...) */
4953 case '>': /* (?>...) */
a0d0e21e 4954 break;
fac92740
MJD
4955 case '$': /* (?$...) */
4956 case '@': /* (?@...) */
8615cb43 4957 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4958 break;
fac92740 4959 case '#': /* (?#...) */
830247a4
IZ
4960 while (*RExC_parse && *RExC_parse != ')')
4961 RExC_parse++;
4962 if (*RExC_parse != ')')
c277df42 4963 FAIL("Sequence (?#... not terminated");
830247a4 4964 nextchar(pRExC_state);
a0d0e21e
LW
4965 *flagp = TRYAGAIN;
4966 return NULL;
894be9b7
YO
4967 case '0' : /* (?0) */
4968 case 'R' : /* (?R) */
4969 if (*RExC_parse != ')')
6bda09f9 4970 FAIL("Sequence (?R) not terminated");
1a147d38 4971 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4972 nextchar(pRExC_state);
4973 return ret;
4974 /*notreached*/
894be9b7
YO
4975 { /* named and numeric backreferences */
4976 I32 num;
4977 char * parse_start;
4978 case '&': /* (?&NAME) */
4979 parse_start = RExC_parse - 1;
4980 {
0a4db386
YO
4981 SV *sv_dat = reg_scan_name(pRExC_state,
4982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4983 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4984 }
4985 goto gen_recurse_regop;
4986 /* NOT REACHED */
542fa716
YO
4987 case '+':
4988 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4989 RExC_parse++;
4990 vFAIL("Illegal pattern");
4991 }
4992 goto parse_recursion;
4993 /* NOT REACHED*/
4994 case '-': /* (?-1) */
4995 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4996 RExC_parse--; /* rewind to let it be handled later */
4997 goto parse_flags;
4998 }
4999 /*FALLTHROUGH */
6bda09f9
YO
5000 case '1': case '2': case '3': case '4': /* (?1) */
5001 case '5': case '6': case '7': case '8': case '9':
5002 RExC_parse--;
542fa716 5003 parse_recursion:
894be9b7
YO
5004 num = atoi(RExC_parse);
5005 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5006 if (*RExC_parse == '-')
5007 RExC_parse++;
6bda09f9
YO
5008 while (isDIGIT(*RExC_parse))
5009 RExC_parse++;
5010 if (*RExC_parse!=')')
5011 vFAIL("Expecting close bracket");
894be9b7
YO
5012
5013 gen_recurse_regop:
542fa716
YO
5014 if ( paren == '-' ) {
5015 /*
5016 Diagram of capture buffer numbering.
5017 Top line is the normal capture buffer numbers
5018 Botton line is the negative indexing as from
5019 the X (the (?-2))
5020
5021 + 1 2 3 4 5 X 6 7
5022 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5023 - 5 4 3 2 1 X x x
5024
5025 */
5026 num = RExC_npar + num;
5027 if (num < 1) {
5028 RExC_parse++;
5029 vFAIL("Reference to nonexistent group");
5030 }
5031 } else if ( paren == '+' ) {
5032 num = RExC_npar + num - 1;
5033 }
5034
1a147d38 5035 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5036 if (!SIZE_ONLY) {
5037 if (num > (I32)RExC_rx->nparens) {
5038 RExC_parse++;
5039 vFAIL("Reference to nonexistent group");
5040 }
40d049e4 5041 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5042 RExC_emit++;
226de585 5043 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5044 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5045 } else {
6bda09f9 5046 RExC_size++;
6bda09f9 5047 }
0a4db386 5048 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5049 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5050 Set_Node_Offset(ret, parse_start); /* MJD */
5051
6bda09f9
YO
5052 nextchar(pRExC_state);
5053 return ret;
894be9b7
YO
5054 } /* named and numeric backreferences */
5055 /* NOT REACHED */
5056
fac92740 5057 case 'p': /* (?p...) */
9014280d 5058 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5059 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5060 /* FALL THROUGH*/
fac92740 5061 case '?': /* (??...) */
6136c704 5062 is_logical = 1;
438a3801
YST
5063 if (*RExC_parse != '{')
5064 goto unknown;
830247a4 5065 paren = *RExC_parse++;
0f5d15d6 5066 /* FALL THROUGH */
fac92740 5067 case '{': /* (?{...}) */
c277df42 5068 {
2eccd3b2
NC
5069 I32 count = 1;
5070 U32 n = 0;
c277df42 5071 char c;
830247a4 5072 char *s = RExC_parse;
c277df42 5073
830247a4
IZ
5074 RExC_seen_zerolen++;
5075 RExC_seen |= REG_SEEN_EVAL;
5076 while (count && (c = *RExC_parse)) {
6136c704
AL
5077 if (c == '\\') {
5078 if (RExC_parse[1])
5079 RExC_parse++;
5080 }
b81d288d 5081 else if (c == '{')
c277df42 5082 count++;
b81d288d 5083 else if (c == '}')
c277df42 5084 count--;
830247a4 5085 RExC_parse++;
c277df42 5086 }
6136c704 5087 if (*RExC_parse != ')') {
b81d288d 5088 RExC_parse = s;
b45f050a
JF
5089 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5090 }
c277df42 5091 if (!SIZE_ONLY) {
f3548bdc 5092 PAD *pad;
6136c704
AL
5093 OP_4tree *sop, *rop;
5094 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5095
569233ed
SB
5096 ENTER;
5097 Perl_save_re_context(aTHX);
f3548bdc 5098 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5099 sop->op_private |= OPpREFCOUNTED;
5100 /* re_dup will OpREFCNT_inc */
5101 OpREFCNT_set(sop, 1);
569233ed 5102 LEAVE;
c277df42 5103
830247a4 5104 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5105 RExC_rxi->data->data[n] = (void*)rop;
5106 RExC_rxi->data->data[n+1] = (void*)sop;
5107 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5108 SvREFCNT_dec(sv);
a0ed51b3 5109 }
e24b16f9 5110 else { /* First pass */
830247a4 5111 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5112 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5113 /* No compiled RE interpolated, has runtime
5114 components ===> unsafe. */
5115 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5116 if (PL_tainting && PL_tainted)
cc6b7395 5117 FAIL("Eval-group in insecure regular expression");
54df2634 5118#if PERL_VERSION > 8
923e4eb5 5119 if (IN_PERL_COMPILETIME)
b5c19bd7 5120 PL_cv_has_eval = 1;
54df2634 5121#endif
c277df42 5122 }
b5c19bd7 5123
830247a4 5124 nextchar(pRExC_state);
6136c704 5125 if (is_logical) {
830247a4 5126 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5127 if (!SIZE_ONLY)
5128 ret->flags = 2;
3dab1dad 5129 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5130 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5131 return ret;
5132 }
ccb2c380
MP
5133 ret = reganode(pRExC_state, EVAL, n);
5134 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5135 Set_Node_Offset(ret, parse_start);
5136 return ret;
c277df42 5137 }
fac92740 5138 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5139 {
0a4db386 5140 int is_define= 0;
fac92740 5141 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5142 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5143 || RExC_parse[1] == '<'
830247a4 5144 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5145 I32 flag;
5146
830247a4 5147 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5148 if (!SIZE_ONLY)
5149 ret->flags = 1;
3dab1dad 5150 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5151 goto insert_if;
b81d288d 5152 }
a0ed51b3 5153 }
0a4db386
YO
5154 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5155 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5156 {
5157 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5158 char *name_start= RExC_parse++;
2eccd3b2 5159 U32 num = 0;
0a4db386
YO
5160 SV *sv_dat=reg_scan_name(pRExC_state,
5161 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5162 if (RExC_parse == name_start || *RExC_parse != ch)
5163 vFAIL2("Sequence (?(%c... not terminated",
5164 (ch == '>' ? '<' : ch));
5165 RExC_parse++;
5166 if (!SIZE_ONLY) {
5167 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5168 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5169 SvREFCNT_inc(sv_dat);
5170 }
5171 ret = reganode(pRExC_state,NGROUPP,num);
5172 goto insert_if_check_paren;
5173 }
5174 else if (RExC_parse[0] == 'D' &&
5175 RExC_parse[1] == 'E' &&
5176 RExC_parse[2] == 'F' &&
5177 RExC_parse[3] == 'I' &&
5178 RExC_parse[4] == 'N' &&
5179 RExC_parse[5] == 'E')
5180 {
5181 ret = reganode(pRExC_state,DEFINEP,0);
5182 RExC_parse +=6 ;
5183 is_define = 1;
5184 goto insert_if_check_paren;
5185 }
5186 else if (RExC_parse[0] == 'R') {
5187 RExC_parse++;
5188 parno = 0;
5189 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5190 parno = atoi(RExC_parse++);
5191 while (isDIGIT(*RExC_parse))
5192 RExC_parse++;
5193 } else if (RExC_parse[0] == '&') {
5194 SV *sv_dat;
5195 RExC_parse++;
5196 sv_dat = reg_scan_name(pRExC_state,
5197 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5198 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5199 }
1a147d38 5200 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5201 goto insert_if_check_paren;
5202 }
830247a4 5203 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5204 /* (?(1)...) */
6136c704 5205 char c;
830247a4 5206 parno = atoi(RExC_parse++);
c277df42 5207
830247a4
IZ
5208 while (isDIGIT(*RExC_parse))
5209 RExC_parse++;
fac92740 5210 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5211
0a4db386 5212 insert_if_check_paren:
830247a4 5213 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5214 vFAIL("Switch condition not recognized");
c277df42 5215 insert_if:
3dab1dad
YO
5216 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5217 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5218 if (br == NULL)
830247a4 5219 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5220 else
3dab1dad 5221 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5222 c = *nextchar(pRExC_state);
d1b80229
IZ
5223 if (flags&HASWIDTH)
5224 *flagp |= HASWIDTH;
c277df42 5225 if (c == '|') {
0a4db386
YO
5226 if (is_define)
5227 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5228 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5229 regbranch(pRExC_state, &flags, 1,depth+1);
5230 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5231 if (flags&HASWIDTH)
5232 *flagp |= HASWIDTH;
830247a4 5233 c = *nextchar(pRExC_state);
a0ed51b3
LW
5234 }
5235 else
c277df42
IZ
5236 lastbr = NULL;
5237 if (c != ')')
8615cb43 5238 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5239 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5240 REGTAIL(pRExC_state, br, ender);
c277df42 5241 if (lastbr) {
3dab1dad
YO
5242 REGTAIL(pRExC_state, lastbr, ender);
5243 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5244 }
5245 else
3dab1dad 5246 REGTAIL(pRExC_state, ret, ender);
c277df42 5247 return ret;
a0ed51b3
LW
5248 }
5249 else {
830247a4 5250 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5251 }
5252 }
1b1626e4 5253 case 0:
830247a4 5254 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5255 vFAIL("Sequence (? incomplete");
1b1626e4 5256 break;
a0d0e21e 5257 default:
830247a4 5258 --RExC_parse;
fac92740 5259 parse_flags: /* (?i) */
830247a4 5260 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5261 /* (?g), (?gc) and (?o) are useless here
5262 and must be globally applied -- japhy */
5263
5264 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5265 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5266 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5267 if (! (wastedflags & wflagbit) ) {
5268 wastedflags |= wflagbit;
5269 vWARN5(
5270 RExC_parse + 1,
5271 "Useless (%s%c) - %suse /%c modifier",
5272 flagsp == &negflags ? "?-" : "?",
5273 *RExC_parse,
5274 flagsp == &negflags ? "don't " : "",
5275 *RExC_parse
5276 );
5277 }
5278 }
5279 }
5280 else if (*RExC_parse == 'c') {
5281 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5282 if (! (wastedflags & WASTED_C) ) {
5283 wastedflags |= WASTED_GC;
9d1d55b5
JP
5284 vWARN3(
5285 RExC_parse + 1,
5286 "Useless (%sc) - %suse /gc modifier",
5287 flagsp == &negflags ? "?-" : "?",
5288 flagsp == &negflags ? "don't " : ""
5289 );
5290 }
5291 }
5292 }
5293 else { pmflag(flagsp, *RExC_parse); }
5294
830247a4 5295 ++RExC_parse;
ca9dfc88 5296 }
830247a4 5297 if (*RExC_parse == '-') {
ca9dfc88 5298 flagsp = &negflags;
9d1d55b5 5299 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5300 ++RExC_parse;
ca9dfc88 5301 goto parse_flags;
48c036b1 5302 }
e2509266
JH
5303 RExC_flags |= posflags;
5304 RExC_flags &= ~negflags;
830247a4
IZ
5305 if (*RExC_parse == ':') {
5306 RExC_parse++;
ca9dfc88
IZ
5307 paren = ':';
5308 break;
5309 }
c277df42 5310 unknown:
830247a4
IZ
5311 if (*RExC_parse != ')') {
5312 RExC_parse++;
5313 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5314 }
830247a4 5315 nextchar(pRExC_state);
a0d0e21e
LW
5316 *flagp = TRYAGAIN;
5317 return NULL;
5318 }
5319 }
fac92740 5320 else { /* (...) */
81714fb9 5321 capturing_parens:
830247a4
IZ
5322 parno = RExC_npar;
5323 RExC_npar++;
e2e6a0f1 5324
830247a4 5325 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5326 if (!SIZE_ONLY ){
5327 if (!RExC_nestroot)
5328 RExC_nestroot = parno;
5329 if (RExC_seen & REG_SEEN_RECURSE) {
5330 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5331 "Setting open paren #%"IVdf" to %d\n",
5332 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5333 RExC_open_parens[parno-1]= ret;
5334 }
6bda09f9 5335 }
fac92740
MJD
5336 Set_Node_Length(ret, 1); /* MJD */
5337 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5338 is_open = 1;
a0d0e21e 5339 }
a0ed51b3 5340 }
fac92740 5341 else /* ! paren */
a0d0e21e
LW
5342 ret = NULL;
5343
5344 /* Pick up the branches, linking them together. */
fac92740 5345 parse_start = RExC_parse; /* MJD */
3dab1dad 5346 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5347 /* branch_len = (paren != 0); */
2af232bd 5348
a0d0e21e
LW
5349 if (br == NULL)
5350 return(NULL);
830247a4
IZ
5351 if (*RExC_parse == '|') {
5352 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5353 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5354 }
fac92740 5355 else { /* MJD */
6bda09f9 5356 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5357 Set_Node_Length(br, paren != 0);
5358 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5359 }
c277df42
IZ
5360 have_branch = 1;
5361 if (SIZE_ONLY)
830247a4 5362 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5363 }
5364 else if (paren == ':') {
c277df42
IZ
5365 *flagp |= flags&SIMPLE;
5366 }
6136c704 5367 if (is_open) { /* Starts with OPEN. */
3dab1dad 5368 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5369 }
5370 else if (paren != '?') /* Not Conditional */
a0d0e21e 5371 ret = br;
32a0ca98 5372 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5373 lastbr = br;
830247a4
IZ
5374 while (*RExC_parse == '|') {
5375 if (!SIZE_ONLY && RExC_extralen) {
5376 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5377 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5378 }
5379 if (SIZE_ONLY)
830247a4
IZ
5380 RExC_extralen += 2; /* Account for LONGJMP. */
5381 nextchar(pRExC_state);
3dab1dad 5382 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5383
a687059c 5384 if (br == NULL)
a0d0e21e 5385 return(NULL);
3dab1dad 5386 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5387 lastbr = br;
821b33a5
IZ
5388 if (flags&HASWIDTH)
5389 *flagp |= HASWIDTH;
a687059c 5390 *flagp |= flags&SPSTART;
a0d0e21e
LW
5391 }
5392
c277df42
IZ
5393 if (have_branch || paren != ':') {
5394 /* Make a closing node, and hook it on the end. */
5395 switch (paren) {
5396 case ':':
830247a4 5397 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5398 break;
5399 case 1:
c74340f9 5400 RExC_cpar++;
830247a4 5401 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5402 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5403 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5404 "Setting close paren #%"IVdf" to %d\n",
5405 (IV)parno, REG_NODE_NUM(ender)));
5406 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5407 if (RExC_nestroot == parno)
5408 RExC_nestroot = 0;
40d049e4 5409 }
fac92740
MJD
5410 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5411 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5412 break;
5413 case '<':
c277df42
IZ
5414 case ',':
5415 case '=':
5416 case '!':
c277df42 5417 *flagp &= ~HASWIDTH;
821b33a5
IZ
5418 /* FALL THROUGH */
5419 case '>':
830247a4 5420 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5421 break;
5422 case 0:
830247a4 5423 ender = reg_node(pRExC_state, END);
40d049e4
YO
5424 if (!SIZE_ONLY) {
5425 assert(!RExC_opend); /* there can only be one! */
5426 RExC_opend = ender;
5427 }
c277df42
IZ
5428 break;
5429 }
eaf3ca90 5430 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5431
9674d46a 5432 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5433 if (depth==1)
5434 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5435
c277df42 5436 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5437 for (br = ret; br; br = regnext(br)) {
5438 const U8 op = PL_regkind[OP(br)];
5439 if (op == BRANCH) {
07be1b83 5440 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5441 }
5442 else if (op == BRANCHJ) {
07be1b83 5443 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5444 }
c277df42
IZ
5445 }
5446 }
a0d0e21e 5447 }
c277df42
IZ
5448
5449 {
e1ec3a88
AL
5450 const char *p;
5451 static const char parens[] = "=!<,>";
c277df42
IZ
5452
5453 if (paren && (p = strchr(parens, paren))) {
eb160463 5454 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5455 int flag = (p - parens) > 1;
5456
5457 if (paren == '>')
5458 node = SUSPEND, flag = 0;
6bda09f9 5459 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5460 Set_Node_Cur_Length(ret);
5461 Set_Node_Offset(ret, parse_start + 1);
c277df42 5462 ret->flags = flag;
07be1b83 5463 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5464 }
a0d0e21e
LW
5465 }
5466
5467 /* Check for proper termination. */
ce3e6498 5468 if (paren) {
e2509266 5469 RExC_flags = oregflags;
830247a4
IZ
5470 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5471 RExC_parse = oregcomp_parse;
380a0633 5472 vFAIL("Unmatched (");
ce3e6498 5473 }
a0ed51b3 5474 }
830247a4
IZ
5475 else if (!paren && RExC_parse < RExC_end) {
5476 if (*RExC_parse == ')') {
5477 RExC_parse++;
380a0633 5478 vFAIL("Unmatched )");
a0ed51b3
LW
5479 }
5480 else
b45f050a 5481 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5482 /* NOTREACHED */
5483 }
a687059c 5484
a0d0e21e 5485 return(ret);
a687059c
LW
5486}
5487
5488/*
5489 - regbranch - one alternative of an | operator
5490 *
5491 * Implements the concatenation operator.
5492 */
76e3520e 5493STATIC regnode *
3dab1dad 5494S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5495{
97aff369 5496 dVAR;
c277df42
IZ
5497 register regnode *ret;
5498 register regnode *chain = NULL;
5499 register regnode *latest;
5500 I32 flags = 0, c = 0;
3dab1dad
YO
5501 GET_RE_DEBUG_FLAGS_DECL;
5502 DEBUG_PARSE("brnc");
b81d288d 5503 if (first)
c277df42
IZ
5504 ret = NULL;
5505 else {
b81d288d 5506 if (!SIZE_ONLY && RExC_extralen)
830247a4 5507 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5508 else {
830247a4 5509 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5510 Set_Node_Length(ret, 1);
5511 }
c277df42
IZ
5512 }
5513
b81d288d 5514 if (!first && SIZE_ONLY)
830247a4 5515 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5516
c277df42 5517 *flagp = WORST; /* Tentatively. */
a0d0e21e 5518
830247a4
IZ
5519 RExC_parse--;
5520 nextchar(pRExC_state);
5521 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5522 flags &= ~TRYAGAIN;
3dab1dad 5523 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5524 if (latest == NULL) {
5525 if (flags & TRYAGAIN)
5526 continue;
5527 return(NULL);
a0ed51b3
LW
5528 }
5529 else if (ret == NULL)
c277df42 5530 ret = latest;
a0d0e21e 5531 *flagp |= flags&HASWIDTH;
c277df42 5532 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5533 *flagp |= flags&SPSTART;
5534 else {
830247a4 5535 RExC_naughty++;
3dab1dad 5536 REGTAIL(pRExC_state, chain, latest);
a687059c 5537 }
a0d0e21e 5538 chain = latest;
c277df42
IZ
5539 c++;
5540 }
5541 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5542 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5543 if (ret == NULL)
5544 ret = chain;
5545 }
5546 if (c == 1) {
5547 *flagp |= flags&SIMPLE;
a0d0e21e 5548 }
a687059c 5549
d4c19fe8 5550 return ret;
a687059c
LW
5551}
5552
5553/*
5554 - regpiece - something followed by possible [*+?]
5555 *
5556 * Note that the branching code sequences used for ? and the general cases
5557 * of * and + are somewhat optimized: they use the same NOTHING node as
5558 * both the endmarker for their branch list and the body of the last branch.
5559 * It might seem that this node could be dispensed with entirely, but the
5560 * endmarker role is not redundant.
5561 */
76e3520e 5562STATIC regnode *
3dab1dad 5563S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5564{
97aff369 5565 dVAR;
c277df42 5566 register regnode *ret;
a0d0e21e
LW
5567 register char op;
5568 register char *next;
5569 I32 flags;
1df70142 5570 const char * const origparse = RExC_parse;
a0d0e21e 5571 I32 min;
c277df42 5572 I32 max = REG_INFTY;
fac92740 5573 char *parse_start;
10edeb5d 5574 const char *maxpos = NULL;
3dab1dad
YO
5575 GET_RE_DEBUG_FLAGS_DECL;
5576 DEBUG_PARSE("piec");
a0d0e21e 5577
3dab1dad 5578 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5579 if (ret == NULL) {
5580 if (flags & TRYAGAIN)
5581 *flagp |= TRYAGAIN;
5582 return(NULL);
5583 }
5584
830247a4 5585 op = *RExC_parse;
a0d0e21e 5586
830247a4 5587 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5588 maxpos = NULL;
fac92740 5589 parse_start = RExC_parse; /* MJD */
830247a4 5590 next = RExC_parse + 1;
a0d0e21e
LW
5591 while (isDIGIT(*next) || *next == ',') {
5592 if (*next == ',') {
5593 if (maxpos)
5594 break;
5595 else
5596 maxpos = next;
a687059c 5597 }
a0d0e21e
LW
5598 next++;
5599 }
5600 if (*next == '}') { /* got one */
5601 if (!maxpos)
5602 maxpos = next;
830247a4
IZ
5603 RExC_parse++;
5604 min = atoi(RExC_parse);
a0d0e21e
LW
5605 if (*maxpos == ',')
5606 maxpos++;
5607 else
830247a4 5608 maxpos = RExC_parse;
a0d0e21e
LW
5609 max = atoi(maxpos);
5610 if (!max && *maxpos != '0')
c277df42
IZ
5611 max = REG_INFTY; /* meaning "infinity" */
5612 else if (max >= REG_INFTY)
8615cb43 5613 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5614 RExC_parse = next;
5615 nextchar(pRExC_state);
a0d0e21e
LW
5616
5617 do_curly:
5618 if ((flags&SIMPLE)) {
830247a4 5619 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5620 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5621 Set_Node_Offset(ret, parse_start+1); /* MJD */
5622 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5623 }
5624 else {
3dab1dad 5625 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5626
5627 w->flags = 0;
3dab1dad 5628 REGTAIL(pRExC_state, ret, w);
830247a4 5629 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5630 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5631 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5632 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5633 }
6bda09f9 5634 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5635 /* MJD hk */
5636 Set_Node_Offset(ret, parse_start+1);
2af232bd 5637 Set_Node_Length(ret,
fac92740 5638 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5639
830247a4 5640 if (!SIZE_ONLY && RExC_extralen)
c277df42 5641 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5642 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5643 if (SIZE_ONLY)
830247a4
IZ
5644 RExC_whilem_seen++, RExC_extralen += 3;
5645 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5646 }
c277df42 5647 ret->flags = 0;
a0d0e21e
LW
5648
5649 if (min > 0)
821b33a5
IZ
5650 *flagp = WORST;
5651 if (max > 0)
5652 *flagp |= HASWIDTH;
a0d0e21e 5653 if (max && max < min)
8615cb43 5654 vFAIL("Can't do {n,m} with n > m");
c277df42 5655 if (!SIZE_ONLY) {
eb160463
GS
5656 ARG1_SET(ret, (U16)min);
5657 ARG2_SET(ret, (U16)max);
a687059c 5658 }
a687059c 5659
a0d0e21e 5660 goto nest_check;
a687059c 5661 }
a0d0e21e 5662 }
a687059c 5663
a0d0e21e
LW
5664 if (!ISMULT1(op)) {
5665 *flagp = flags;
a687059c 5666 return(ret);
a0d0e21e 5667 }
bb20fd44 5668
c277df42 5669#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5670
5671 /* if this is reinstated, don't forget to put this back into perldiag:
5672
5673 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5674
5675 (F) The part of the regexp subject to either the * or + quantifier
5676 could match an empty string. The {#} shows in the regular
5677 expression about where the problem was discovered.
5678
5679 */
5680
bb20fd44 5681 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5682 vFAIL("Regexp *+ operand could be empty");
b81d288d 5683#endif
bb20fd44 5684
fac92740 5685 parse_start = RExC_parse;
830247a4 5686 nextchar(pRExC_state);
a0d0e21e 5687
821b33a5 5688 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5689
5690 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5691 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5692 ret->flags = 0;
830247a4 5693 RExC_naughty += 4;
a0d0e21e
LW
5694 }
5695 else if (op == '*') {
5696 min = 0;
5697 goto do_curly;
a0ed51b3
LW
5698 }
5699 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5700 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5701 ret->flags = 0;
830247a4 5702 RExC_naughty += 3;
a0d0e21e
LW
5703 }
5704 else if (op == '+') {
5705 min = 1;
5706 goto do_curly;
a0ed51b3
LW
5707 }
5708 else if (op == '?') {
a0d0e21e
LW
5709 min = 0; max = 1;
5710 goto do_curly;
5711 }
5712 nest_check:
041457d9 5713 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5714 vWARN3(RExC_parse,
b45f050a 5715 "%.*s matches null string many times",
afd78fd5 5716 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5717 origparse);
a0d0e21e
LW
5718 }
5719
b9b4dddf 5720 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5721 nextchar(pRExC_state);
6bda09f9 5722 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5723 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5724 }
b9b4dddf
YO
5725#ifndef REG_ALLOW_MINMOD_SUSPEND
5726 else
5727#endif
5728 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5729 regnode *ender;
5730 nextchar(pRExC_state);
5731 ender = reg_node(pRExC_state, SUCCEED);
5732 REGTAIL(pRExC_state, ret, ender);
5733 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5734 ret->flags = 0;
5735 ender = reg_node(pRExC_state, TAIL);
5736 REGTAIL(pRExC_state, ret, ender);
5737 /*ret= ender;*/
5738 }
5739
5740 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5741 RExC_parse++;
b45f050a
JF
5742 vFAIL("Nested quantifiers");
5743 }
a0d0e21e
LW
5744
5745 return(ret);
a687059c
LW
5746}
5747
fc8cd66c
YO
5748
5749/* reg_namedseq(pRExC_state,UVp)
5750
5751 This is expected to be called by a parser routine that has
5752 recognized'\N' and needs to handle the rest. RExC_parse is
5753 expected to point at the first char following the N at the time
5754 of the call.
5755
5756 If valuep is non-null then it is assumed that we are parsing inside
5757 of a charclass definition and the first codepoint in the resolved
5758 string is returned via *valuep and the routine will return NULL.
5759 In this mode if a multichar string is returned from the charnames
5760 handler a warning will be issued, and only the first char in the
5761 sequence will be examined. If the string returned is zero length
5762 then the value of *valuep is undefined and NON-NULL will
5763 be returned to indicate failure. (This will NOT be a valid pointer
5764 to a regnode.)
5765
5766 If value is null then it is assumed that we are parsing normal text
5767 and inserts a new EXACT node into the program containing the resolved
5768 string and returns a pointer to the new node. If the string is
5769 zerolength a NOTHING node is emitted.
5770
5771 On success RExC_parse is set to the char following the endbrace.
5772 Parsing failures will generate a fatal errorvia vFAIL(...)
5773
5774 NOTE: We cache all results from the charnames handler locally in
5775 the RExC_charnames hash (created on first use) to prevent a charnames
5776 handler from playing silly-buggers and returning a short string and
5777 then a long string for a given pattern. Since the regexp program
5778 size is calculated during an initial parse this would result
5779 in a buffer overrun so we cache to prevent the charname result from
5780 changing during the course of the parse.
5781
5782 */
5783STATIC regnode *
5784S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5785{
5786 char * name; /* start of the content of the name */
5787 char * endbrace; /* endbrace following the name */
5788 SV *sv_str = NULL;
5789 SV *sv_name = NULL;
5790 STRLEN len; /* this has various purposes throughout the code */
5791 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5792 regnode *ret = NULL;
5793
5794 if (*RExC_parse != '{') {
5795 vFAIL("Missing braces on \\N{}");
5796 }
5797 name = RExC_parse+1;
5798 endbrace = strchr(RExC_parse, '}');
5799 if ( ! endbrace ) {
5800 RExC_parse++;
5801 vFAIL("Missing right brace on \\N{}");
5802 }
5803 RExC_parse = endbrace + 1;
5804
5805
5806 /* RExC_parse points at the beginning brace,
5807 endbrace points at the last */
5808 if ( name[0]=='U' && name[1]=='+' ) {
5809 /* its a "unicode hex" notation {U+89AB} */
5810 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5811 | PERL_SCAN_DISALLOW_PREFIX
5812 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5813 UV cp;
196f1508 5814 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5815 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5816 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5817 cp = 0xFFFD;
5818 }
5819 if (cp > 0xff)
5820 RExC_utf8 = 1;
5821 if ( valuep ) {
5822 *valuep = cp;
5823 return NULL;
5824 }
5825 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5826 } else {
5827 /* fetch the charnames handler for this scope */
5828 HV * const table = GvHV(PL_hintgv);
5829 SV **cvp= table ?
5830 hv_fetchs(table, "charnames", FALSE) :
5831 NULL;
5832 SV *cv= cvp ? *cvp : NULL;
5833 HE *he_str;
5834 int count;
5835 /* create an SV with the name as argument */
5836 sv_name = newSVpvn(name, endbrace - name);
5837
5838 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5839 vFAIL2("Constant(\\N{%s}) unknown: "
5840 "(possibly a missing \"use charnames ...\")",
5841 SvPVX(sv_name));
5842 }
5843 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5844 vFAIL2("Constant(\\N{%s}): "
5845 "$^H{charnames} is not defined",SvPVX(sv_name));
5846 }
5847
5848
5849
5850 if (!RExC_charnames) {
5851 /* make sure our cache is allocated */
5852 RExC_charnames = newHV();
6bda09f9 5853 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5854 }
5855 /* see if we have looked this one up before */
5856 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5857 if ( he_str ) {
5858 sv_str = HeVAL(he_str);
5859 cached = 1;
5860 } else {
5861 dSP ;
5862
5863 ENTER ;
5864 SAVETMPS ;
5865 PUSHMARK(SP) ;
5866
5867 XPUSHs(sv_name);
5868
5869 PUTBACK ;
5870
5871 count= call_sv(cv, G_SCALAR);
5872
5873 if (count == 1) { /* XXXX is this right? dmq */
5874 sv_str = POPs;
5875 SvREFCNT_inc_simple_void(sv_str);
5876 }
5877
5878 SPAGAIN ;
5879 PUTBACK ;
5880 FREETMPS ;
5881 LEAVE ;
5882
5883 if ( !sv_str || !SvOK(sv_str) ) {
5884 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5885 "did not return a defined value",SvPVX(sv_name));
5886 }
5887 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5888 cached = 1;
5889 }
5890 }
5891 if (valuep) {
5892 char *p = SvPV(sv_str, len);
5893 if (len) {
5894 STRLEN numlen = 1;
5895 if ( SvUTF8(sv_str) ) {
196f1508 5896 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5897 if (*valuep > 0x7F)
5898 RExC_utf8 = 1;
5899 /* XXXX
5900 We have to turn on utf8 for high bit chars otherwise
5901 we get failures with
5902
5903 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5904 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5905
5906 This is different from what \x{} would do with the same
5907 codepoint, where the condition is > 0xFF.
5908 - dmq
5909 */
5910
5911
5912 } else {
5913 *valuep = (UV)*p;
5914 /* warn if we havent used the whole string? */
5915 }
5916 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5917 vWARN2(RExC_parse,
5918 "Ignoring excess chars from \\N{%s} in character class",
5919 SvPVX(sv_name)
5920 );
5921 }
5922 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5923 vWARN2(RExC_parse,
5924 "Ignoring zero length \\N{%s} in character class",
5925 SvPVX(sv_name)
5926 );
5927 }
5928 if (sv_name)
5929 SvREFCNT_dec(sv_name);
5930 if (!cached)
5931 SvREFCNT_dec(sv_str);
5932 return len ? NULL : (regnode *)&len;
5933 } else if(SvCUR(sv_str)) {
5934
5935 char *s;
5936 char *p, *pend;
5937 STRLEN charlen = 1;
5938 char * parse_start = name-3; /* needed for the offsets */
5939 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5940
5941 ret = reg_node(pRExC_state,
5942 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5943 s= STRING(ret);
5944
5945 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5946 sv_utf8_upgrade(sv_str);
5947 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5948 RExC_utf8= 1;
5949 }
5950
5951 p = SvPV(sv_str, len);
5952 pend = p + len;
5953 /* len is the length written, charlen is the size the char read */
5954 for ( len = 0; p < pend; p += charlen ) {
5955 if (UTF) {
196f1508 5956 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5957 if (FOLD) {
5958 STRLEN foldlen,numlen;
5959 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5960 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5961 /* Emit all the Unicode characters. */
5962
5963 for (foldbuf = tmpbuf;
5964 foldlen;
5965 foldlen -= numlen)
5966 {
5967 uvc = utf8_to_uvchr(foldbuf, &numlen);
5968 if (numlen > 0) {
5969 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5970 s += unilen;
5971 len += unilen;
5972 /* In EBCDIC the numlen
5973 * and unilen can differ. */
5974 foldbuf += numlen;
5975 if (numlen >= foldlen)
5976 break;
5977 }
5978 else
5979 break; /* "Can't happen." */
5980 }
5981 } else {
5982 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5983 if (unilen > 0) {
5984 s += unilen;
5985 len += unilen;
5986 }
5987 }
5988 } else {
5989 len++;
5990 REGC(*p, s++);
5991 }
5992 }
5993 if (SIZE_ONLY) {
5994 RExC_size += STR_SZ(len);
5995 } else {
5996 STR_LEN(ret) = len;
5997 RExC_emit += STR_SZ(len);
5998 }
5999 Set_Node_Cur_Length(ret); /* MJD */
6000 RExC_parse--;
6001 nextchar(pRExC_state);
6002 } else {
6003 ret = reg_node(pRExC_state,NOTHING);
6004 }
6005 if (!cached) {
6006 SvREFCNT_dec(sv_str);
6007 }
6008 if (sv_name) {
6009 SvREFCNT_dec(sv_name);
6010 }
6011 return ret;
6012
6013}
6014
6015
9e08bc66
TS
6016/*
6017 * reg_recode
6018 *
6019 * It returns the code point in utf8 for the value in *encp.
6020 * value: a code value in the source encoding
6021 * encp: a pointer to an Encode object
6022 *
6023 * If the result from Encode is not a single character,
6024 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6025 */
6026STATIC UV
6027S_reg_recode(pTHX_ const char value, SV **encp)
6028{
6029 STRLEN numlen = 1;
6030 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6031 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6032 : SvPVX(sv);
6033 const STRLEN newlen = SvCUR(sv);
6034 UV uv = UNICODE_REPLACEMENT;
6035
6036 if (newlen)
6037 uv = SvUTF8(sv)
6038 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6039 : *(U8*)s;
6040
6041 if (!newlen || numlen != newlen) {
6042 uv = UNICODE_REPLACEMENT;
6043 if (encp)
6044 *encp = NULL;
6045 }
6046 return uv;
6047}
6048
fc8cd66c 6049
a687059c
LW
6050/*
6051 - regatom - the lowest level
6052 *
6053 * Optimization: gobbles an entire sequence of ordinary characters so that
6054 * it can turn them into a single node, which is smaller to store and
6055 * faster to run. Backslashed characters are exceptions, each becoming a
6056 * separate node; the code is simpler that way and it's not worth fixing.
6057 *
7f6f358c
YO
6058 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6059 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6060 */
76e3520e 6061STATIC regnode *
3dab1dad 6062S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6063{
97aff369 6064 dVAR;
cbbf8932 6065 register regnode *ret = NULL;
a0d0e21e 6066 I32 flags;
45948336 6067 char *parse_start = RExC_parse;
3dab1dad
YO
6068 GET_RE_DEBUG_FLAGS_DECL;
6069 DEBUG_PARSE("atom");
a0d0e21e
LW
6070 *flagp = WORST; /* Tentatively. */
6071
6072tryagain:
830247a4 6073 switch (*RExC_parse) {
a0d0e21e 6074 case '^':
830247a4
IZ
6075 RExC_seen_zerolen++;
6076 nextchar(pRExC_state);
bbe252da 6077 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6078 ret = reg_node(pRExC_state, MBOL);
bbe252da 6079 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6080 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6081 else
830247a4 6082 ret = reg_node(pRExC_state, BOL);
fac92740 6083 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6084 break;
6085 case '$':
830247a4 6086 nextchar(pRExC_state);
b81d288d 6087 if (*RExC_parse)
830247a4 6088 RExC_seen_zerolen++;
bbe252da 6089 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6090 ret = reg_node(pRExC_state, MEOL);
bbe252da 6091 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6092 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6093 else
830247a4 6094 ret = reg_node(pRExC_state, EOL);
fac92740 6095 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6096 break;
6097 case '.':
830247a4 6098 nextchar(pRExC_state);
bbe252da 6099 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6100 ret = reg_node(pRExC_state, SANY);
6101 else
6102 ret = reg_node(pRExC_state, REG_ANY);
6103 *flagp |= HASWIDTH|SIMPLE;
830247a4 6104 RExC_naughty++;
fac92740 6105 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6106 break;
6107 case '[':
b45f050a 6108 {
3dab1dad
YO
6109 char * const oregcomp_parse = ++RExC_parse;
6110 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6111 if (*RExC_parse != ']') {
6112 RExC_parse = oregcomp_parse;
b45f050a
JF
6113 vFAIL("Unmatched [");
6114 }
830247a4 6115 nextchar(pRExC_state);
a0d0e21e 6116 *flagp |= HASWIDTH|SIMPLE;
fac92740 6117 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6118 break;
b45f050a 6119 }
a0d0e21e 6120 case '(':
830247a4 6121 nextchar(pRExC_state);
3dab1dad 6122 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6123 if (ret == NULL) {
bf93d4cc 6124 if (flags & TRYAGAIN) {
830247a4 6125 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6126 /* Make parent create an empty node if needed. */
6127 *flagp |= TRYAGAIN;
6128 return(NULL);
6129 }
a0d0e21e 6130 goto tryagain;
bf93d4cc 6131 }
a0d0e21e
LW
6132 return(NULL);
6133 }
c277df42 6134 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6135 break;
6136 case '|':
6137 case ')':
6138 if (flags & TRYAGAIN) {
6139 *flagp |= TRYAGAIN;
6140 return NULL;
6141 }
b45f050a 6142 vFAIL("Internal urp");
a0d0e21e
LW
6143 /* Supposed to be caught earlier. */
6144 break;
85afd4ae 6145 case '{':
830247a4
IZ
6146 if (!regcurly(RExC_parse)) {
6147 RExC_parse++;
85afd4ae
CS
6148 goto defchar;
6149 }
6150 /* FALL THROUGH */
a0d0e21e
LW
6151 case '?':
6152 case '+':
6153 case '*':
830247a4 6154 RExC_parse++;
b45f050a 6155 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6156 break;
6157 case '\\':
830247a4 6158 switch (*++RExC_parse) {
a0d0e21e 6159 case 'A':
830247a4
IZ
6160 RExC_seen_zerolen++;
6161 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6162 *flagp |= SIMPLE;
830247a4 6163 nextchar(pRExC_state);
fac92740 6164 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6165 break;
6166 case 'G':
830247a4
IZ
6167 ret = reg_node(pRExC_state, GPOS);
6168 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6169 *flagp |= SIMPLE;
830247a4 6170 nextchar(pRExC_state);
fac92740 6171 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6172 break;
6173 case 'Z':
830247a4 6174 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6175 *flagp |= SIMPLE;
a1917ab9 6176 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6177 nextchar(pRExC_state);
a0d0e21e 6178 break;
b85d18e9 6179 case 'z':
830247a4 6180 ret = reg_node(pRExC_state, EOS);
b85d18e9 6181 *flagp |= SIMPLE;
830247a4
IZ
6182 RExC_seen_zerolen++; /* Do not optimize RE away */
6183 nextchar(pRExC_state);
fac92740 6184 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6185 break;
4a2d328f 6186 case 'C':
f33976b4
DB
6187 ret = reg_node(pRExC_state, CANY);
6188 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6189 *flagp |= HASWIDTH|SIMPLE;
830247a4 6190 nextchar(pRExC_state);
fac92740 6191 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6192 break;
6193 case 'X':
830247a4 6194 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6195 *flagp |= HASWIDTH;
830247a4 6196 nextchar(pRExC_state);
fac92740 6197 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6198 break;
a0d0e21e 6199 case 'w':
eb160463 6200 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6201 *flagp |= HASWIDTH|SIMPLE;
830247a4 6202 nextchar(pRExC_state);
fac92740 6203 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6204 break;
6205 case 'W':
eb160463 6206 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6207 *flagp |= HASWIDTH|SIMPLE;
830247a4 6208 nextchar(pRExC_state);
fac92740 6209 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6210 break;
6211 case 'b':
830247a4
IZ
6212 RExC_seen_zerolen++;
6213 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6214 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6215 *flagp |= SIMPLE;
830247a4 6216 nextchar(pRExC_state);
fac92740 6217 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6218 break;
6219 case 'B':
830247a4
IZ
6220 RExC_seen_zerolen++;
6221 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6222 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6223 *flagp |= SIMPLE;
830247a4 6224 nextchar(pRExC_state);
fac92740 6225 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6226 break;
6227 case 's':
eb160463 6228 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6229 *flagp |= HASWIDTH|SIMPLE;
830247a4 6230 nextchar(pRExC_state);
fac92740 6231 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6232 break;
6233 case 'S':
eb160463 6234 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6235 *flagp |= HASWIDTH|SIMPLE;
830247a4 6236 nextchar(pRExC_state);
fac92740 6237 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6238 break;
6239 case 'd':
ffc61ed2 6240 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6241 *flagp |= HASWIDTH|SIMPLE;
830247a4 6242 nextchar(pRExC_state);
fac92740 6243 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6244 break;
6245 case 'D':
ffc61ed2 6246 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6247 *flagp |= HASWIDTH|SIMPLE;
830247a4 6248 nextchar(pRExC_state);
fac92740 6249 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6250 break;
a14b48bc
LW
6251 case 'p':
6252 case 'P':
3568d838 6253 {
3dab1dad 6254 char* const oldregxend = RExC_end;
ccb2c380 6255 char* parse_start = RExC_parse - 2;
a14b48bc 6256
830247a4 6257 if (RExC_parse[1] == '{') {
3568d838 6258 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6259 RExC_end = strchr(RExC_parse, '}');
6260 if (!RExC_end) {
3dab1dad 6261 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6262 RExC_parse += 2;
6263 RExC_end = oldregxend;
0da60cf5 6264 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6265 }
830247a4 6266 RExC_end++;
a14b48bc 6267 }
af6f566e 6268 else {
830247a4 6269 RExC_end = RExC_parse + 2;
af6f566e
HS
6270 if (RExC_end > oldregxend)
6271 RExC_end = oldregxend;
6272 }
830247a4 6273 RExC_parse--;
a14b48bc 6274
3dab1dad 6275 ret = regclass(pRExC_state,depth+1);
a14b48bc 6276
830247a4
IZ
6277 RExC_end = oldregxend;
6278 RExC_parse--;
ccb2c380
MP
6279
6280 Set_Node_Offset(ret, parse_start + 2);
6281 Set_Node_Cur_Length(ret);
830247a4 6282 nextchar(pRExC_state);
a14b48bc
LW
6283 *flagp |= HASWIDTH|SIMPLE;
6284 }
6285 break;
fc8cd66c
YO
6286 case 'N':
6287 /* Handle \N{NAME} here and not below because it can be
6288 multicharacter. join_exact() will join them up later on.
6289 Also this makes sure that things like /\N{BLAH}+/ and
6290 \N{BLAH} being multi char Just Happen. dmq*/
6291 ++RExC_parse;
6292 ret= reg_namedseq(pRExC_state, NULL);
6293 break;
0a4db386 6294 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6295 {
6296 char ch= RExC_parse[1];
6297 if (ch != '<' && ch != '\'') {
6298 if (SIZE_ONLY)
6299 vWARN( RExC_parse + 1,
6300 "Possible broken named back reference treated as literal k");
6301 parse_start--;
6302 goto defchar;
6303 } else {
6304 char* name_start = (RExC_parse += 2);
2eccd3b2 6305 U32 num = 0;
0a4db386
YO
6306 SV *sv_dat = reg_scan_name(pRExC_state,
6307 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6308 ch= (ch == '<') ? '>' : '\'';
6309
81714fb9
YO
6310 if (RExC_parse == name_start || *RExC_parse != ch)
6311 vFAIL2("Sequence \\k%c... not terminated",
6312 (ch == '>' ? '<' : ch));
6313
6314 RExC_sawback = 1;
6315 ret = reganode(pRExC_state,
6316 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6317 num);
6318 *flagp |= HASWIDTH;
6319
6320
6321 if (!SIZE_ONLY) {
81714fb9
YO
6322 num = add_data( pRExC_state, 1, "S" );
6323 ARG_SET(ret,num);
f8fc2ecf 6324 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6325 SvREFCNT_inc(sv_dat);
6326 }
6327 /* override incorrect value set in reganode MJD */
6328 Set_Node_Offset(ret, parse_start+1);
6329 Set_Node_Cur_Length(ret); /* MJD */
6330 nextchar(pRExC_state);
6331
6332 }
6333 break;
6334 }
a0d0e21e
LW
6335 case 'n':
6336 case 'r':
6337 case 't':
6338 case 'f':
6339 case 'e':
6340 case 'a':
6341 case 'x':
6342 case 'c':
6343 case '0':
6344 goto defchar;
c74340f9 6345 case 'R':
a0d0e21e
LW
6346 case '1': case '2': case '3': case '4':
6347 case '5': case '6': case '7': case '8': case '9':
6348 {
c74340f9
YO
6349 I32 num;
6350 bool isrel=(*RExC_parse=='R');
6351 if (isrel)
6352 RExC_parse++;
6353 num = atoi(RExC_parse);
6354 if (isrel) {
6355 num = RExC_cpar - num;
6356 if (num < 1)
6357 vFAIL("Reference to nonexistent or unclosed group");
6358 }
830247a4 6359 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6360 goto defchar;
6361 else {
3dab1dad 6362 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6363 while (isDIGIT(*RExC_parse))
6364 RExC_parse++;
b45f050a 6365
c74340f9
YO
6366 if (!SIZE_ONLY) {
6367 if (num > (I32)RExC_rx->nparens)
6368 vFAIL("Reference to nonexistent group");
6369 /* People make this error all the time apparently.
6370 So we cant fail on it, even though we should
6371
6372 else if (num >= RExC_cpar)
6373 vFAIL("Reference to unclosed group will always match");
6374 */
6375 }
830247a4 6376 RExC_sawback = 1;
eb160463
GS
6377 ret = reganode(pRExC_state,
6378 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6379 num);
a0d0e21e 6380 *flagp |= HASWIDTH;
2af232bd 6381
fac92740 6382 /* override incorrect value set in reganode MJD */
2af232bd 6383 Set_Node_Offset(ret, parse_start+1);
fac92740 6384 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6385 RExC_parse--;
6386 nextchar(pRExC_state);
a0d0e21e
LW
6387 }
6388 }
6389 break;
6390 case '\0':
830247a4 6391 if (RExC_parse >= RExC_end)
b45f050a 6392 FAIL("Trailing \\");
a0d0e21e
LW
6393 /* FALL THROUGH */
6394 default:
a0288114 6395 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6396 back into the quick-grab loop below */
45948336 6397 parse_start--;
a0d0e21e
LW
6398 goto defchar;
6399 }
6400 break;
4633a7c4
LW
6401
6402 case '#':
bbe252da 6403 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6404 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6405 RExC_parse++;
830247a4 6406 if (RExC_parse < RExC_end)
4633a7c4
LW
6407 goto tryagain;
6408 }
6409 /* FALL THROUGH */
6410
a0d0e21e 6411 default: {
ba210ebe 6412 register STRLEN len;
58ae7d3f 6413 register UV ender;
a0d0e21e 6414 register char *p;
3dab1dad 6415 char *s;
80aecb99 6416 STRLEN foldlen;
89ebb4a3 6417 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6418
6419 parse_start = RExC_parse - 1;
a0d0e21e 6420
830247a4 6421 RExC_parse++;
a0d0e21e
LW
6422
6423 defchar:
58ae7d3f 6424 ender = 0;
eb160463
GS
6425 ret = reg_node(pRExC_state,
6426 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6427 s = STRING(ret);
830247a4
IZ
6428 for (len = 0, p = RExC_parse - 1;
6429 len < 127 && p < RExC_end;
a0d0e21e
LW
6430 len++)
6431 {
3dab1dad 6432 char * const oldp = p;
5b5a24f7 6433
bbe252da 6434 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6435 p = regwhite(p, RExC_end);
a0d0e21e
LW
6436 switch (*p) {
6437 case '^':
6438 case '$':
6439 case '.':
6440 case '[':
6441 case '(':
6442 case ')':
6443 case '|':
6444 goto loopdone;
6445 case '\\':
6446 switch (*++p) {
6447 case 'A':
1ed8eac0
JF
6448 case 'C':
6449 case 'X':
a0d0e21e
LW
6450 case 'G':
6451 case 'Z':
b85d18e9 6452 case 'z':
a0d0e21e
LW
6453 case 'w':
6454 case 'W':
6455 case 'b':
6456 case 'B':
6457 case 's':
6458 case 'S':
6459 case 'd':
6460 case 'D':
a14b48bc
LW
6461 case 'p':
6462 case 'P':
fc8cd66c 6463 case 'N':
c74340f9 6464 case 'R':
a0d0e21e
LW
6465 --p;
6466 goto loopdone;
6467 case 'n':
6468 ender = '\n';
6469 p++;
a687059c 6470 break;
a0d0e21e
LW
6471 case 'r':
6472 ender = '\r';
6473 p++;
a687059c 6474 break;
a0d0e21e
LW
6475 case 't':
6476 ender = '\t';
6477 p++;
a687059c 6478 break;
a0d0e21e
LW
6479 case 'f':
6480 ender = '\f';
6481 p++;
a687059c 6482 break;
a0d0e21e 6483 case 'e':
c7f1f016 6484 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6485 p++;
a687059c 6486 break;
a0d0e21e 6487 case 'a':
c7f1f016 6488 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6489 p++;
a687059c 6490 break;
a0d0e21e 6491 case 'x':
a0ed51b3 6492 if (*++p == '{') {
1df70142 6493 char* const e = strchr(p, '}');
b81d288d 6494
b45f050a 6495 if (!e) {
830247a4 6496 RExC_parse = p + 1;
b45f050a
JF
6497 vFAIL("Missing right brace on \\x{}");
6498 }
de5f0749 6499 else {
a4c04bdc
NC
6500 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6501 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6502 STRLEN numlen = e - p - 1;
53305cf1 6503 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6504 if (ender > 0xff)
6505 RExC_utf8 = 1;
a0ed51b3
LW
6506 p = e + 1;
6507 }
a0ed51b3
LW
6508 }
6509 else {
a4c04bdc 6510 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6511 STRLEN numlen = 2;
53305cf1 6512 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6513 p += numlen;
6514 }
9e08bc66
TS
6515 if (PL_encoding && ender < 0x100)
6516 goto recode_encoding;
a687059c 6517 break;
a0d0e21e
LW
6518 case 'c':
6519 p++;
bbce6d69 6520 ender = UCHARAT(p++);
6521 ender = toCTRL(ender);
a687059c 6522 break;
a0d0e21e
LW
6523 case '0': case '1': case '2': case '3':case '4':
6524 case '5': case '6': case '7': case '8':case '9':
6525 if (*p == '0' ||
830247a4 6526 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6527 I32 flags = 0;
1df70142 6528 STRLEN numlen = 3;
53305cf1 6529 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6530 p += numlen;
6531 }
6532 else {
6533 --p;
6534 goto loopdone;
a687059c 6535 }
9e08bc66
TS
6536 if (PL_encoding && ender < 0x100)
6537 goto recode_encoding;
6538 break;
6539 recode_encoding:
6540 {
6541 SV* enc = PL_encoding;
6542 ender = reg_recode((const char)(U8)ender, &enc);
6543 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6544 vWARN(p, "Invalid escape in the specified encoding");
6545 RExC_utf8 = 1;
6546 }
a687059c 6547 break;
a0d0e21e 6548 case '\0':
830247a4 6549 if (p >= RExC_end)
b45f050a 6550 FAIL("Trailing \\");
a687059c 6551 /* FALL THROUGH */
a0d0e21e 6552 default:
041457d9 6553 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6554 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6555 goto normal_default;
a0d0e21e
LW
6556 }
6557 break;
a687059c 6558 default:
a0ed51b3 6559 normal_default:
fd400ab9 6560 if (UTF8_IS_START(*p) && UTF) {
1df70142 6561 STRLEN numlen;
5e12f4fb 6562 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6563 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6564 p += numlen;
6565 }
6566 else
6567 ender = *p++;
a0d0e21e 6568 break;
a687059c 6569 }
bbe252da 6570 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6571 p = regwhite(p, RExC_end);
60a8b682
JH
6572 if (UTF && FOLD) {
6573 /* Prime the casefolded buffer. */
ac7e0132 6574 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6575 }
a0d0e21e
LW
6576 if (ISMULT2(p)) { /* Back off on ?+*. */
6577 if (len)
6578 p = oldp;
16ea2a2e 6579 else if (UTF) {
80aecb99 6580 if (FOLD) {
60a8b682 6581 /* Emit all the Unicode characters. */
1df70142 6582 STRLEN numlen;
80aecb99
JH
6583 for (foldbuf = tmpbuf;
6584 foldlen;
6585 foldlen -= numlen) {
6586 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6587 if (numlen > 0) {
71207a34 6588 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6589 s += unilen;
6590 len += unilen;
6591 /* In EBCDIC the numlen
6592 * and unilen can differ. */
9dc45d57 6593 foldbuf += numlen;
47654450
JH
6594 if (numlen >= foldlen)
6595 break;
9dc45d57
JH
6596 }
6597 else
6598 break; /* "Can't happen." */
80aecb99
JH
6599 }
6600 }
6601 else {
71207a34 6602 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6603 if (unilen > 0) {
0ebc6274
JH
6604 s += unilen;
6605 len += unilen;
9dc45d57 6606 }
80aecb99 6607 }
a0ed51b3 6608 }
a0d0e21e
LW
6609 else {
6610 len++;
eb160463 6611 REGC((char)ender, s++);
a0d0e21e
LW
6612 }
6613 break;
a687059c 6614 }
16ea2a2e 6615 if (UTF) {
80aecb99 6616 if (FOLD) {
60a8b682 6617 /* Emit all the Unicode characters. */
1df70142 6618 STRLEN numlen;
80aecb99
JH
6619 for (foldbuf = tmpbuf;
6620 foldlen;
6621 foldlen -= numlen) {
6622 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6623 if (numlen > 0) {
71207a34 6624 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6625 len += unilen;
6626 s += unilen;
6627 /* In EBCDIC the numlen
6628 * and unilen can differ. */
9dc45d57 6629 foldbuf += numlen;
47654450
JH
6630 if (numlen >= foldlen)
6631 break;
9dc45d57
JH
6632 }
6633 else
6634 break;
80aecb99
JH
6635 }
6636 }
6637 else {
71207a34 6638 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6639 if (unilen > 0) {
0ebc6274
JH
6640 s += unilen;
6641 len += unilen;
9dc45d57 6642 }
80aecb99
JH
6643 }
6644 len--;
a0ed51b3
LW
6645 }
6646 else
eb160463 6647 REGC((char)ender, s++);
a0d0e21e
LW
6648 }
6649 loopdone:
830247a4 6650 RExC_parse = p - 1;
fac92740 6651 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6652 nextchar(pRExC_state);
793db0cb
JH
6653 {
6654 /* len is STRLEN which is unsigned, need to copy to signed */
6655 IV iv = len;
6656 if (iv < 0)
6657 vFAIL("Internal disaster");
6658 }
a0d0e21e
LW
6659 if (len > 0)
6660 *flagp |= HASWIDTH;
090f7165 6661 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6662 *flagp |= SIMPLE;
3dab1dad 6663
cd439c50 6664 if (SIZE_ONLY)
830247a4 6665 RExC_size += STR_SZ(len);
3dab1dad
YO
6666 else {
6667 STR_LEN(ret) = len;
830247a4 6668 RExC_emit += STR_SZ(len);
07be1b83 6669 }
3dab1dad 6670 }
a0d0e21e
LW
6671 break;
6672 }
a687059c 6673
a0d0e21e 6674 return(ret);
a687059c
LW
6675}
6676
873ef191 6677STATIC char *
5f66b61c 6678S_regwhite(char *p, const char *e)
5b5a24f7
CS
6679{
6680 while (p < e) {
6681 if (isSPACE(*p))
6682 ++p;
6683 else if (*p == '#') {
6684 do {
6685 p++;
6686 } while (p < e && *p != '\n');
6687 }
6688 else
6689 break;
6690 }
6691 return p;
6692}
6693
b8c5462f
JH
6694/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6695 Character classes ([:foo:]) can also be negated ([:^foo:]).
6696 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6697 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6698 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6699
6700#define POSIXCC_DONE(c) ((c) == ':')
6701#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6702#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6703
b8c5462f 6704STATIC I32
830247a4 6705S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6706{
97aff369 6707 dVAR;
936ed897 6708 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6709
830247a4 6710 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6711 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6712 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6713 const char c = UCHARAT(RExC_parse);
097eb12c 6714 char* const s = RExC_parse++;
b81d288d 6715
9a86a77b 6716 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6717 RExC_parse++;
6718 if (RExC_parse == RExC_end)
620e46c5 6719 /* Grandfather lone [:, [=, [. */
830247a4 6720 RExC_parse = s;
620e46c5 6721 else {
3dab1dad 6722 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6723 assert(*t == c);
6724
9a86a77b 6725 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6726 const char *posixcc = s + 1;
830247a4 6727 RExC_parse++; /* skip over the ending ] */
3dab1dad 6728
b8c5462f 6729 if (*s == ':') {
1df70142
AL
6730 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6731 const I32 skip = t - posixcc;
80916619
NC
6732
6733 /* Initially switch on the length of the name. */
6734 switch (skip) {
6735 case 4:
3dab1dad
YO
6736 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6737 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6738 break;
80916619
NC
6739 case 5:
6740 /* Names all of length 5. */
6741 /* alnum alpha ascii blank cntrl digit graph lower
6742 print punct space upper */
6743 /* Offset 4 gives the best switch position. */
6744 switch (posixcc[4]) {
6745 case 'a':
3dab1dad
YO
6746 if (memEQ(posixcc, "alph", 4)) /* alpha */
6747 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6748 break;
6749 case 'e':
3dab1dad
YO
6750 if (memEQ(posixcc, "spac", 4)) /* space */
6751 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6752 break;
6753 case 'h':
3dab1dad
YO
6754 if (memEQ(posixcc, "grap", 4)) /* graph */
6755 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6756 break;
6757 case 'i':
3dab1dad
YO
6758 if (memEQ(posixcc, "asci", 4)) /* ascii */
6759 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6760 break;
6761 case 'k':
3dab1dad
YO
6762 if (memEQ(posixcc, "blan", 4)) /* blank */
6763 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6764 break;
6765 case 'l':
3dab1dad
YO
6766 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6767 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6768 break;
6769 case 'm':
3dab1dad
YO
6770 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6771 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6772 break;
6773 case 'r':
3dab1dad
YO
6774 if (memEQ(posixcc, "lowe", 4)) /* lower */
6775 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6776 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6777 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6778 break;
6779 case 't':
3dab1dad
YO
6780 if (memEQ(posixcc, "digi", 4)) /* digit */
6781 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6782 else if (memEQ(posixcc, "prin", 4)) /* print */
6783 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6784 else if (memEQ(posixcc, "punc", 4)) /* punct */
6785 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6786 break;
b8c5462f
JH
6787 }
6788 break;
80916619 6789 case 6:
3dab1dad
YO
6790 if (memEQ(posixcc, "xdigit", 6))
6791 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6792 break;
6793 }
80916619
NC
6794
6795 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6796 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6797 t - s - 1, s + 1);
80916619
NC
6798 assert (posixcc[skip] == ':');
6799 assert (posixcc[skip+1] == ']');
b45f050a 6800 } else if (!SIZE_ONLY) {
b8c5462f 6801 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6802
830247a4 6803 /* adjust RExC_parse so the warning shows after
b45f050a 6804 the class closes */
9a86a77b 6805 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6806 RExC_parse++;
b45f050a
JF
6807 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6808 }
b8c5462f
JH
6809 } else {
6810 /* Maternal grandfather:
6811 * "[:" ending in ":" but not in ":]" */
830247a4 6812 RExC_parse = s;
767d463e 6813 }
620e46c5
JH
6814 }
6815 }
6816
b8c5462f
JH
6817 return namedclass;
6818}
6819
6820STATIC void
830247a4 6821S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6822{
97aff369 6823 dVAR;
3dab1dad 6824 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6825 const char *s = RExC_parse;
6826 const char c = *s++;
b8c5462f 6827
3dab1dad 6828 while (isALNUM(*s))
b8c5462f
JH
6829 s++;
6830 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6831 if (ckWARN(WARN_REGEXP))
6832 vWARN3(s+2,
6833 "POSIX syntax [%c %c] belongs inside character classes",
6834 c, c);
b45f050a
JF
6835
6836 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6837 if (POSIXCC_NOTYET(c)) {
830247a4 6838 /* adjust RExC_parse so the error shows after
b45f050a 6839 the class closes */
9a86a77b 6840 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6841 NOOP;
b45f050a
JF
6842 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6843 }
b8c5462f
JH
6844 }
6845 }
620e46c5
JH
6846}
6847
7f6f358c
YO
6848
6849/*
6850 parse a class specification and produce either an ANYOF node that
6851 matches the pattern. If the pattern matches a single char only and
6852 that char is < 256 then we produce an EXACT node instead.
6853*/
76e3520e 6854STATIC regnode *
3dab1dad 6855S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6856{
97aff369 6857 dVAR;
9ef43ace 6858 register UV value = 0;
9a86a77b 6859 register UV nextvalue;
3568d838 6860 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6861 register IV range = 0;
c277df42 6862 register regnode *ret;
ba210ebe 6863 STRLEN numlen;
ffc61ed2 6864 IV namedclass;
cbbf8932 6865 char *rangebegin = NULL;
936ed897 6866 bool need_class = 0;
c445ea15 6867 SV *listsv = NULL;
ffc61ed2 6868 UV n;
9e55ce06 6869 bool optimize_invert = TRUE;
cbbf8932 6870 AV* unicode_alternate = NULL;
1b2d223b
JH
6871#ifdef EBCDIC
6872 UV literal_endpoint = 0;
6873#endif
7f6f358c 6874 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6875
3dab1dad 6876 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6877 case we need to change the emitted regop to an EXACT. */
07be1b83 6878 const char * orig_parse = RExC_parse;
72f13be8 6879 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6880#ifndef DEBUGGING
6881 PERL_UNUSED_ARG(depth);
6882#endif
72f13be8 6883
3dab1dad 6884 DEBUG_PARSE("clas");
7f6f358c
YO
6885
6886 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6887 ret = reganode(pRExC_state, ANYOF, 0);
6888
6889 if (!SIZE_ONLY)
6890 ANYOF_FLAGS(ret) = 0;
6891
9a86a77b 6892 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6893 RExC_naughty++;
6894 RExC_parse++;
6895 if (!SIZE_ONLY)
6896 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6897 }
a0d0e21e 6898
73060fc4 6899 if (SIZE_ONLY) {
830247a4 6900 RExC_size += ANYOF_SKIP;
73060fc4
JH
6901 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6902 }
936ed897 6903 else {
830247a4 6904 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6905 if (FOLD)
6906 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6907 if (LOC)
6908 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6909 ANYOF_BITMAP_ZERO(ret);
396482e1 6910 listsv = newSVpvs("# comment\n");
a0d0e21e 6911 }
b8c5462f 6912
9a86a77b
JH
6913 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6914
b938889d 6915 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6916 checkposixcc(pRExC_state);
b8c5462f 6917
f064b6ad
HS
6918 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6919 if (UCHARAT(RExC_parse) == ']')
6920 goto charclassloop;
ffc61ed2 6921
fc8cd66c 6922parseit:
9a86a77b 6923 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6924
6925 charclassloop:
6926
6927 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6928
73b437c8 6929 if (!range)
830247a4 6930 rangebegin = RExC_parse;
ffc61ed2 6931 if (UTF) {
5e12f4fb 6932 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6933 RExC_end - RExC_parse,
9f7f3913 6934 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6935 RExC_parse += numlen;
6936 }
6937 else
6938 value = UCHARAT(RExC_parse++);
7f6f358c 6939
9a86a77b
JH
6940 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6941 if (value == '[' && POSIXCC(nextvalue))
830247a4 6942 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6943 else if (value == '\\') {
ffc61ed2 6944 if (UTF) {
5e12f4fb 6945 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6946 RExC_end - RExC_parse,
9f7f3913 6947 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6948 RExC_parse += numlen;
6949 }
6950 else
6951 value = UCHARAT(RExC_parse++);
470c3474 6952 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6953 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6954 * be a problem later if we want switch on Unicode.
6955 * A similar issue a little bit later when switching on
6956 * namedclass. --jhi */
ffc61ed2 6957 switch ((I32)value) {
b8c5462f
JH
6958 case 'w': namedclass = ANYOF_ALNUM; break;
6959 case 'W': namedclass = ANYOF_NALNUM; break;
6960 case 's': namedclass = ANYOF_SPACE; break;
6961 case 'S': namedclass = ANYOF_NSPACE; break;
6962 case 'd': namedclass = ANYOF_DIGIT; break;
6963 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6964 case 'N': /* Handle \N{NAME} in class */
6965 {
6966 /* We only pay attention to the first char of
6967 multichar strings being returned. I kinda wonder
6968 if this makes sense as it does change the behaviour
6969 from earlier versions, OTOH that behaviour was broken
6970 as well. */
6971 UV v; /* value is register so we cant & it /grrr */
6972 if (reg_namedseq(pRExC_state, &v)) {
6973 goto parseit;
6974 }
6975 value= v;
6976 }
6977 break;
ffc61ed2
JH
6978 case 'p':
6979 case 'P':
3dab1dad
YO
6980 {
6981 char *e;
af6f566e 6982 if (RExC_parse >= RExC_end)
2a4859cd 6983 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6984 if (*RExC_parse == '{') {
1df70142 6985 const U8 c = (U8)value;
ffc61ed2
JH
6986 e = strchr(RExC_parse++, '}');
6987 if (!e)
0da60cf5 6988 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6989 while (isSPACE(UCHARAT(RExC_parse)))
6990 RExC_parse++;
6991 if (e == RExC_parse)
0da60cf5 6992 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6993 n = e - RExC_parse;
ab13f0c7
JH
6994 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6995 n--;
ffc61ed2
JH
6996 }
6997 else {
6998 e = RExC_parse;
6999 n = 1;
7000 }
7001 if (!SIZE_ONLY) {
ab13f0c7
JH
7002 if (UCHARAT(RExC_parse) == '^') {
7003 RExC_parse++;
7004 n--;
7005 value = value == 'p' ? 'P' : 'p'; /* toggle */
7006 while (isSPACE(UCHARAT(RExC_parse))) {
7007 RExC_parse++;
7008 n--;
7009 }
7010 }
097eb12c
AL
7011 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7012 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7013 }
7014 RExC_parse = e + 1;
7015 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7016 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7017 }
f81125e2 7018 break;
b8c5462f
JH
7019 case 'n': value = '\n'; break;
7020 case 'r': value = '\r'; break;
7021 case 't': value = '\t'; break;
7022 case 'f': value = '\f'; break;
7023 case 'b': value = '\b'; break;
c7f1f016
NIS
7024 case 'e': value = ASCII_TO_NATIVE('\033');break;
7025 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7026 case 'x':
ffc61ed2 7027 if (*RExC_parse == '{') {
a4c04bdc
NC
7028 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7029 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7030 char * const e = strchr(RExC_parse++, '}');
b81d288d 7031 if (!e)
ffc61ed2 7032 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7033
7034 numlen = e - RExC_parse;
7035 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7036 RExC_parse = e + 1;
7037 }
7038 else {
a4c04bdc 7039 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7040 numlen = 2;
7041 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7042 RExC_parse += numlen;
7043 }
9e08bc66
TS
7044 if (PL_encoding && value < 0x100)
7045 goto recode_encoding;
b8c5462f
JH
7046 break;
7047 case 'c':
830247a4 7048 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7049 value = toCTRL(value);
7050 break;
7051 case '0': case '1': case '2': case '3': case '4':
7052 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7053 {
7054 I32 flags = 0;
7055 numlen = 3;
7056 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7057 RExC_parse += numlen;
7058 if (PL_encoding && value < 0x100)
7059 goto recode_encoding;
7060 break;
7061 }
7062 recode_encoding:
7063 {
7064 SV* enc = PL_encoding;
7065 value = reg_recode((const char)(U8)value, &enc);
7066 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7067 vWARN(RExC_parse,
7068 "Invalid escape in the specified encoding");
7069 break;
7070 }
1028017a 7071 default:
041457d9 7072 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7073 vWARN2(RExC_parse,
7074 "Unrecognized escape \\%c in character class passed through",
7075 (int)value);
1028017a 7076 break;
b8c5462f 7077 }
ffc61ed2 7078 } /* end of \blah */
1b2d223b
JH
7079#ifdef EBCDIC
7080 else
7081 literal_endpoint++;
7082#endif
ffc61ed2
JH
7083
7084 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7085
7086 if (!SIZE_ONLY && !need_class)
936ed897 7087 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7088
936ed897 7089 need_class = 1;
ffc61ed2
JH
7090
7091 /* a bad range like a-\d, a-[:digit:] ? */
7092 if (range) {
73b437c8 7093 if (!SIZE_ONLY) {
afd78fd5 7094 if (ckWARN(WARN_REGEXP)) {
097eb12c 7095 const int w =
afd78fd5
JH
7096 RExC_parse >= rangebegin ?
7097 RExC_parse - rangebegin : 0;
830247a4 7098 vWARN4(RExC_parse,
b45f050a 7099 "False [] range \"%*.*s\"",
097eb12c 7100 w, w, rangebegin);
afd78fd5 7101 }
3568d838
JH
7102 if (prevvalue < 256) {
7103 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7104 ANYOF_BITMAP_SET(ret, '-');
7105 }
7106 else {
7107 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7108 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7109 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7110 }
b8c5462f 7111 }
ffc61ed2
JH
7112
7113 range = 0; /* this was not a true range */
73b437c8 7114 }
ffc61ed2 7115
73b437c8 7116 if (!SIZE_ONLY) {
c49a72a9
NC
7117 const char *what = NULL;
7118 char yesno = 0;
7119
3568d838
JH
7120 if (namedclass > OOB_NAMEDCLASS)
7121 optimize_invert = FALSE;
e2962f66
JH
7122 /* Possible truncation here but in some 64-bit environments
7123 * the compiler gets heartburn about switch on 64-bit values.
7124 * A similar issue a little earlier when switching on value.
98f323fa 7125 * --jhi */
e2962f66 7126 switch ((I32)namedclass) {
73b437c8
JH
7127 case ANYOF_ALNUM:
7128 if (LOC)
936ed897 7129 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7130 else {
7131 for (value = 0; value < 256; value++)
7132 if (isALNUM(value))
936ed897 7133 ANYOF_BITMAP_SET(ret, value);
73b437c8 7134 }
c49a72a9
NC
7135 yesno = '+';
7136 what = "Word";
73b437c8
JH
7137 break;
7138 case ANYOF_NALNUM:
7139 if (LOC)
936ed897 7140 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7141 else {
7142 for (value = 0; value < 256; value++)
7143 if (!isALNUM(value))
936ed897 7144 ANYOF_BITMAP_SET(ret, value);
73b437c8 7145 }
c49a72a9
NC
7146 yesno = '!';
7147 what = "Word";
73b437c8 7148 break;
ffc61ed2 7149 case ANYOF_ALNUMC:
73b437c8 7150 if (LOC)
ffc61ed2 7151 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7152 else {
7153 for (value = 0; value < 256; value++)
ffc61ed2 7154 if (isALNUMC(value))
936ed897 7155 ANYOF_BITMAP_SET(ret, value);
73b437c8 7156 }
c49a72a9
NC
7157 yesno = '+';
7158 what = "Alnum";
73b437c8
JH
7159 break;
7160 case ANYOF_NALNUMC:
7161 if (LOC)
936ed897 7162 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7163 else {
7164 for (value = 0; value < 256; value++)
7165 if (!isALNUMC(value))
936ed897 7166 ANYOF_BITMAP_SET(ret, value);
73b437c8 7167 }
c49a72a9
NC
7168 yesno = '!';
7169 what = "Alnum";
73b437c8
JH
7170 break;
7171 case ANYOF_ALPHA:
7172 if (LOC)
936ed897 7173 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7174 else {
7175 for (value = 0; value < 256; value++)
7176 if (isALPHA(value))
936ed897 7177 ANYOF_BITMAP_SET(ret, value);
73b437c8 7178 }
c49a72a9
NC
7179 yesno = '+';
7180 what = "Alpha";
73b437c8
JH
7181 break;
7182 case ANYOF_NALPHA:
7183 if (LOC)
936ed897 7184 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7185 else {
7186 for (value = 0; value < 256; value++)
7187 if (!isALPHA(value))
936ed897 7188 ANYOF_BITMAP_SET(ret, value);
73b437c8 7189 }
c49a72a9
NC
7190 yesno = '!';
7191 what = "Alpha";
73b437c8
JH
7192 break;
7193 case ANYOF_ASCII:
7194 if (LOC)
936ed897 7195 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7196 else {
c7f1f016 7197#ifndef EBCDIC
1ba5c669
JH
7198 for (value = 0; value < 128; value++)
7199 ANYOF_BITMAP_SET(ret, value);
7200#else /* EBCDIC */
ffbc6a93 7201 for (value = 0; value < 256; value++) {
3a3c4447
JH
7202 if (isASCII(value))
7203 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7204 }
1ba5c669 7205#endif /* EBCDIC */
73b437c8 7206 }
c49a72a9
NC
7207 yesno = '+';
7208 what = "ASCII";
73b437c8
JH
7209 break;
7210 case ANYOF_NASCII:
7211 if (LOC)
936ed897 7212 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7213 else {
c7f1f016 7214#ifndef EBCDIC
1ba5c669
JH
7215 for (value = 128; value < 256; value++)
7216 ANYOF_BITMAP_SET(ret, value);
7217#else /* EBCDIC */
ffbc6a93 7218 for (value = 0; value < 256; value++) {
3a3c4447
JH
7219 if (!isASCII(value))
7220 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7221 }
1ba5c669 7222#endif /* EBCDIC */
73b437c8 7223 }
c49a72a9
NC
7224 yesno = '!';
7225 what = "ASCII";
73b437c8 7226 break;
aaa51d5e
JF
7227 case ANYOF_BLANK:
7228 if (LOC)
7229 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7230 else {
7231 for (value = 0; value < 256; value++)
7232 if (isBLANK(value))
7233 ANYOF_BITMAP_SET(ret, value);
7234 }
c49a72a9
NC
7235 yesno = '+';
7236 what = "Blank";
aaa51d5e
JF
7237 break;
7238 case ANYOF_NBLANK:
7239 if (LOC)
7240 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7241 else {
7242 for (value = 0; value < 256; value++)
7243 if (!isBLANK(value))
7244 ANYOF_BITMAP_SET(ret, value);
7245 }
c49a72a9
NC
7246 yesno = '!';
7247 what = "Blank";
aaa51d5e 7248 break;
73b437c8
JH
7249 case ANYOF_CNTRL:
7250 if (LOC)
936ed897 7251 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7252 else {
7253 for (value = 0; value < 256; value++)
7254 if (isCNTRL(value))
936ed897 7255 ANYOF_BITMAP_SET(ret, value);
73b437c8 7256 }
c49a72a9
NC
7257 yesno = '+';
7258 what = "Cntrl";
73b437c8
JH
7259 break;
7260 case ANYOF_NCNTRL:
7261 if (LOC)
936ed897 7262 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7263 else {
7264 for (value = 0; value < 256; value++)
7265 if (!isCNTRL(value))
936ed897 7266 ANYOF_BITMAP_SET(ret, value);
73b437c8 7267 }
c49a72a9
NC
7268 yesno = '!';
7269 what = "Cntrl";
ffc61ed2
JH
7270 break;
7271 case ANYOF_DIGIT:
7272 if (LOC)
7273 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7274 else {
7275 /* consecutive digits assumed */
7276 for (value = '0'; value <= '9'; value++)
7277 ANYOF_BITMAP_SET(ret, value);
7278 }
c49a72a9
NC
7279 yesno = '+';
7280 what = "Digit";
ffc61ed2
JH
7281 break;
7282 case ANYOF_NDIGIT:
7283 if (LOC)
7284 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7285 else {
7286 /* consecutive digits assumed */
7287 for (value = 0; value < '0'; value++)
7288 ANYOF_BITMAP_SET(ret, value);
7289 for (value = '9' + 1; value < 256; value++)
7290 ANYOF_BITMAP_SET(ret, value);
7291 }
c49a72a9
NC
7292 yesno = '!';
7293 what = "Digit";
73b437c8
JH
7294 break;
7295 case ANYOF_GRAPH:
7296 if (LOC)
936ed897 7297 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7298 else {
7299 for (value = 0; value < 256; value++)
7300 if (isGRAPH(value))
936ed897 7301 ANYOF_BITMAP_SET(ret, value);
73b437c8 7302 }
c49a72a9
NC
7303 yesno = '+';
7304 what = "Graph";
73b437c8
JH
7305 break;
7306 case ANYOF_NGRAPH:
7307 if (LOC)
936ed897 7308 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7309 else {
7310 for (value = 0; value < 256; value++)
7311 if (!isGRAPH(value))
936ed897 7312 ANYOF_BITMAP_SET(ret, value);
73b437c8 7313 }
c49a72a9
NC
7314 yesno = '!';
7315 what = "Graph";
73b437c8
JH
7316 break;
7317 case ANYOF_LOWER:
7318 if (LOC)
936ed897 7319 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7320 else {
7321 for (value = 0; value < 256; value++)
7322 if (isLOWER(value))
936ed897 7323 ANYOF_BITMAP_SET(ret, value);
73b437c8 7324 }
c49a72a9
NC
7325 yesno = '+';
7326 what = "Lower";
73b437c8
JH
7327 break;
7328 case ANYOF_NLOWER:
7329 if (LOC)
936ed897 7330 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7331 else {
7332 for (value = 0; value < 256; value++)
7333 if (!isLOWER(value))
936ed897 7334 ANYOF_BITMAP_SET(ret, value);
73b437c8 7335 }
c49a72a9
NC
7336 yesno = '!';
7337 what = "Lower";
73b437c8
JH
7338 break;
7339 case ANYOF_PRINT:
7340 if (LOC)
936ed897 7341 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7342 else {
7343 for (value = 0; value < 256; value++)
7344 if (isPRINT(value))
936ed897 7345 ANYOF_BITMAP_SET(ret, value);
73b437c8 7346 }
c49a72a9
NC
7347 yesno = '+';
7348 what = "Print";
73b437c8
JH
7349 break;
7350 case ANYOF_NPRINT:
7351 if (LOC)
936ed897 7352 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7353 else {
7354 for (value = 0; value < 256; value++)
7355 if (!isPRINT(value))
936ed897 7356 ANYOF_BITMAP_SET(ret, value);
73b437c8 7357 }
c49a72a9
NC
7358 yesno = '!';
7359 what = "Print";
73b437c8 7360 break;
aaa51d5e
JF
7361 case ANYOF_PSXSPC:
7362 if (LOC)
7363 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7364 else {
7365 for (value = 0; value < 256; value++)
7366 if (isPSXSPC(value))
7367 ANYOF_BITMAP_SET(ret, value);
7368 }
c49a72a9
NC
7369 yesno = '+';
7370 what = "Space";
aaa51d5e
JF
7371 break;
7372 case ANYOF_NPSXSPC:
7373 if (LOC)
7374 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7375 else {
7376 for (value = 0; value < 256; value++)
7377 if (!isPSXSPC(value))
7378 ANYOF_BITMAP_SET(ret, value);
7379 }
c49a72a9
NC
7380 yesno = '!';
7381 what = "Space";
aaa51d5e 7382 break;
73b437c8
JH
7383 case ANYOF_PUNCT:
7384 if (LOC)
936ed897 7385 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7386 else {
7387 for (value = 0; value < 256; value++)
7388 if (isPUNCT(value))
936ed897 7389 ANYOF_BITMAP_SET(ret, value);
73b437c8 7390 }
c49a72a9
NC
7391 yesno = '+';
7392 what = "Punct";
73b437c8
JH
7393 break;
7394 case ANYOF_NPUNCT:
7395 if (LOC)
936ed897 7396 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7397 else {
7398 for (value = 0; value < 256; value++)
7399 if (!isPUNCT(value))
936ed897 7400 ANYOF_BITMAP_SET(ret, value);
73b437c8 7401 }
c49a72a9
NC
7402 yesno = '!';
7403 what = "Punct";
ffc61ed2
JH
7404 break;
7405 case ANYOF_SPACE:
7406 if (LOC)
7407 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7408 else {
7409 for (value = 0; value < 256; value++)
7410 if (isSPACE(value))
7411 ANYOF_BITMAP_SET(ret, value);
7412 }
c49a72a9
NC
7413 yesno = '+';
7414 what = "SpacePerl";
ffc61ed2
JH
7415 break;
7416 case ANYOF_NSPACE:
7417 if (LOC)
7418 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7419 else {
7420 for (value = 0; value < 256; value++)
7421 if (!isSPACE(value))
7422 ANYOF_BITMAP_SET(ret, value);
7423 }
c49a72a9
NC
7424 yesno = '!';
7425 what = "SpacePerl";
73b437c8
JH
7426 break;
7427 case ANYOF_UPPER:
7428 if (LOC)
936ed897 7429 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7430 else {
7431 for (value = 0; value < 256; value++)
7432 if (isUPPER(value))
936ed897 7433 ANYOF_BITMAP_SET(ret, value);
73b437c8 7434 }
c49a72a9
NC
7435 yesno = '+';
7436 what = "Upper";
73b437c8
JH
7437 break;
7438 case ANYOF_NUPPER:
7439 if (LOC)
936ed897 7440 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7441 else {
7442 for (value = 0; value < 256; value++)
7443 if (!isUPPER(value))
936ed897 7444 ANYOF_BITMAP_SET(ret, value);
73b437c8 7445 }
c49a72a9
NC
7446 yesno = '!';
7447 what = "Upper";
73b437c8
JH
7448 break;
7449 case ANYOF_XDIGIT:
7450 if (LOC)
936ed897 7451 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7452 else {
7453 for (value = 0; value < 256; value++)
7454 if (isXDIGIT(value))
936ed897 7455 ANYOF_BITMAP_SET(ret, value);
73b437c8 7456 }
c49a72a9
NC
7457 yesno = '+';
7458 what = "XDigit";
73b437c8
JH
7459 break;
7460 case ANYOF_NXDIGIT:
7461 if (LOC)
936ed897 7462 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7463 else {
7464 for (value = 0; value < 256; value++)
7465 if (!isXDIGIT(value))
936ed897 7466 ANYOF_BITMAP_SET(ret, value);
73b437c8 7467 }
c49a72a9
NC
7468 yesno = '!';
7469 what = "XDigit";
73b437c8 7470 break;
f81125e2
JP
7471 case ANYOF_MAX:
7472 /* this is to handle \p and \P */
7473 break;
73b437c8 7474 default:
b45f050a 7475 vFAIL("Invalid [::] class");
73b437c8 7476 break;
b8c5462f 7477 }
c49a72a9
NC
7478 if (what) {
7479 /* Strings such as "+utf8::isWord\n" */
7480 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7481 }
b8c5462f 7482 if (LOC)
936ed897 7483 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7484 continue;
a0d0e21e 7485 }
ffc61ed2
JH
7486 } /* end of namedclass \blah */
7487
a0d0e21e 7488 if (range) {
eb160463 7489 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7490 const int w = RExC_parse - rangebegin;
7491 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7492 range = 0; /* not a valid range */
73b437c8 7493 }
a0d0e21e
LW
7494 }
7495 else {
3568d838 7496 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7497 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7498 RExC_parse[1] != ']') {
7499 RExC_parse++;
ffc61ed2
JH
7500
7501 /* a bad range like \w-, [:word:]- ? */
7502 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7503 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7504 const int w =
afd78fd5
JH
7505 RExC_parse >= rangebegin ?
7506 RExC_parse - rangebegin : 0;
830247a4 7507 vWARN4(RExC_parse,
b45f050a 7508 "False [] range \"%*.*s\"",
097eb12c 7509 w, w, rangebegin);
afd78fd5 7510 }
73b437c8 7511 if (!SIZE_ONLY)
936ed897 7512 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7513 } else
ffc61ed2
JH
7514 range = 1; /* yeah, it's a range! */
7515 continue; /* but do it the next time */
a0d0e21e 7516 }
a687059c 7517 }
ffc61ed2 7518
93733859 7519 /* now is the next time */
07be1b83 7520 /*stored += (value - prevvalue + 1);*/
ae5c130c 7521 if (!SIZE_ONLY) {
3568d838 7522 if (prevvalue < 256) {
1df70142 7523 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7524 IV i;
3568d838 7525#ifdef EBCDIC
1b2d223b
JH
7526 /* In EBCDIC [\x89-\x91] should include
7527 * the \x8e but [i-j] should not. */
7528 if (literal_endpoint == 2 &&
7529 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7530 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7531 {
3568d838
JH
7532 if (isLOWER(prevvalue)) {
7533 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7534 if (isLOWER(i))
7535 ANYOF_BITMAP_SET(ret, i);
7536 } else {
3568d838 7537 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7538 if (isUPPER(i))
7539 ANYOF_BITMAP_SET(ret, i);
7540 }
8ada0baa 7541 }
ffc61ed2 7542 else
8ada0baa 7543#endif
07be1b83
YO
7544 for (i = prevvalue; i <= ceilvalue; i++) {
7545 if (!ANYOF_BITMAP_TEST(ret,i)) {
7546 stored++;
7547 ANYOF_BITMAP_SET(ret, i);
7548 }
7549 }
3568d838 7550 }
a5961de5 7551 if (value > 255 || UTF) {
1df70142
AL
7552 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7553 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7554 stored+=2; /* can't optimize this class */
ffc61ed2 7555 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7556 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7557 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7558 prevnatvalue, natvalue);
7559 }
7560 else if (prevnatvalue == natvalue) {
7561 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7562 if (FOLD) {
89ebb4a3 7563 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7564 STRLEN foldlen;
1df70142 7565 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7566
e294cc5d
JH
7567#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7568 if (RExC_precomp[0] == ':' &&
7569 RExC_precomp[1] == '[' &&
7570 (f == 0xDF || f == 0x92)) {
7571 f = NATIVE_TO_UNI(f);
7572 }
7573#endif
c840d2a2
JH
7574 /* If folding and foldable and a single
7575 * character, insert also the folded version
7576 * to the charclass. */
9e55ce06 7577 if (f != value) {
e294cc5d
JH
7578#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7579 if ((RExC_precomp[0] == ':' &&
7580 RExC_precomp[1] == '[' &&
7581 (f == 0xA2 &&
7582 (value == 0xFB05 || value == 0xFB06))) ?
7583 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7584 foldlen == (STRLEN)UNISKIP(f) )
7585#else
eb160463 7586 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7587#endif
9e55ce06
JH
7588 Perl_sv_catpvf(aTHX_ listsv,
7589 "%04"UVxf"\n", f);
7590 else {
7591 /* Any multicharacter foldings
7592 * require the following transform:
7593 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7594 * where E folds into "pq" and F folds
7595 * into "rst", all other characters
7596 * fold to single characters. We save
7597 * away these multicharacter foldings,
7598 * to be later saved as part of the
7599 * additional "s" data. */
7600 SV *sv;
7601
7602 if (!unicode_alternate)
7603 unicode_alternate = newAV();
7604 sv = newSVpvn((char*)foldbuf, foldlen);
7605 SvUTF8_on(sv);
7606 av_push(unicode_alternate, sv);
7607 }
7608 }
254ba52a 7609
60a8b682
JH
7610 /* If folding and the value is one of the Greek
7611 * sigmas insert a few more sigmas to make the
7612 * folding rules of the sigmas to work right.
7613 * Note that not all the possible combinations
7614 * are handled here: some of them are handled
9e55ce06
JH
7615 * by the standard folding rules, and some of
7616 * them (literal or EXACTF cases) are handled
7617 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7618 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7619 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7620 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7621 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7622 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7623 }
7624 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7625 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7626 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7627 }
7628 }
ffc61ed2 7629 }
1b2d223b
JH
7630#ifdef EBCDIC
7631 literal_endpoint = 0;
7632#endif
8ada0baa 7633 }
ffc61ed2
JH
7634
7635 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7636 }
ffc61ed2 7637
936ed897 7638 if (need_class) {
4f66b38d 7639 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7640 if (SIZE_ONLY)
830247a4 7641 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7642 else
830247a4 7643 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7644 }
ffc61ed2 7645
7f6f358c
YO
7646
7647 if (SIZE_ONLY)
7648 return ret;
7649 /****** !SIZE_ONLY AFTER HERE *********/
7650
7651 if( stored == 1 && value < 256
7652 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7653 ) {
7654 /* optimize single char class to an EXACT node
7655 but *only* when its not a UTF/high char */
07be1b83
YO
7656 const char * cur_parse= RExC_parse;
7657 RExC_emit = (regnode *)orig_emit;
7658 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7659 ret = reg_node(pRExC_state,
7660 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7661 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7662 *STRING(ret)= (char)value;
7663 STR_LEN(ret)= 1;
7664 RExC_emit += STR_SZ(1);
7665 return ret;
7666 }
ae5c130c 7667 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7668 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7669 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7670 ) {
a0ed51b3 7671 for (value = 0; value < 256; ++value) {
936ed897 7672 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7673 UV fold = PL_fold[value];
ffc61ed2
JH
7674
7675 if (fold != value)
7676 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7677 }
7678 }
936ed897 7679 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7680 }
ffc61ed2 7681
ae5c130c 7682 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7683 if (optimize_invert &&
ffc61ed2
JH
7684 /* If the only flag is inversion. */
7685 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7686 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7687 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7688 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7689 }
7f6f358c 7690 {
097eb12c 7691 AV * const av = newAV();
ffc61ed2 7692 SV *rv;
9e55ce06 7693 /* The 0th element stores the character class description
6a0407ee 7694 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7695 * to initialize the appropriate swash (which gets stored in
7696 * the 1st element), and also useful for dumping the regnode.
7697 * The 2nd element stores the multicharacter foldings,
6a0407ee 7698 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7699 av_store(av, 0, listsv);
7700 av_store(av, 1, NULL);
9e55ce06 7701 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7702 rv = newRV_noinc((SV*)av);
19860706 7703 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7704 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7705 ARG_SET(ret, n);
a0ed51b3 7706 }
a0ed51b3
LW
7707 return ret;
7708}
7709
76e3520e 7710STATIC char*
830247a4 7711S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7712{
097eb12c 7713 char* const retval = RExC_parse++;
a0d0e21e 7714
4633a7c4 7715 for (;;) {
830247a4
IZ
7716 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7717 RExC_parse[2] == '#') {
e994fd66
AE
7718 while (*RExC_parse != ')') {
7719 if (RExC_parse == RExC_end)
7720 FAIL("Sequence (?#... not terminated");
830247a4 7721 RExC_parse++;
e994fd66 7722 }
830247a4 7723 RExC_parse++;
4633a7c4
LW
7724 continue;
7725 }
bbe252da 7726 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7727 if (isSPACE(*RExC_parse)) {
7728 RExC_parse++;
748a9306
LW
7729 continue;
7730 }
830247a4 7731 else if (*RExC_parse == '#') {
e994fd66
AE
7732 while (RExC_parse < RExC_end)
7733 if (*RExC_parse++ == '\n') break;
748a9306
LW
7734 continue;
7735 }
748a9306 7736 }
4633a7c4 7737 return retval;
a0d0e21e 7738 }
a687059c
LW
7739}
7740
7741/*
c277df42 7742- reg_node - emit a node
a0d0e21e 7743*/
76e3520e 7744STATIC regnode * /* Location. */
830247a4 7745S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7746{
97aff369 7747 dVAR;
c277df42 7748 register regnode *ptr;
504618e9 7749 regnode * const ret = RExC_emit;
07be1b83 7750 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7751
c277df42 7752 if (SIZE_ONLY) {
830247a4
IZ
7753 SIZE_ALIGN(RExC_size);
7754 RExC_size += 1;
a0d0e21e
LW
7755 return(ret);
7756 }
e2e6a0f1
YO
7757#ifdef DEBUGGING
7758 if (OP(RExC_emit) == 255)
7759 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7760 reg_name[op], OP(RExC_emit));
7761#endif
c277df42 7762 NODE_ALIGN_FILL(ret);
a0d0e21e 7763 ptr = ret;
c277df42 7764 FILL_ADVANCE_NODE(ptr, op);
fac92740 7765 if (RExC_offsets) { /* MJD */
07be1b83 7766 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7767 "reg_node", __LINE__,
7768 reg_name[op],
07be1b83
YO
7769 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7770 ? "Overwriting end of array!\n" : "OK",
7771 (UV)(RExC_emit - RExC_emit_start),
7772 (UV)(RExC_parse - RExC_start),
7773 (UV)RExC_offsets[0]));
ccb2c380 7774 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7775 }
07be1b83 7776
830247a4 7777 RExC_emit = ptr;
a0d0e21e 7778 return(ret);
a687059c
LW
7779}
7780
7781/*
a0d0e21e
LW
7782- reganode - emit a node with an argument
7783*/
76e3520e 7784STATIC regnode * /* Location. */
830247a4 7785S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7786{
97aff369 7787 dVAR;
c277df42 7788 register regnode *ptr;
504618e9 7789 regnode * const ret = RExC_emit;
07be1b83 7790 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7791
c277df42 7792 if (SIZE_ONLY) {
830247a4
IZ
7793 SIZE_ALIGN(RExC_size);
7794 RExC_size += 2;
6bda09f9
YO
7795 /*
7796 We can't do this:
7797
7798 assert(2==regarglen[op]+1);
7799
7800 Anything larger than this has to allocate the extra amount.
7801 If we changed this to be:
7802
7803 RExC_size += (1 + regarglen[op]);
7804
7805 then it wouldn't matter. Its not clear what side effect
7806 might come from that so its not done so far.
7807 -- dmq
7808 */
a0d0e21e
LW
7809 return(ret);
7810 }
e2e6a0f1
YO
7811#ifdef DEBUGGING
7812 if (OP(RExC_emit) == 255)
7813 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7814#endif
c277df42 7815 NODE_ALIGN_FILL(ret);
a0d0e21e 7816 ptr = ret;
c277df42 7817 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7818 if (RExC_offsets) { /* MJD */
07be1b83 7819 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7820 "reganode",
ccb2c380
MP
7821 __LINE__,
7822 reg_name[op],
07be1b83 7823 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7824 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7825 (UV)(RExC_emit - RExC_emit_start),
7826 (UV)(RExC_parse - RExC_start),
7827 (UV)RExC_offsets[0]));
ccb2c380 7828 Set_Cur_Node_Offset;
fac92740
MJD
7829 }
7830
830247a4 7831 RExC_emit = ptr;
a0d0e21e 7832 return(ret);
fe14fcc3
LW
7833}
7834
7835/*
cd439c50 7836- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7837*/
71207a34
AL
7838STATIC STRLEN
7839S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7840{
97aff369 7841 dVAR;
71207a34 7842 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7843}
7844
7845/*
a0d0e21e
LW
7846- reginsert - insert an operator in front of already-emitted operand
7847*
7848* Means relocating the operand.
7849*/
76e3520e 7850STATIC void
6bda09f9 7851S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7852{
97aff369 7853 dVAR;
c277df42
IZ
7854 register regnode *src;
7855 register regnode *dst;
7856 register regnode *place;
504618e9 7857 const int offset = regarglen[(U8)op];
6bda09f9 7858 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7859 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7860/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7861 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7862 if (SIZE_ONLY) {
6bda09f9 7863 RExC_size += size;
a0d0e21e
LW
7864 return;
7865 }
a687059c 7866
830247a4 7867 src = RExC_emit;
6bda09f9 7868 RExC_emit += size;
830247a4 7869 dst = RExC_emit;
40d049e4 7870 if (RExC_open_parens) {
6bda09f9 7871 int paren;
6d99fb9b 7872 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7873 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7874 if ( RExC_open_parens[paren] >= opnd ) {
7875 DEBUG_PARSE_FMT("open"," - %d",size);
7876 RExC_open_parens[paren] += size;
7877 } else {
7878 DEBUG_PARSE_FMT("open"," - %s","ok");
7879 }
7880 if ( RExC_close_parens[paren] >= opnd ) {
7881 DEBUG_PARSE_FMT("close"," - %d",size);
7882 RExC_close_parens[paren] += size;
7883 } else {
7884 DEBUG_PARSE_FMT("close"," - %s","ok");
7885 }
7886 }
6bda09f9 7887 }
40d049e4 7888
fac92740 7889 while (src > opnd) {
c277df42 7890 StructCopy(--src, --dst, regnode);
fac92740 7891 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7892 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7893 "reg_insert",
ccb2c380
MP
7894 __LINE__,
7895 reg_name[op],
07be1b83
YO
7896 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7897 ? "Overwriting end of array!\n" : "OK",
7898 (UV)(src - RExC_emit_start),
7899 (UV)(dst - RExC_emit_start),
7900 (UV)RExC_offsets[0]));
ccb2c380
MP
7901 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7902 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7903 }
7904 }
7905
a0d0e21e
LW
7906
7907 place = opnd; /* Op node, where operand used to be. */
fac92740 7908 if (RExC_offsets) { /* MJD */
07be1b83 7909 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7910 "reginsert",
ccb2c380
MP
7911 __LINE__,
7912 reg_name[op],
07be1b83 7913 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7914 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7915 (UV)(place - RExC_emit_start),
7916 (UV)(RExC_parse - RExC_start),
786e8c11 7917 (UV)RExC_offsets[0]));
ccb2c380 7918 Set_Node_Offset(place, RExC_parse);
45948336 7919 Set_Node_Length(place, 1);
fac92740 7920 }
c277df42
IZ
7921 src = NEXTOPER(place);
7922 FILL_ADVANCE_NODE(place, op);
7923 Zero(src, offset, regnode);
a687059c
LW
7924}
7925
7926/*
c277df42 7927- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7928- SEE ALSO: regtail_study
a0d0e21e 7929*/
097eb12c 7930/* TODO: All three parms should be const */
76e3520e 7931STATIC void
3dab1dad 7932S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7933{
97aff369 7934 dVAR;
c277df42 7935 register regnode *scan;
72f13be8 7936 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7937#ifndef DEBUGGING
7938 PERL_UNUSED_ARG(depth);
7939#endif
a0d0e21e 7940
c277df42 7941 if (SIZE_ONLY)
a0d0e21e
LW
7942 return;
7943
7944 /* Find last node. */
7945 scan = p;
7946 for (;;) {
504618e9 7947 regnode * const temp = regnext(scan);
3dab1dad
YO
7948 DEBUG_PARSE_r({
7949 SV * const mysv=sv_newmortal();
7950 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7951 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7952 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7953 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7954 (temp == NULL ? "->" : ""),
7955 (temp == NULL ? reg_name[OP(val)] : "")
7956 );
3dab1dad
YO
7957 });
7958 if (temp == NULL)
7959 break;
7960 scan = temp;
7961 }
7962
7963 if (reg_off_by_arg[OP(scan)]) {
7964 ARG_SET(scan, val - scan);
7965 }
7966 else {
7967 NEXT_OFF(scan) = val - scan;
7968 }
7969}
7970
07be1b83 7971#ifdef DEBUGGING
3dab1dad
YO
7972/*
7973- regtail_study - set the next-pointer at the end of a node chain of p to val.
7974- Look for optimizable sequences at the same time.
7975- currently only looks for EXACT chains.
07be1b83
YO
7976
7977This is expermental code. The idea is to use this routine to perform
7978in place optimizations on branches and groups as they are constructed,
7979with the long term intention of removing optimization from study_chunk so
7980that it is purely analytical.
7981
7982Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7983to control which is which.
7984
3dab1dad
YO
7985*/
7986/* TODO: All four parms should be const */
07be1b83 7987
3dab1dad
YO
7988STATIC U8
7989S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7990{
7991 dVAR;
7992 register regnode *scan;
07be1b83
YO
7993 U8 exact = PSEUDO;
7994#ifdef EXPERIMENTAL_INPLACESCAN
7995 I32 min = 0;
7996#endif
7997
3dab1dad
YO
7998 GET_RE_DEBUG_FLAGS_DECL;
7999
07be1b83 8000
3dab1dad
YO
8001 if (SIZE_ONLY)
8002 return exact;
8003
8004 /* Find last node. */
8005
8006 scan = p;
8007 for (;;) {
8008 regnode * const temp = regnext(scan);
07be1b83
YO
8009#ifdef EXPERIMENTAL_INPLACESCAN
8010 if (PL_regkind[OP(scan)] == EXACT)
8011 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8012 return EXACT;
8013#endif
3dab1dad
YO
8014 if ( exact ) {
8015 switch (OP(scan)) {
8016 case EXACT:
8017 case EXACTF:
8018 case EXACTFL:
8019 if( exact == PSEUDO )
8020 exact= OP(scan);
07be1b83
YO
8021 else if ( exact != OP(scan) )
8022 exact= 0;
3dab1dad
YO
8023 case NOTHING:
8024 break;
8025 default:
8026 exact= 0;
8027 }
8028 }
8029 DEBUG_PARSE_r({
8030 SV * const mysv=sv_newmortal();
8031 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8032 regprop(RExC_rx, mysv, scan);
eaf3ca90 8033 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8034 SvPV_nolen_const(mysv),
eaf3ca90
YO
8035 REG_NODE_NUM(scan),
8036 reg_name[exact]);
3dab1dad 8037 });
a0d0e21e
LW
8038 if (temp == NULL)
8039 break;
8040 scan = temp;
8041 }
07be1b83
YO
8042 DEBUG_PARSE_r({
8043 SV * const mysv_val=sv_newmortal();
8044 DEBUG_PARSE_MSG("");
8045 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8046 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8047 SvPV_nolen_const(mysv_val),
8048 (IV)REG_NODE_NUM(val),
8049 (IV)(val - scan)
07be1b83
YO
8050 );
8051 });
c277df42
IZ
8052 if (reg_off_by_arg[OP(scan)]) {
8053 ARG_SET(scan, val - scan);
a0ed51b3
LW
8054 }
8055 else {
c277df42
IZ
8056 NEXT_OFF(scan) = val - scan;
8057 }
3dab1dad
YO
8058
8059 return exact;
a687059c 8060}
07be1b83 8061#endif
a687059c
LW
8062
8063/*
a687059c
LW
8064 - regcurly - a little FSA that accepts {\d+,?\d*}
8065 */
79072805 8066STATIC I32
5f66b61c 8067S_regcurly(register const char *s)
a687059c
LW
8068{
8069 if (*s++ != '{')
8070 return FALSE;
f0fcb552 8071 if (!isDIGIT(*s))
a687059c 8072 return FALSE;
f0fcb552 8073 while (isDIGIT(*s))
a687059c
LW
8074 s++;
8075 if (*s == ',')
8076 s++;
f0fcb552 8077 while (isDIGIT(*s))
a687059c
LW
8078 s++;
8079 if (*s != '}')
8080 return FALSE;
8081 return TRUE;
8082}
8083
a687059c
LW
8084
8085/*
fd181c75 8086 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8087 */
8088void
097eb12c 8089Perl_regdump(pTHX_ const regexp *r)
a687059c 8090{
35ff7856 8091#ifdef DEBUGGING
97aff369 8092 dVAR;
c445ea15 8093 SV * const sv = sv_newmortal();
ab3bbdeb 8094 SV *dsv= sv_newmortal();
f8fc2ecf 8095 RXi_GET_DECL(r,ri);
a687059c 8096
f8fc2ecf 8097 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8098
8099 /* Header fields of interest. */
ab3bbdeb
YO
8100 if (r->anchored_substr) {
8101 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8102 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8103 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8104 "anchored %s%s at %"IVdf" ",
8105 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8106 (IV)r->anchored_offset);
ab3bbdeb
YO
8107 } else if (r->anchored_utf8) {
8108 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8109 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8110 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8111 "anchored utf8 %s%s at %"IVdf" ",
8112 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8113 (IV)r->anchored_offset);
ab3bbdeb
YO
8114 }
8115 if (r->float_substr) {
8116 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8117 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8118 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8119 "floating %s%s at %"IVdf"..%"UVuf" ",
8120 s, RE_SV_TAIL(r->float_substr),
7b0972df 8121 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8122 } else if (r->float_utf8) {
8123 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8124 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8125 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8126 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8127 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8128 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8129 }
33b8afdf 8130 if (r->check_substr || r->check_utf8)
b81d288d 8131 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8132 (const char *)
8133 (r->check_substr == r->float_substr
8134 && r->check_utf8 == r->float_utf8
8135 ? "(checking floating" : "(checking anchored"));
bbe252da 8136 if (r->extflags & RXf_NOSCAN)
c277df42 8137 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8138 if (r->extflags & RXf_CHECK_ALL)
c277df42 8139 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8140 if (r->check_substr || r->check_utf8)
c277df42
IZ
8141 PerlIO_printf(Perl_debug_log, ") ");
8142
f8fc2ecf
YO
8143 if (ri->regstclass) {
8144 regprop(r, sv, ri->regstclass);
1de06328 8145 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8146 }
bbe252da 8147 if (r->extflags & RXf_ANCH) {
774d564b 8148 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8149 if (r->extflags & RXf_ANCH_BOL)
774d564b 8150 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8151 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8152 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8153 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8154 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8155 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8156 PerlIO_printf(Perl_debug_log, "(GPOS)");
8157 PerlIO_putc(Perl_debug_log, ' ');
8158 }
bbe252da 8159 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8160 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8161 if (r->intflags & PREGf_SKIP)
760ac839 8162 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8163 if (r->intflags & PREGf_IMPLICIT)
760ac839 8164 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8165 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8166 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8167 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8168 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8169#else
96a5add6 8170 PERL_UNUSED_CONTEXT;
65e66c80 8171 PERL_UNUSED_ARG(r);
17c3b450 8172#endif /* DEBUGGING */
a687059c
LW
8173}
8174
8175/*
a0d0e21e
LW
8176- regprop - printable representation of opcode
8177*/
46fc3d4c 8178void
32fc9b6a 8179Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8180{
35ff7856 8181#ifdef DEBUGGING
97aff369 8182 dVAR;
9b155405 8183 register int k;
f8fc2ecf 8184 RXi_GET_DECL(prog,progi);
1de06328 8185 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8186
a0d0e21e 8187
54dc92de 8188 sv_setpvn(sv, "", 0);
8aa23a47 8189
03363afd 8190 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8191 /* It would be nice to FAIL() here, but this may be called from
8192 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8193 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8194 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8195
3dab1dad 8196 k = PL_regkind[OP(o)];
9b155405 8197
2a782b5b 8198 if (k == EXACT) {
396482e1 8199 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8200 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8201 * is a crude hack but it may be the best for now since
8202 * we have no flag "this EXACTish node was UTF-8"
8203 * --jhi */
8204 const char * const s =
ddc5bc0f 8205 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8206 PL_colors[0], PL_colors[1],
8207 PERL_PV_ESCAPE_UNI_DETECT |
8208 PERL_PV_PRETTY_ELIPSES |
8209 PERL_PV_PRETTY_LTGT
8210 );
8211 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8212 } else if (k == TRIE) {
3dab1dad 8213 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8214 * progi->data isn't available here */
1de06328
YO
8215 const char op = OP(o);
8216 const I32 n = ARG(o);
8217 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8218 (reg_ac_data *)progi->data->data[n] :
1de06328
YO
8219 NULL;
8220 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
f8fc2ecf 8221 (reg_trie_data*)progi->data->data[n] :
1de06328
YO
8222 ac->trie;
8223
8224 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8225 DEBUG_TRIE_COMPILE_r(
8226 Perl_sv_catpvf(aTHX_ sv,
8227 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8228 (UV)trie->startstate,
1e2e3d02 8229 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8230 (UV)trie->wordcount,
8231 (UV)trie->minlen,
8232 (UV)trie->maxlen,
8233 (UV)TRIE_CHARCOUNT(trie),
8234 (UV)trie->uniquecharcount
8235 )
8236 );
8237 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8238 int i;
8239 int rangestart = -1;
f46cb337 8240 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8241 Perl_sv_catpvf(aTHX_ sv, "[");
8242 for (i = 0; i <= 256; i++) {
8243 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8244 if (rangestart == -1)
8245 rangestart = i;
8246 } else if (rangestart != -1) {
8247 if (i <= rangestart + 3)
8248 for (; rangestart < i; rangestart++)
8249 put_byte(sv, rangestart);
8250 else {
8251 put_byte(sv, rangestart);
8252 sv_catpvs(sv, "-");
8253 put_byte(sv, i - 1);
8254 }
8255 rangestart = -1;
8256 }
8257 }
8258 Perl_sv_catpvf(aTHX_ sv, "]");
8259 }
8260
a3621e74 8261 } else if (k == CURLY) {
cb434fcc 8262 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8263 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8264 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8265 }
2c2d71f5
JH
8266 else if (k == WHILEM && o->flags) /* Ordinal/of */
8267 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8268 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8269 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8270 else if (k == GOSUB)
6bda09f9 8271 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8272 else if (k == VERB) {
8273 if (!o->flags)
8274 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8275 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8276 } else if (k == LOGICAL)
04ebc1ab 8277 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8278 else if (k == ANYOF) {
8279 int i, rangestart = -1;
2d03de9c 8280 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8281
8282 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8283 static const char * const anyofs[] = {
653099ff
GS
8284 "\\w",
8285 "\\W",
8286 "\\s",
8287 "\\S",
8288 "\\d",
8289 "\\D",
8290 "[:alnum:]",
8291 "[:^alnum:]",
8292 "[:alpha:]",
8293 "[:^alpha:]",
8294 "[:ascii:]",
8295 "[:^ascii:]",
8296 "[:ctrl:]",
8297 "[:^ctrl:]",
8298 "[:graph:]",
8299 "[:^graph:]",
8300 "[:lower:]",
8301 "[:^lower:]",
8302 "[:print:]",
8303 "[:^print:]",
8304 "[:punct:]",
8305 "[:^punct:]",
8306 "[:upper:]",
aaa51d5e 8307 "[:^upper:]",
653099ff 8308 "[:xdigit:]",
aaa51d5e
JF
8309 "[:^xdigit:]",
8310 "[:space:]",
8311 "[:^space:]",
8312 "[:blank:]",
8313 "[:^blank:]"
653099ff
GS
8314 };
8315
19860706 8316 if (flags & ANYOF_LOCALE)
396482e1 8317 sv_catpvs(sv, "{loc}");
19860706 8318 if (flags & ANYOF_FOLD)
396482e1 8319 sv_catpvs(sv, "{i}");
653099ff 8320 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8321 if (flags & ANYOF_INVERT)
396482e1 8322 sv_catpvs(sv, "^");
ffc61ed2
JH
8323 for (i = 0; i <= 256; i++) {
8324 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8325 if (rangestart == -1)
8326 rangestart = i;
8327 } else if (rangestart != -1) {
8328 if (i <= rangestart + 3)
8329 for (; rangestart < i; rangestart++)
653099ff 8330 put_byte(sv, rangestart);
ffc61ed2
JH
8331 else {
8332 put_byte(sv, rangestart);
396482e1 8333 sv_catpvs(sv, "-");
ffc61ed2 8334 put_byte(sv, i - 1);
653099ff 8335 }
ffc61ed2 8336 rangestart = -1;
653099ff 8337 }
847a199f 8338 }
ffc61ed2
JH
8339
8340 if (o->flags & ANYOF_CLASS)
bb7a0f54 8341 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8342 if (ANYOF_CLASS_TEST(o,i))
8343 sv_catpv(sv, anyofs[i]);
8344
8345 if (flags & ANYOF_UNICODE)
396482e1 8346 sv_catpvs(sv, "{unicode}");
1aa99e6b 8347 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8348 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8349
8350 {
8351 SV *lv;
32fc9b6a 8352 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8353
ffc61ed2
JH
8354 if (lv) {
8355 if (sw) {
89ebb4a3 8356 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8357
ffc61ed2 8358 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8359 uvchr_to_utf8(s, i);
ffc61ed2 8360
3568d838 8361 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8362 if (rangestart == -1)
8363 rangestart = i;
8364 } else if (rangestart != -1) {
ffc61ed2
JH
8365 if (i <= rangestart + 3)
8366 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8367 const U8 * const e = uvchr_to_utf8(s,rangestart);
8368 U8 *p;
8369 for(p = s; p < e; p++)
ffc61ed2
JH
8370 put_byte(sv, *p);
8371 }
8372 else {
2d03de9c
AL
8373 const U8 *e = uvchr_to_utf8(s,rangestart);
8374 U8 *p;
8375 for (p = s; p < e; p++)
ffc61ed2 8376 put_byte(sv, *p);
396482e1 8377 sv_catpvs(sv, "-");
2d03de9c
AL
8378 e = uvchr_to_utf8(s, i-1);
8379 for (p = s; p < e; p++)
1df70142 8380 put_byte(sv, *p);
ffc61ed2
JH
8381 }
8382 rangestart = -1;
8383 }
19860706 8384 }
ffc61ed2 8385
396482e1 8386 sv_catpvs(sv, "..."); /* et cetera */
19860706 8387 }
fde631ed 8388
ffc61ed2 8389 {
2e0de35c 8390 char *s = savesvpv(lv);
c445ea15 8391 char * const origs = s;
b81d288d 8392
3dab1dad
YO
8393 while (*s && *s != '\n')
8394 s++;
b81d288d 8395
ffc61ed2 8396 if (*s == '\n') {
2d03de9c 8397 const char * const t = ++s;
ffc61ed2
JH
8398
8399 while (*s) {
8400 if (*s == '\n')
8401 *s = ' ';
8402 s++;
8403 }
8404 if (s[-1] == ' ')
8405 s[-1] = 0;
8406
8407 sv_catpv(sv, t);
fde631ed 8408 }
b81d288d 8409
ffc61ed2 8410 Safefree(origs);
fde631ed
JH
8411 }
8412 }
653099ff 8413 }
ffc61ed2 8414
653099ff
GS
8415 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8416 }
9b155405 8417 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8418 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8419#else
96a5add6 8420 PERL_UNUSED_CONTEXT;
65e66c80
SP
8421 PERL_UNUSED_ARG(sv);
8422 PERL_UNUSED_ARG(o);
f9049ba1 8423 PERL_UNUSED_ARG(prog);
17c3b450 8424#endif /* DEBUGGING */
35ff7856 8425}
a687059c 8426
cad2e5aa
JH
8427SV *
8428Perl_re_intuit_string(pTHX_ regexp *prog)
8429{ /* Assume that RE_INTUIT is set */
97aff369 8430 dVAR;
a3621e74 8431 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8432 PERL_UNUSED_CONTEXT;
8433
a3621e74 8434 DEBUG_COMPILE_r(
cfd0369c 8435 {
2d03de9c 8436 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8437 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8438
8439 if (!PL_colorset) reginitcolors();
8440 PerlIO_printf(Perl_debug_log,
a0288114 8441 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8442 PL_colors[4],
8443 prog->check_substr ? "" : "utf8 ",
8444 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8445 s,
8446 PL_colors[1],
8447 (strlen(s) > 60 ? "..." : ""));
8448 } );
8449
33b8afdf 8450 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8451}
8452
84da74a7
YO
8453/*
8454 pregfree - free a regexp
8455
8456 See regdupe below if you change anything here.
8457*/
8458
2b69d0c2 8459void
864dbfa3 8460Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8461{
27da23d5 8462 dVAR;
f8fc2ecf 8463 RXi_GET_DECL(r,ri);
fc32ee4a 8464 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8465
7821416a
IZ
8466 if (!r || (--r->refcnt > 0))
8467 return;
ab3bbdeb 8468 DEBUG_COMPILE_r({
0df25f3d
YO
8469 if (!PL_colorset)
8470 reginitcolors();
5c9f2f80
YO
8471 {
8472 SV *dsv= sv_newmortal();
bbe252da 8473 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
ab3bbdeb
YO
8474 dsv, r->precomp, r->prelen, 60);
8475 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8476 PL_colors[4],PL_colors[5],s);
8477 }
9e55ce06 8478 });
cad2e5aa 8479
43c5f42d
NC
8480 /* gcov results gave these as non-null 100% of the time, so there's no
8481 optimisation in checking them before calling Safefree */
8482 Safefree(r->precomp);
f8fc2ecf 8483 Safefree(ri->offsets); /* 20010421 MJD */
ed252734 8484 RX_MATCH_COPY_FREE(r);
f8c7b90f 8485#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8486 if (r->saved_copy)
8487 SvREFCNT_dec(r->saved_copy);
8488#endif
a193d654
GS
8489 if (r->substrs) {
8490 if (r->anchored_substr)
8491 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8492 if (r->anchored_utf8)
8493 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8494 if (r->float_substr)
8495 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8496 if (r->float_utf8)
8497 SvREFCNT_dec(r->float_utf8);
2779dcf1 8498 Safefree(r->substrs);
a193d654 8499 }
81714fb9
YO
8500 if (r->paren_names)
8501 SvREFCNT_dec(r->paren_names);
f8fc2ecf
YO
8502 if (ri->data) {
8503 int n = ri->data->count;
f3548bdc
DM
8504 PAD* new_comppad = NULL;
8505 PAD* old_comppad;
4026c95a 8506 PADOFFSET refcnt;
dfad63ad 8507
c277df42 8508 while (--n >= 0) {
261faec3 8509 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8510 switch (ri->data->what[n]) {
c277df42 8511 case 's':
81714fb9 8512 case 'S':
f8fc2ecf 8513 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8514 break;
653099ff 8515 case 'f':
f8fc2ecf 8516 Safefree(ri->data->data[n]);
653099ff 8517 break;
dfad63ad 8518 case 'p':
f8fc2ecf 8519 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8520 break;
c277df42 8521 case 'o':
dfad63ad 8522 if (new_comppad == NULL)
cea2e8a9 8523 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8524 PAD_SAVE_LOCAL(old_comppad,
8525 /* Watch out for global destruction's random ordering. */
c445ea15 8526 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8527 );
b34c0dd4 8528 OP_REFCNT_LOCK;
f8fc2ecf 8529 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8530 OP_REFCNT_UNLOCK;
8531 if (!refcnt)
f8fc2ecf 8532 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8533
f3548bdc 8534 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8535 SvREFCNT_dec((SV*)new_comppad);
8536 new_comppad = NULL;
c277df42
IZ
8537 break;
8538 case 'n':
9e55ce06 8539 break;
07be1b83 8540 case 'T':
be8e71aa
YO
8541 { /* Aho Corasick add-on structure for a trie node.
8542 Used in stclass optimization only */
07be1b83 8543 U32 refcount;
f8fc2ecf 8544 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8545 OP_REFCNT_LOCK;
8546 refcount = --aho->refcount;
8547 OP_REFCNT_UNLOCK;
8548 if ( !refcount ) {
8549 Safefree(aho->states);
8550 Safefree(aho->fail);
8551 aho->trie=NULL; /* not necessary to free this as it is
8552 handled by the 't' case */
f8fc2ecf
YO
8553 Safefree(ri->data->data[n]); /* do this last!!!! */
8554 Safefree(ri->regstclass);
07be1b83
YO
8555 }
8556 }
8557 break;
a3621e74 8558 case 't':
07be1b83 8559 {
be8e71aa 8560 /* trie structure. */
07be1b83 8561 U32 refcount;
f8fc2ecf 8562 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8563 OP_REFCNT_LOCK;
8564 refcount = --trie->refcount;
8565 OP_REFCNT_UNLOCK;
8566 if ( !refcount ) {
8567 Safefree(trie->charmap);
8568 if (trie->widecharmap)
8569 SvREFCNT_dec((SV*)trie->widecharmap);
8570 Safefree(trie->states);
8571 Safefree(trie->trans);
8572 if (trie->bitmap)
8573 Safefree(trie->bitmap);
8574 if (trie->wordlen)
8575 Safefree(trie->wordlen);
786e8c11
YO
8576 if (trie->jump)
8577 Safefree(trie->jump);
8578 if (trie->nextword)
8579 Safefree(trie->nextword);
a3621e74 8580#ifdef DEBUGGING
5c9f2f80
YO
8581 if (trie->words)
8582 SvREFCNT_dec((SV*)trie->words);
8583 if (trie->revcharmap)
8584 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8585#endif
f8fc2ecf 8586 Safefree(ri->data->data[n]); /* do this last!!!! */
a3621e74 8587 }
07be1b83
YO
8588 }
8589 break;
c277df42 8590 default:
f8fc2ecf 8591 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8592 }
8593 }
f8fc2ecf
YO
8594 Safefree(ri->data->what);
8595 Safefree(ri->data);
a0d0e21e
LW
8596 }
8597 Safefree(r->startp);
8598 Safefree(r->endp);
f8fc2ecf
YO
8599 if (ri->swap) {
8600 Safefree(ri->swap->startp);
8601 Safefree(ri->swap->endp);
8602 Safefree(ri->swap);
c74340f9 8603 }
f8fc2ecf 8604 Safefree(ri);
a0d0e21e 8605 Safefree(r);
a687059c 8606}
c277df42 8607
84da74a7
YO
8608#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8609#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8610#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8611#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8612
8613/*
8614 regdupe - duplicate a regexp.
8615
8616 This routine is called by sv.c's re_dup and is expected to clone a
8617 given regexp structure. It is a no-op when not under USE_ITHREADS.
8618 (Originally this *was* re_dup() for change history see sv.c)
8619
8620 See pregfree() above if you change anything here.
8621*/
a3c0e9ca 8622#if defined(USE_ITHREADS)
84da74a7
YO
8623regexp *
8624Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8625{
84da74a7 8626 dVAR;
f8fc2ecf
YO
8627 regexp *ret;
8628 regexp_internal *reti;
84da74a7
YO
8629 int i, len, npar;
8630 struct reg_substr_datum *s;
f8fc2ecf
YO
8631 RXi_GET_DECL(r,ri);
8632
84da74a7
YO
8633 if (!r)
8634 return (REGEXP *)NULL;
8635
8636 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8637 return ret;
8638
f8fc2ecf 8639 len = ri->offsets[0];
84da74a7
YO
8640 npar = r->nparens+1;
8641
f8fc2ecf
YO
8642 Newxz(ret, 1, regexp);
8643 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8644 RXi_SET(ret,reti);
8645 Copy(ri->program, reti->program, len+1, regnode);
84da74a7
YO
8646
8647 Newx(ret->startp, npar, I32);
8648 Copy(r->startp, ret->startp, npar, I32);
8649 Newx(ret->endp, npar, I32);
8650 Copy(r->startp, ret->startp, npar, I32);
f8fc2ecf
YO
8651 if(ri->swap) {
8652 Newx(reti->swap, 1, regexp_paren_ofs);
c74340f9 8653 /* no need to copy these */
f8fc2ecf
YO
8654 Newx(reti->swap->startp, npar, I32);
8655 Newx(reti->swap->endp, npar, I32);
c74340f9 8656 } else {
f8fc2ecf 8657 reti->swap = NULL;
c74340f9 8658 }
84da74a7
YO
8659
8660 Newx(ret->substrs, 1, struct reg_substr_data);
8661 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8662 s->min_offset = r->substrs->data[i].min_offset;
8663 s->max_offset = r->substrs->data[i].max_offset;
8664 s->end_shift = r->substrs->data[i].end_shift;
8665 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8666 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8667 }
8668
f8fc2ecf
YO
8669 reti->regstclass = NULL;
8670 if (ri->data) {
84da74a7 8671 struct reg_data *d;
f8fc2ecf 8672 const int count = ri->data->count;
84da74a7
YO
8673 int i;
8674
8675 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8676 char, struct reg_data);
8677 Newx(d->what, count, U8);
8678
8679 d->count = count;
8680 for (i = 0; i < count; i++) {
f8fc2ecf 8681 d->what[i] = ri->data->what[i];
84da74a7 8682 switch (d->what[i]) {
bbe252da 8683 /* legal options are one of: sSfpontT
84da74a7
YO
8684 see also regcomp.h and pregfree() */
8685 case 's':
81714fb9 8686 case 'S':
f8fc2ecf 8687 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7
YO
8688 break;
8689 case 'p':
f8fc2ecf 8690 d->data[i] = av_dup_inc((AV *)ri->data->data[i], param);
84da74a7
YO
8691 break;
8692 case 'f':
8693 /* This is cheating. */
8694 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8695 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8696 struct regnode_charclass_class);
f8fc2ecf 8697 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8698 break;
8699 case 'o':
bbe252da
YO
8700 /* Compiled op trees are readonly and in shared memory,
8701 and can thus be shared without duplication. */
84da74a7 8702 OP_REFCNT_LOCK;
f8fc2ecf 8703 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8704 OP_REFCNT_UNLOCK;
8705 break;
8706 case 'n':
f8fc2ecf 8707 d->data[i] = ri->data->data[i];
84da74a7
YO
8708 break;
8709 case 't':
f8fc2ecf 8710 d->data[i] = ri->data->data[i];
84da74a7
YO
8711 OP_REFCNT_LOCK;
8712 ((reg_trie_data*)d->data[i])->refcount++;
8713 OP_REFCNT_UNLOCK;
8714 break;
8715 case 'T':
f8fc2ecf 8716 d->data[i] = ri->data->data[i];
84da74a7
YO
8717 OP_REFCNT_LOCK;
8718 ((reg_ac_data*)d->data[i])->refcount++;
8719 OP_REFCNT_UNLOCK;
8720 /* Trie stclasses are readonly and can thus be shared
8721 * without duplication. We free the stclass in pregfree
8722 * when the corresponding reg_ac_data struct is freed.
8723 */
f8fc2ecf 8724 reti->regstclass= ri->regstclass;
84da74a7
YO
8725 break;
8726 default:
f8fc2ecf 8727 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8728 }
8729 }
8730
f8fc2ecf 8731 reti->data = d;
84da74a7
YO
8732 }
8733 else
f8fc2ecf 8734 reti->data = NULL;
84da74a7 8735
f8fc2ecf
YO
8736 Newx(reti->offsets, 2*len+1, U32);
8737 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
84da74a7
YO
8738
8739 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8740 ret->refcnt = r->refcnt;
8741 ret->minlen = r->minlen;
de8c5301 8742 ret->minlenret = r->minlenret;
84da74a7
YO
8743 ret->prelen = r->prelen;
8744 ret->nparens = r->nparens;
8745 ret->lastparen = r->lastparen;
8746 ret->lastcloseparen = r->lastcloseparen;
bbe252da
YO
8747 ret->intflags = r->intflags;
8748 ret->extflags = r->extflags;
84da74a7
YO
8749
8750 ret->sublen = r->sublen;
8751
f9f4320a 8752 ret->engine = r->engine;
81714fb9
YO
8753
8754 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8755
84da74a7
YO
8756 if (RX_MATCH_COPIED(ret))
8757 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8758 else
8759 ret->subbeg = NULL;
8760#ifdef PERL_OLD_COPY_ON_WRITE
8761 ret->saved_copy = NULL;
8762#endif
8763
8764 ptr_table_store(PL_ptr_table, r, ret);
8765 return ret;
84da74a7 8766}
a3c0e9ca 8767#endif
84da74a7 8768
de8c5301
YO
8769/*
8770 reg_stringify()
8771
8772 converts a regexp embedded in a MAGIC struct to its stringified form,
8773 caching the converted form in the struct and returns the cached
8774 string.
8775
8776 If lp is nonnull then it is used to return the length of the
8777 resulting string
8778
8779 If flags is nonnull and the returned string contains UTF8 then
8780 (flags & 1) will be true.
8781
8782 If haseval is nonnull then it is used to return whether the pattern
8783 contains evals.
8784
8785 Normally called via macro:
8786
8787 CALLREG_STRINGIFY(mg,0,0);
8788
8789 And internally with
8790
8791 CALLREG_AS_STR(mg,lp,flags,haseval)
8792
8793 See sv_2pv_flags() in sv.c for an example of internal usage.
8794
8795 */
8796
8797char *
8798Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8799 dVAR;
8800 const regexp * const re = (regexp *)mg->mg_obj;
f8fc2ecf
YO
8801 RXi_GET_DECL(re,ri);
8802
de8c5301
YO
8803 if (!mg->mg_ptr) {
8804 const char *fptr = "msix";
8805 char reflags[6];
8806 char ch;
8807 int left = 0;
8808 int right = 4;
8809 bool need_newline = 0;
bbe252da 8810 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8811
8812 while((ch = *fptr++)) {
8813 if(reganch & 1) {
8814 reflags[left++] = ch;
8815 }
8816 else {
8817 reflags[right--] = ch;
8818 }
8819 reganch >>= 1;
8820 }
8821 if(left != 4) {
8822 reflags[left] = '-';
8823 left = 5;
8824 }
8825
8826 mg->mg_len = re->prelen + 4 + left;
8827 /*
8828 * If /x was used, we have to worry about a regex ending with a
8829 * comment later being embedded within another regex. If so, we don't
8830 * want this regex's "commentization" to leak out to the right part of
8831 * the enclosing regex, we must cap it with a newline.
8832 *
8833 * So, if /x was used, we scan backwards from the end of the regex. If
8834 * we find a '#' before we find a newline, we need to add a newline
8835 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8836 * we don't need to add anything. -jfriedl
8837 */
bbe252da 8838 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8839 const char *endptr = re->precomp + re->prelen;
8840 while (endptr >= re->precomp) {
8841 const char c = *(endptr--);
8842 if (c == '\n')
8843 break; /* don't need another */
8844 if (c == '#') {
8845 /* we end while in a comment, so we need a newline */
8846 mg->mg_len++; /* save space for it */
8847 need_newline = 1; /* note to add it */
8848 break;
8849 }
8850 }
8851 }
8852
8853 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8854 mg->mg_ptr[0] = '(';
8855 mg->mg_ptr[1] = '?';
8856 Copy(reflags, mg->mg_ptr+2, left, char);
8857 *(mg->mg_ptr+left+2) = ':';
8858 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8859 if (need_newline)
8860 mg->mg_ptr[mg->mg_len - 2] = '\n';
8861 mg->mg_ptr[mg->mg_len - 1] = ')';
8862 mg->mg_ptr[mg->mg_len] = 0;
8863 }
8864 if (haseval)
f8fc2ecf 8865 *haseval = ri->program[0].next_off;
de8c5301 8866 if (flags)
bbe252da 8867 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8868
8869 if (lp)
8870 *lp = mg->mg_len;
8871 return mg->mg_ptr;
8872}
8873
8874
76234dfb 8875#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8876/*
8877 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8878 */
8879regnode *
864dbfa3 8880Perl_regnext(pTHX_ register regnode *p)
c277df42 8881{
97aff369 8882 dVAR;
c277df42
IZ
8883 register I32 offset;
8884
f8fc2ecf 8885 if (!p)
c277df42
IZ
8886 return(NULL);
8887
8888 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8889 if (offset == 0)
8890 return(NULL);
8891
c277df42 8892 return(p+offset);
c277df42 8893}
76234dfb 8894#endif
c277df42 8895
01f988be 8896STATIC void
cea2e8a9 8897S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8898{
8899 va_list args;
8900 STRLEN l1 = strlen(pat1);
8901 STRLEN l2 = strlen(pat2);
8902 char buf[512];
06bf62c7 8903 SV *msv;
73d840c0 8904 const char *message;
c277df42
IZ
8905
8906 if (l1 > 510)
8907 l1 = 510;
8908 if (l1 + l2 > 510)
8909 l2 = 510 - l1;
8910 Copy(pat1, buf, l1 , char);
8911 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8912 buf[l1 + l2] = '\n';
8913 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8914#ifdef I_STDARG
8915 /* ANSI variant takes additional second argument */
c277df42 8916 va_start(args, pat2);
8736538c
AS
8917#else
8918 va_start(args);
8919#endif
5a844595 8920 msv = vmess(buf, &args);
c277df42 8921 va_end(args);
cfd0369c 8922 message = SvPV_const(msv,l1);
c277df42
IZ
8923 if (l1 > 512)
8924 l1 = 512;
8925 Copy(message, buf, l1 , char);
197cf9b9 8926 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8927 Perl_croak(aTHX_ "%s", buf);
c277df42 8928}
a0ed51b3
LW
8929
8930/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8931
76234dfb 8932#ifndef PERL_IN_XSUB_RE
a0ed51b3 8933void
864dbfa3 8934Perl_save_re_context(pTHX)
b81d288d 8935{
97aff369 8936 dVAR;
1ade1aa1
NC
8937
8938 struct re_save_state *state;
8939
8940 SAVEVPTR(PL_curcop);
8941 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8942
8943 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8944 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8945 SSPUSHINT(SAVEt_RE_STATE);
8946
46ab3289 8947 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8948
a0ed51b3 8949 PL_reg_start_tmp = 0;
a0ed51b3 8950 PL_reg_start_tmpl = 0;
c445ea15 8951 PL_reg_oldsaved = NULL;
a5db57d6 8952 PL_reg_oldsavedlen = 0;
a5db57d6 8953 PL_reg_maxiter = 0;
a5db57d6 8954 PL_reg_leftiter = 0;
c445ea15 8955 PL_reg_poscache = NULL;
a5db57d6 8956 PL_reg_poscache_size = 0;
1ade1aa1
NC
8957#ifdef PERL_OLD_COPY_ON_WRITE
8958 PL_nrs = NULL;
8959#endif
ada6e8a9 8960
c445ea15
AL
8961 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8962 if (PL_curpm) {
8963 const REGEXP * const rx = PM_GETRE(PL_curpm);
8964 if (rx) {
1df70142 8965 U32 i;
ada6e8a9 8966 for (i = 1; i <= rx->nparens; i++) {
1df70142 8967 char digits[TYPE_CHARS(long)];
d9fad198 8968 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8969 GV *const *const gvp
8970 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8971
b37c2d43
AL
8972 if (gvp) {
8973 GV * const gv = *gvp;
8974 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8975 save_scalar(gv);
49f27e4b 8976 }
ada6e8a9
AMS
8977 }
8978 }
8979 }
a0ed51b3 8980}
76234dfb 8981#endif
51371543 8982
51371543 8983static void
acfe0abc 8984clear_re(pTHX_ void *r)
51371543 8985{
97aff369 8986 dVAR;
51371543
GS
8987 ReREFCNT_dec((regexp *)r);
8988}
ffbc6a93 8989
a28509cc
AL
8990#ifdef DEBUGGING
8991
8992STATIC void
8993S_put_byte(pTHX_ SV *sv, int c)
8994{
8995 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8996 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8997 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8998 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8999 else
9000 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9001}
9002
786e8c11 9003
3dab1dad
YO
9004#define CLEAR_OPTSTART \
9005 if (optstart) STMT_START { \
70685ca0 9006 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9007 optstart=NULL; \
9008 } STMT_END
9009
786e8c11 9010#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9011
b5a2f8d8
NC
9012STATIC const regnode *
9013S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9014 const regnode *last, const regnode *plast,
9015 SV* sv, I32 indent, U32 depth)
a28509cc 9016{
97aff369 9017 dVAR;
786e8c11 9018 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9019 register const regnode *next;
3dab1dad 9020 const regnode *optstart= NULL;
f8fc2ecf 9021 RXi_GET_DECL(r,ri);
3dab1dad 9022 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9023
786e8c11
YO
9024#ifdef DEBUG_DUMPUNTIL
9025 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9026 last ? last-start : 0,plast ? plast-start : 0);
9027#endif
9028
9029 if (plast && plast < last)
9030 last= plast;
9031
9032 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9033 /* While that wasn't END last time... */
9034
9035 NODE_ALIGN(node);
9036 op = OP(node);
de734bd5 9037 if (op == CLOSE || op == WHILEM)
786e8c11 9038 indent--;
b5a2f8d8 9039 next = regnext((regnode *)node);
07be1b83 9040
a28509cc 9041 /* Where, what. */
8e11feef 9042 if (OP(node) == OPTIMIZED) {
e68ec53f 9043 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9044 optstart = node;
3dab1dad 9045 else
8e11feef 9046 goto after_print;
3dab1dad
YO
9047 } else
9048 CLEAR_OPTSTART;
07be1b83 9049
32fc9b6a 9050 regprop(r, sv, node);
a28509cc 9051 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9052 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9053
9054 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9055 if (next == NULL) /* Next ptr. */
9056 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9057 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9058 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9059 else
9060 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9061
1de06328 9062 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9063 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9064 }
9065
a28509cc
AL
9066 after_print:
9067 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9068 assert(next);
9069 {
9070 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9071 ? regnext((regnode *)next)
9072 : next);
be8e71aa
YO
9073 if (last && nnode > last)
9074 nnode = last;
786e8c11 9075 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9076 }
a28509cc
AL
9077 }
9078 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9079 assert(next);
786e8c11 9080 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9081 }
9082 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9083 const regnode *this_trie = node;
1de06328 9084 const char op = OP(node);
a28509cc 9085 const I32 n = ARG(node);
1de06328 9086 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9087 (reg_ac_data *)ri->data->data[n] :
1de06328
YO
9088 NULL;
9089 const reg_trie_data * const trie = op<AHOCORASICK ?
f8fc2ecf 9090 (reg_trie_data*)ri->data->data[n] :
1de06328 9091 ac->trie;
786e8c11 9092 const regnode *nextbranch= NULL;
a28509cc 9093 I32 word_idx;
1de06328 9094 sv_setpvn(sv, "", 0);
786e8c11 9095 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 9096 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
9097
9098 PerlIO_printf(Perl_debug_log, "%*s%s ",
9099 (int)(2*(indent+3)), "",
9100 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9101 PL_colors[0], PL_colors[1],
9102 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9103 PERL_PV_PRETTY_ELIPSES |
7f69552c 9104 PERL_PV_PRETTY_LTGT
786e8c11
YO
9105 )
9106 : "???"
9107 );
9108 if (trie->jump) {
40d049e4 9109 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9110 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9111 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9112 if (dist) {
9113 if (!nextbranch)
24b23f37 9114 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9115 DUMPUNTIL(this_trie + dist, nextbranch);
9116 }
786e8c11
YO
9117 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9118 nextbranch= regnext((regnode *)nextbranch);
9119 } else {
9120 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9121 }
786e8c11
YO
9122 }
9123 if (last && next > last)
9124 node= last;
9125 else
9126 node= next;
a28509cc 9127 }
786e8c11
YO
9128 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9129 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9130 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9131 }
9132 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9133 assert(next);
786e8c11 9134 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9135 }
9136 else if ( op == PLUS || op == STAR) {
786e8c11 9137 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9138 }
9139 else if (op == ANYOF) {
9140 /* arglen 1 + class block */
9141 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9142 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9143 node = NEXTOPER(node);
9144 }
9145 else if (PL_regkind[(U8)op] == EXACT) {
9146 /* Literal string, where present. */
9147 node += NODE_SZ_STR(node) - 1;
9148 node = NEXTOPER(node);
9149 }
9150 else {
9151 node = NEXTOPER(node);
9152 node += regarglen[(U8)op];
9153 }
9154 if (op == CURLYX || op == OPEN)
786e8c11 9155 indent++;
a28509cc 9156 }
3dab1dad 9157 CLEAR_OPTSTART;
786e8c11 9158#ifdef DEBUG_DUMPUNTIL
70685ca0 9159 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9160#endif
1de06328 9161 return node;
a28509cc
AL
9162}
9163
9164#endif /* DEBUGGING */
9165
241d1a3b
NC
9166/*
9167 * Local variables:
9168 * c-indentation-style: bsd
9169 * c-basic-offset: 4
9170 * indent-tabs-mode: t
9171 * End:
9172 *
37442d52
RGS
9173 * ex: set ts=8 sts=4 sw=4 noet:
9174 */