This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add missing file to MANIFEST
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
2b8b4781
NC
801 dump_trie(trie,widecharmap,revcharmap)
802 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
3dab1dad
YO
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
818*/
819
820STATIC void
2b8b4781
NC
821S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
822 AV *revcharmap, U32 depth)
3dab1dad
YO
823{
824 U32 state;
ab3bbdeb 825 SV *sv=sv_newmortal();
55eed653 826 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 835 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
3dab1dad
YO
897 Dumps a fully constructed but uncompressed trie in list form.
898 List tries normally only are used for construction when the number of
899 possible chars (trie->uniquecharcount) is very high.
900 Used for debugging make_trie().
901*/
902STATIC void
55eed653 903S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
904 HV *widecharmap, AV *revcharmap, U32 next_alloc,
905 U32 depth)
3dab1dad
YO
906{
907 U32 state;
ab3bbdeb 908 SV *sv=sv_newmortal();
55eed653 909 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
910 GET_RE_DEBUG_FLAGS_DECL;
911 /* print out the table precompression. */
ab3bbdeb
YO
912 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
913 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
914 "------:-----+-----------------\n" );
3dab1dad
YO
915
916 for( state=1 ; state < next_alloc ; state ++ ) {
917 U16 charid;
918
ab3bbdeb 919 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
920 (int)depth * 2 + 2,"", (UV)state );
921 if ( ! trie->states[ state ].wordnum ) {
922 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 } else {
924 PerlIO_printf( Perl_debug_log, "W%4x| ",
925 trie->states[ state ].wordnum
926 );
927 }
928 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 929 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
930 if ( tmp ) {
931 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 colwidth,
ddc5bc0f 933 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
934 PL_colors[0], PL_colors[1],
935 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
936 PERL_PV_ESCAPE_FIRSTCHAR
937 ) ,
1e2e3d02
YO
938 TRIE_LIST_ITEM(state,charid).forid,
939 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 );
941 if (!(charid % 10))
664e119d
RGS
942 PerlIO_printf(Perl_debug_log, "\n%*s| ",
943 (int)((depth * 2) + 14), "");
1e2e3d02 944 }
ab3bbdeb
YO
945 }
946 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
947 }
948}
949
950/*
3dab1dad
YO
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
55eed653 957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
958 HV *widecharmap, AV *revcharmap, U32 next_alloc,
959 U32 depth)
3dab1dad
YO
960{
961 U32 state;
962 U16 charid;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
965 GET_RE_DEBUG_FLAGS_DECL;
966
967 /*
968 print out the table precompression so that we can do a visual check
969 that they are identical.
970 */
971
972 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
973
974 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 975 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 976 if ( tmp ) {
ab3bbdeb
YO
977 PerlIO_printf( Perl_debug_log, "%*s",
978 colwidth,
ddc5bc0f 979 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
980 PL_colors[0], PL_colors[1],
981 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
982 PERL_PV_ESCAPE_FIRSTCHAR
983 )
984 );
3dab1dad
YO
985 }
986 }
987
988 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
989
990 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 991 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
992 }
993
994 PerlIO_printf( Perl_debug_log, "\n" );
995
996 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
997
998 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
999 (int)depth * 2 + 2,"",
1000 (UV)TRIE_NODENUM( state ) );
1001
1002 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1003 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1004 if (v)
1005 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1006 else
1007 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1008 }
1009 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1011 } else {
1012 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1013 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1014 }
1015 }
07be1b83 1016}
3dab1dad
YO
1017
1018#endif
1019
786e8c11
YO
1020/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1021 startbranch: the first branch in the whole branch sequence
1022 first : start branch of sequence of branch-exact nodes.
1023 May be the same as startbranch
1024 last : Thing following the last branch.
1025 May be the same as tail.
1026 tail : item following the branch sequence
1027 count : words in the sequence
1028 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1029 depth : indent depth
3dab1dad 1030
786e8c11 1031Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1032
786e8c11
YO
1033A trie is an N'ary tree where the branches are determined by digital
1034decomposition of the key. IE, at the root node you look up the 1st character and
1035follow that branch repeat until you find the end of the branches. Nodes can be
1036marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1037
786e8c11 1038 /he|she|his|hers/
72f13be8 1039
786e8c11
YO
1040would convert into the following structure. Numbers represent states, letters
1041following numbers represent valid transitions on the letter from that state, if
1042the number is in square brackets it represents an accepting state, otherwise it
1043will be in parenthesis.
07be1b83 1044
786e8c11
YO
1045 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1046 | |
1047 | (2)
1048 | |
1049 (1) +-i->(6)-+-s->[7]
1050 |
1051 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1052
786e8c11
YO
1053 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1054
1055This shows that when matching against the string 'hers' we will begin at state 1
1056read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1057then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1058is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1059single traverse. We store a mapping from accepting to state to which word was
1060matched, and then when we have multiple possibilities we try to complete the
1061rest of the regex in the order in which they occured in the alternation.
1062
1063The only prior NFA like behaviour that would be changed by the TRIE support is
1064the silent ignoring of duplicate alternations which are of the form:
1065
1066 / (DUPE|DUPE) X? (?{ ... }) Y /x
1067
1068Thus EVAL blocks follwing a trie may be called a different number of times with
1069and without the optimisation. With the optimisations dupes will be silently
1070ignored. This inconsistant behaviour of EVAL type nodes is well established as
1071the following demonstrates:
1072
1073 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1074
1075which prints out 'word' three times, but
1076
1077 'words'=~/(word|word|word)(?{ print $1 })S/
1078
1079which doesnt print it out at all. This is due to other optimisations kicking in.
1080
1081Example of what happens on a structural level:
1082
1083The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1084
1085 1: CURLYM[1] {1,32767}(18)
1086 5: BRANCH(8)
1087 6: EXACT <ac>(16)
1088 8: BRANCH(11)
1089 9: EXACT <ad>(16)
1090 11: BRANCH(14)
1091 12: EXACT <ab>(16)
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096This would be optimizable with startbranch=5, first=5, last=16, tail=16
1097and should turn into:
1098
1099 1: CURLYM[1] {1,32767}(18)
1100 5: TRIE(16)
1101 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1102 <ac>
1103 <ad>
1104 <ab>
1105 16: SUCCEED(0)
1106 17: NOTHING(18)
1107 18: END(0)
1108
1109Cases where tail != last would be like /(?foo|bar)baz/:
1110
1111 1: BRANCH(4)
1112 2: EXACT <foo>(8)
1113 4: BRANCH(7)
1114 5: EXACT <bar>(8)
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119which would be optimizable with startbranch=1, first=1, last=7, tail=8
1120and would end up looking like:
1121
1122 1: TRIE(8)
1123 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1124 <foo>
1125 <bar>
1126 7: TAIL(8)
1127 8: EXACT <baz>(10)
1128 10: END(0)
1129
1130 d = uvuni_to_utf8_flags(d, uv, 0);
1131
1132is the recommended Unicode-aware way of saying
1133
1134 *(d++) = uv;
1135*/
1136
1e2e3d02 1137#define TRIE_STORE_REVCHAR \
786e8c11 1138 STMT_START { \
1e2e3d02 1139 SV *tmp = newSVpvs(""); \
786e8c11 1140 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1141 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1142 av_push( revcharmap, tmp ); \
786e8c11
YO
1143 } STMT_END
1144
1145#define TRIE_READ_CHAR STMT_START { \
1146 wordlen++; \
1147 if ( UTF ) { \
1148 if ( folder ) { \
1149 if ( foldlen > 0 ) { \
1150 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1151 foldlen -= len; \
1152 scan += len; \
1153 len = 0; \
1154 } else { \
1155 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1156 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1157 foldlen -= UNISKIP( uvc ); \
1158 scan = foldbuf + UNISKIP( uvc ); \
1159 } \
1160 } else { \
1161 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1162 } \
1163 } else { \
1164 uvc = (U32)*uc; \
1165 len = 1; \
1166 } \
1167} STMT_END
1168
1169
1170
1171#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1172 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1173 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1174 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1175 } \
1176 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1177 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1178 TRIE_LIST_CUR( state )++; \
1179} STMT_END
07be1b83 1180
786e8c11
YO
1181#define TRIE_LIST_NEW(state) STMT_START { \
1182 Newxz( trie->states[ state ].trans.list, \
1183 4, reg_trie_trans_le ); \
1184 TRIE_LIST_CUR( state ) = 1; \
1185 TRIE_LIST_LEN( state ) = 4; \
1186} STMT_END
07be1b83 1187
786e8c11
YO
1188#define TRIE_HANDLE_WORD(state) STMT_START { \
1189 U16 dupe= trie->states[ state ].wordnum; \
1190 regnode * const noper_next = regnext( noper ); \
1191 \
1192 if (trie->wordlen) \
1193 trie->wordlen[ curword ] = wordlen; \
1194 DEBUG_r({ \
1195 /* store the word for dumping */ \
1196 SV* tmp; \
1197 if (OP(noper) != NOTHING) \
1198 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1199 else \
1200 tmp = newSVpvn( "", 0 ); \
1201 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1202 av_push( trie_words, tmp ); \
786e8c11
YO
1203 }); \
1204 \
1205 curword++; \
1206 \
1207 if ( noper_next < tail ) { \
1208 if (!trie->jump) \
446bd890 1209 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1210 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1211 if (!jumper) \
1212 jumper = noper_next; \
1213 if (!nextbranch) \
1214 nextbranch= regnext(cur); \
1215 } \
1216 \
1217 if ( dupe ) { \
1218 /* So it's a dupe. This means we need to maintain a */\
1219 /* linked-list from the first to the next. */\
1220 /* we only allocate the nextword buffer when there */\
1221 /* a dupe, so first time we have to do the allocation */\
1222 if (!trie->nextword) \
446bd890
NC
1223 trie->nextword = \
1224 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1225 while ( trie->nextword[dupe] ) \
1226 dupe= trie->nextword[dupe]; \
1227 trie->nextword[dupe]= curword; \
1228 } else { \
1229 /* we haven't inserted this word yet. */ \
1230 trie->states[ state ].wordnum = curword; \
1231 } \
1232} STMT_END
07be1b83 1233
3dab1dad 1234
786e8c11
YO
1235#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1236 ( ( base + charid >= ucharcount \
1237 && base + charid < ubound \
1238 && state == trie->trans[ base - ucharcount + charid ].check \
1239 && trie->trans[ base - ucharcount + charid ].next ) \
1240 ? trie->trans[ base - ucharcount + charid ].next \
1241 : ( state==1 ? special : 0 ) \
1242 )
3dab1dad 1243
786e8c11
YO
1244#define MADE_TRIE 1
1245#define MADE_JUMP_TRIE 2
1246#define MADE_EXACT_TRIE 4
3dab1dad 1247
a3621e74 1248STATIC I32
786e8c11 1249S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1250{
27da23d5 1251 dVAR;
a3621e74
YO
1252 /* first pass, loop through and scan words */
1253 reg_trie_data *trie;
55eed653 1254 HV *widecharmap = NULL;
2b8b4781 1255 AV *revcharmap = newAV();
a3621e74 1256 regnode *cur;
9f7f3913 1257 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1258 STRLEN len = 0;
1259 UV uvc = 0;
1260 U16 curword = 0;
1261 U32 next_alloc = 0;
786e8c11
YO
1262 regnode *jumper = NULL;
1263 regnode *nextbranch = NULL;
7f69552c 1264 regnode *convert = NULL;
a3621e74 1265 /* we just use folder as a flag in utf8 */
e1ec3a88 1266 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1267 ? PL_fold
1268 : ( flags == EXACTFL
1269 ? PL_fold_locale
1270 : NULL
1271 )
1272 );
1273
2b8b4781
NC
1274#ifdef DEBUGGING
1275 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1276 AV *trie_words = NULL;
1277 /* along with revcharmap, this only used during construction but both are
1278 * useful during debugging so we store them in the struct when debugging.
8e11feef 1279 */
2b8b4781
NC
1280#else
1281 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1282 STRLEN trie_charcount=0;
3dab1dad 1283#endif
2b8b4781 1284 SV *re_trie_maxbuff;
a3621e74 1285 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1286#ifndef DEBUGGING
1287 PERL_UNUSED_ARG(depth);
1288#endif
a3621e74 1289
446bd890 1290 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1291 trie->refcount = 1;
3dab1dad 1292 trie->startstate = 1;
786e8c11 1293 trie->wordcount = word_count;
f8fc2ecf 1294 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1295 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1296 if (!(UTF && folder))
446bd890 1297 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1298 DEBUG_r({
2b8b4781 1299 trie_words = newAV();
a3621e74 1300 });
a3621e74 1301
0111c4fd 1302 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1303 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1304 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1305 }
3dab1dad
YO
1306 DEBUG_OPTIMISE_r({
1307 PerlIO_printf( Perl_debug_log,
786e8c11 1308 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1309 (int)depth * 2 + 2, "",
1310 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1311 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1312 (int)depth);
3dab1dad 1313 });
7f69552c
YO
1314
1315 /* Find the node we are going to overwrite */
1316 if ( first == startbranch && OP( last ) != BRANCH ) {
1317 /* whole branch chain */
1318 convert = first;
1319 } else {
1320 /* branch sub-chain */
1321 convert = NEXTOPER( first );
1322 }
1323
a3621e74
YO
1324 /* -- First loop and Setup --
1325
1326 We first traverse the branches and scan each word to determine if it
1327 contains widechars, and how many unique chars there are, this is
1328 important as we have to build a table with at least as many columns as we
1329 have unique chars.
1330
1331 We use an array of integers to represent the character codes 0..255
1332 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1333 native representation of the character value as the key and IV's for the
1334 coded index.
1335
1336 *TODO* If we keep track of how many times each character is used we can
1337 remap the columns so that the table compression later on is more
1338 efficient in terms of memory by ensuring most common value is in the
1339 middle and the least common are on the outside. IMO this would be better
1340 than a most to least common mapping as theres a decent chance the most
1341 common letter will share a node with the least common, meaning the node
1342 will not be compressable. With a middle is most common approach the worst
1343 case is when we have the least common nodes twice.
1344
1345 */
1346
a3621e74 1347 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1348 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1349 const U8 *uc = (U8*)STRING( noper );
a28509cc 1350 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1351 STRLEN foldlen = 0;
1352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1353 const U8 *scan = (U8*)NULL;
07be1b83 1354 U32 wordlen = 0; /* required init */
3dab1dad 1355 STRLEN chars=0;
a3621e74 1356
3dab1dad
YO
1357 if (OP(noper) == NOTHING) {
1358 trie->minlen= 0;
1359 continue;
1360 }
1361 if (trie->bitmap) {
1362 TRIE_BITMAP_SET(trie,*uc);
1363 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1364 }
a3621e74 1365 for ( ; uc < e ; uc += len ) {
3dab1dad 1366 TRIE_CHARCOUNT(trie)++;
a3621e74 1367 TRIE_READ_CHAR;
3dab1dad 1368 chars++;
a3621e74
YO
1369 if ( uvc < 256 ) {
1370 if ( !trie->charmap[ uvc ] ) {
1371 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1372 if ( folder )
1373 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1374 TRIE_STORE_REVCHAR;
a3621e74
YO
1375 }
1376 } else {
1377 SV** svpp;
55eed653
NC
1378 if ( !widecharmap )
1379 widecharmap = newHV();
a3621e74 1380
55eed653 1381 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1382
1383 if ( !svpp )
e4584336 1384 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1385
1386 if ( !SvTRUE( *svpp ) ) {
1387 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1388 TRIE_STORE_REVCHAR;
a3621e74
YO
1389 }
1390 }
1391 }
3dab1dad
YO
1392 if( cur == first ) {
1393 trie->minlen=chars;
1394 trie->maxlen=chars;
1395 } else if (chars < trie->minlen) {
1396 trie->minlen=chars;
1397 } else if (chars > trie->maxlen) {
1398 trie->maxlen=chars;
1399 }
1400
a3621e74
YO
1401 } /* end first pass */
1402 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1403 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1404 (int)depth * 2 + 2,"",
55eed653 1405 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1406 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1407 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1408 );
446bd890 1409 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1410
1411 /*
1412 We now know what we are dealing with in terms of unique chars and
1413 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1414 representation using a flat table will take. If it's over a reasonable
1415 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1416 conservative but potentially much slower representation using an array
1417 of lists.
1418
1419 At the end we convert both representations into the same compressed
1420 form that will be used in regexec.c for matching with. The latter
1421 is a form that cannot be used to construct with but has memory
1422 properties similar to the list form and access properties similar
1423 to the table form making it both suitable for fast searches and
1424 small enough that its feasable to store for the duration of a program.
1425
1426 See the comment in the code where the compressed table is produced
1427 inplace from the flat tabe representation for an explanation of how
1428 the compression works.
1429
1430 */
1431
1432
3dab1dad 1433 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1434 /*
1435 Second Pass -- Array Of Lists Representation
1436
1437 Each state will be represented by a list of charid:state records
1438 (reg_trie_trans_le) the first such element holds the CUR and LEN
1439 points of the allocated array. (See defines above).
1440
1441 We build the initial structure using the lists, and then convert
1442 it into the compressed table form which allows faster lookups
1443 (but cant be modified once converted).
a3621e74
YO
1444 */
1445
a3621e74
YO
1446 STRLEN transcount = 1;
1447
1e2e3d02
YO
1448 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1449 "%*sCompiling trie using list compiler\n",
1450 (int)depth * 2 + 2, ""));
446bd890
NC
1451
1452 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1453 sizeof(reg_trie_state) );
a3621e74
YO
1454 TRIE_LIST_NEW(1);
1455 next_alloc = 2;
1456
1457 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1458
c445ea15
AL
1459 regnode * const noper = NEXTOPER( cur );
1460 U8 *uc = (U8*)STRING( noper );
1461 const U8 * const e = uc + STR_LEN( noper );
1462 U32 state = 1; /* required init */
1463 U16 charid = 0; /* sanity init */
1464 U8 *scan = (U8*)NULL; /* sanity init */
1465 STRLEN foldlen = 0; /* required init */
07be1b83 1466 U32 wordlen = 0; /* required init */
c445ea15
AL
1467 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1468
3dab1dad 1469 if (OP(noper) != NOTHING) {
786e8c11 1470 for ( ; uc < e ; uc += len ) {
c445ea15 1471
786e8c11 1472 TRIE_READ_CHAR;
c445ea15 1473
786e8c11
YO
1474 if ( uvc < 256 ) {
1475 charid = trie->charmap[ uvc ];
c445ea15 1476 } else {
55eed653 1477 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1478 if ( !svpp ) {
1479 charid = 0;
1480 } else {
1481 charid=(U16)SvIV( *svpp );
1482 }
c445ea15 1483 }
786e8c11
YO
1484 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1485 if ( charid ) {
a3621e74 1486
786e8c11
YO
1487 U16 check;
1488 U32 newstate = 0;
a3621e74 1489
786e8c11
YO
1490 charid--;
1491 if ( !trie->states[ state ].trans.list ) {
1492 TRIE_LIST_NEW( state );
c445ea15 1493 }
786e8c11
YO
1494 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1495 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1496 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1497 break;
1498 }
1499 }
1500 if ( ! newstate ) {
1501 newstate = next_alloc++;
1502 TRIE_LIST_PUSH( state, charid, newstate );
1503 transcount++;
1504 }
1505 state = newstate;
1506 } else {
1507 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1508 }
a28509cc 1509 }
c445ea15 1510 }
3dab1dad 1511 TRIE_HANDLE_WORD(state);
a3621e74
YO
1512
1513 } /* end second pass */
1514
1e2e3d02
YO
1515 /* next alloc is the NEXT state to be allocated */
1516 trie->statecount = next_alloc;
446bd890
NC
1517 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1518 * sizeof(reg_trie_state) );
a3621e74 1519
3dab1dad 1520 /* and now dump it out before we compress it */
2b8b4781
NC
1521 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1522 revcharmap, next_alloc,
1523 depth+1)
1e2e3d02 1524 );
a3621e74 1525
446bd890
NC
1526 trie->trans
1527 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1528 {
1529 U32 state;
a3621e74
YO
1530 U32 tp = 0;
1531 U32 zp = 0;
1532
1533
1534 for( state=1 ; state < next_alloc ; state ++ ) {
1535 U32 base=0;
1536
1537 /*
1538 DEBUG_TRIE_COMPILE_MORE_r(
1539 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1540 );
1541 */
1542
1543 if (trie->states[state].trans.list) {
1544 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1545 U16 maxid=minid;
a28509cc 1546 U16 idx;
a3621e74
YO
1547
1548 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1549 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1550 if ( forid < minid ) {
1551 minid=forid;
1552 } else if ( forid > maxid ) {
1553 maxid=forid;
1554 }
a3621e74
YO
1555 }
1556 if ( transcount < tp + maxid - minid + 1) {
1557 transcount *= 2;
446bd890
NC
1558 trie->trans
1559 = PerlMemShared_realloc( trie->trans,
1560 transcount
1561 * sizeof(reg_trie_trans) );
a3621e74
YO
1562 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1563 }
1564 base = trie->uniquecharcount + tp - minid;
1565 if ( maxid == minid ) {
1566 U32 set = 0;
1567 for ( ; zp < tp ; zp++ ) {
1568 if ( ! trie->trans[ zp ].next ) {
1569 base = trie->uniquecharcount + zp - minid;
1570 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1571 trie->trans[ zp ].check = state;
1572 set = 1;
1573 break;
1574 }
1575 }
1576 if ( !set ) {
1577 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ tp ].check = state;
1579 tp++;
1580 zp = tp;
1581 }
1582 } else {
1583 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1584 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1585 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1586 trie->trans[ tid ].check = state;
1587 }
1588 tp += ( maxid - minid + 1 );
1589 }
1590 Safefree(trie->states[ state ].trans.list);
1591 }
1592 /*
1593 DEBUG_TRIE_COMPILE_MORE_r(
1594 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1595 );
1596 */
1597 trie->states[ state ].trans.base=base;
1598 }
cc601c31 1599 trie->lasttrans = tp + 1;
a3621e74
YO
1600 }
1601 } else {
1602 /*
1603 Second Pass -- Flat Table Representation.
1604
1605 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1606 We know that we will need Charcount+1 trans at most to store the data
1607 (one row per char at worst case) So we preallocate both structures
1608 assuming worst case.
1609
1610 We then construct the trie using only the .next slots of the entry
1611 structs.
1612
1613 We use the .check field of the first entry of the node temporarily to
1614 make compression both faster and easier by keeping track of how many non
1615 zero fields are in the node.
1616
1617 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1618 transition.
1619
1620 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1621 number representing the first entry of the node, and state as a
1622 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1623 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1624 are 2 entrys per node. eg:
1625
1626 A B A B
1627 1. 2 4 1. 3 7
1628 2. 0 3 3. 0 5
1629 3. 0 0 5. 0 0
1630 4. 0 0 7. 0 0
1631
1632 The table is internally in the right hand, idx form. However as we also
1633 have to deal with the states array which is indexed by nodenum we have to
1634 use TRIE_NODENUM() to convert.
1635
1636 */
1e2e3d02
YO
1637 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1638 "%*sCompiling trie using table compiler\n",
1639 (int)depth * 2 + 2, ""));
3dab1dad 1640
446bd890
NC
1641 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1642 * trie->uniquecharcount + 1,
1643 sizeof(reg_trie_trans) );
1644 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1645 sizeof(reg_trie_state) );
a3621e74
YO
1646 next_alloc = trie->uniquecharcount + 1;
1647
3dab1dad 1648
a3621e74
YO
1649 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1650
c445ea15 1651 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1652 const U8 *uc = (U8*)STRING( noper );
1653 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1654
1655 U32 state = 1; /* required init */
1656
1657 U16 charid = 0; /* sanity init */
1658 U32 accept_state = 0; /* sanity init */
1659 U8 *scan = (U8*)NULL; /* sanity init */
1660
1661 STRLEN foldlen = 0; /* required init */
07be1b83 1662 U32 wordlen = 0; /* required init */
a3621e74
YO
1663 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1664
3dab1dad 1665 if ( OP(noper) != NOTHING ) {
786e8c11 1666 for ( ; uc < e ; uc += len ) {
a3621e74 1667
786e8c11 1668 TRIE_READ_CHAR;
a3621e74 1669
786e8c11
YO
1670 if ( uvc < 256 ) {
1671 charid = trie->charmap[ uvc ];
1672 } else {
55eed653 1673 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1674 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1675 }
786e8c11
YO
1676 if ( charid ) {
1677 charid--;
1678 if ( !trie->trans[ state + charid ].next ) {
1679 trie->trans[ state + charid ].next = next_alloc;
1680 trie->trans[ state ].check++;
1681 next_alloc += trie->uniquecharcount;
1682 }
1683 state = trie->trans[ state + charid ].next;
1684 } else {
1685 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1686 }
1687 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1688 }
a3621e74 1689 }
3dab1dad
YO
1690 accept_state = TRIE_NODENUM( state );
1691 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1692
1693 } /* end second pass */
1694
3dab1dad 1695 /* and now dump it out before we compress it */
2b8b4781
NC
1696 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1697 revcharmap,
1698 next_alloc, depth+1));
a3621e74 1699
a3621e74
YO
1700 {
1701 /*
1702 * Inplace compress the table.*
1703
1704 For sparse data sets the table constructed by the trie algorithm will
1705 be mostly 0/FAIL transitions or to put it another way mostly empty.
1706 (Note that leaf nodes will not contain any transitions.)
1707
1708 This algorithm compresses the tables by eliminating most such
1709 transitions, at the cost of a modest bit of extra work during lookup:
1710
1711 - Each states[] entry contains a .base field which indicates the
1712 index in the state[] array wheres its transition data is stored.
1713
1714 - If .base is 0 there are no valid transitions from that node.
1715
1716 - If .base is nonzero then charid is added to it to find an entry in
1717 the trans array.
1718
1719 -If trans[states[state].base+charid].check!=state then the
1720 transition is taken to be a 0/Fail transition. Thus if there are fail
1721 transitions at the front of the node then the .base offset will point
1722 somewhere inside the previous nodes data (or maybe even into a node
1723 even earlier), but the .check field determines if the transition is
1724 valid.
1725
786e8c11 1726 XXX - wrong maybe?
a3621e74
YO
1727 The following process inplace converts the table to the compressed
1728 table: We first do not compress the root node 1,and mark its all its
1729 .check pointers as 1 and set its .base pointer as 1 as well. This
1730 allows to do a DFA construction from the compressed table later, and
1731 ensures that any .base pointers we calculate later are greater than
1732 0.
1733
1734 - We set 'pos' to indicate the first entry of the second node.
1735
1736 - We then iterate over the columns of the node, finding the first and
1737 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1738 and set the .check pointers accordingly, and advance pos
1739 appropriately and repreat for the next node. Note that when we copy
1740 the next pointers we have to convert them from the original
1741 NODEIDX form to NODENUM form as the former is not valid post
1742 compression.
1743
1744 - If a node has no transitions used we mark its base as 0 and do not
1745 advance the pos pointer.
1746
1747 - If a node only has one transition we use a second pointer into the
1748 structure to fill in allocated fail transitions from other states.
1749 This pointer is independent of the main pointer and scans forward
1750 looking for null transitions that are allocated to a state. When it
1751 finds one it writes the single transition into the "hole". If the
786e8c11 1752 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1753
1754 - Once compressed we can Renew/realloc the structures to release the
1755 excess space.
1756
1757 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1758 specifically Fig 3.47 and the associated pseudocode.
1759
1760 demq
1761 */
a3b680e6 1762 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1763 U32 state, charid;
a3621e74 1764 U32 pos = 0, zp=0;
1e2e3d02 1765 trie->statecount = laststate;
a3621e74
YO
1766
1767 for ( state = 1 ; state < laststate ; state++ ) {
1768 U8 flag = 0;
a28509cc
AL
1769 const U32 stateidx = TRIE_NODEIDX( state );
1770 const U32 o_used = trie->trans[ stateidx ].check;
1771 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1772 trie->trans[ stateidx ].check = 0;
1773
1774 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1775 if ( flag || trie->trans[ stateidx + charid ].next ) {
1776 if ( trie->trans[ stateidx + charid ].next ) {
1777 if (o_used == 1) {
1778 for ( ; zp < pos ; zp++ ) {
1779 if ( ! trie->trans[ zp ].next ) {
1780 break;
1781 }
1782 }
1783 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1784 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1785 trie->trans[ zp ].check = state;
1786 if ( ++zp > pos ) pos = zp;
1787 break;
1788 }
1789 used--;
1790 }
1791 if ( !flag ) {
1792 flag = 1;
1793 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1794 }
1795 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1796 trie->trans[ pos ].check = state;
1797 pos++;
1798 }
1799 }
1800 }
cc601c31 1801 trie->lasttrans = pos + 1;
446bd890
NC
1802 trie->states = PerlMemShared_realloc( trie->states, laststate
1803 * sizeof(reg_trie_state) );
a3621e74 1804 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1805 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1806 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1807 (int)depth * 2 + 2,"",
1808 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1809 (IV)next_alloc,
1810 (IV)pos,
a3621e74
YO
1811 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1812 );
1813
1814 } /* end table compress */
1815 }
1e2e3d02
YO
1816 DEBUG_TRIE_COMPILE_MORE_r(
1817 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1818 (int)depth * 2 + 2, "",
1819 (UV)trie->statecount,
1820 (UV)trie->lasttrans)
1821 );
cc601c31 1822 /* resize the trans array to remove unused space */
446bd890
NC
1823 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1824 * sizeof(reg_trie_trans) );
a3621e74 1825
3dab1dad 1826 /* and now dump out the compressed format */
2b8b4781 1827 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1828
3dab1dad 1829 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1830 U8 nodetype =(U8)(flags & 0xFF);
1831 char *str=NULL;
786e8c11 1832
07be1b83 1833#ifdef DEBUGGING
e62cc96a 1834 regnode *optimize = NULL;
b57a0404
JH
1835 U32 mjd_offset = 0;
1836 U32 mjd_nodelen = 0;
07be1b83 1837#endif
a3621e74 1838 /*
3dab1dad
YO
1839 This means we convert either the first branch or the first Exact,
1840 depending on whether the thing following (in 'last') is a branch
1841 or not and whther first is the startbranch (ie is it a sub part of
1842 the alternation or is it the whole thing.)
1843 Assuming its a sub part we conver the EXACT otherwise we convert
1844 the whole branch sequence, including the first.
a3621e74 1845 */
3dab1dad 1846 /* Find the node we are going to overwrite */
7f69552c 1847 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1848 /* branch sub-chain */
3dab1dad 1849 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1850 DEBUG_r({
1851 mjd_offset= Node_Offset((convert));
1852 mjd_nodelen= Node_Length((convert));
1853 });
7f69552c
YO
1854 /* whole branch chain */
1855 } else {
1856 DEBUG_r({
1857 const regnode *nop = NEXTOPER( convert );
1858 mjd_offset= Node_Offset((nop));
1859 mjd_nodelen= Node_Length((nop));
1860 });
07be1b83 1861 }
7f69552c 1862
07be1b83
YO
1863 DEBUG_OPTIMISE_r(
1864 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1865 (int)depth * 2 + 2, "",
786e8c11 1866 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1867 );
a3621e74 1868
3dab1dad
YO
1869 /* But first we check to see if there is a common prefix we can
1870 split out as an EXACT and put in front of the TRIE node. */
1871 trie->startstate= 1;
55eed653 1872 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1873 U32 state;
1e2e3d02 1874 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1875 U32 ofs = 0;
8e11feef
RGS
1876 I32 idx = -1;
1877 U32 count = 0;
1878 const U32 base = trie->states[ state ].trans.base;
a3621e74 1879
3dab1dad 1880 if ( trie->states[state].wordnum )
8e11feef 1881 count = 1;
a3621e74 1882
8e11feef 1883 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1884 if ( ( base + ofs >= trie->uniquecharcount ) &&
1885 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1886 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1887 {
3dab1dad 1888 if ( ++count > 1 ) {
2b8b4781 1889 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1890 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1891 if ( state == 1 ) break;
3dab1dad
YO
1892 if ( count == 2 ) {
1893 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf(Perl_debug_log,
1896 "%*sNew Start State=%"UVuf" Class: [",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state));
be8e71aa 1899 if (idx >= 0) {
2b8b4781 1900 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1901 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1902
3dab1dad 1903 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1904 if ( folder )
1905 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1906 DEBUG_OPTIMISE_r(
07be1b83 1907 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1908 );
8e11feef
RGS
1909 }
1910 }
1911 TRIE_BITMAP_SET(trie,*ch);
1912 if ( folder )
1913 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1914 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1915 }
1916 idx = ofs;
1917 }
3dab1dad
YO
1918 }
1919 if ( count == 1 ) {
2b8b4781 1920 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1921 char *ch = SvPV_nolen( *tmp );
1922 DEBUG_OPTIMISE_r({
1923 SV *sv=sv_newmortal();
8e11feef
RGS
1924 PerlIO_printf( Perl_debug_log,
1925 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1926 (int)depth * 2 + 2, "",
de734bd5
A
1927 (UV)state, (UV)idx,
1928 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1929 PL_colors[0], PL_colors[1],
1930 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1931 PERL_PV_ESCAPE_FIRSTCHAR
1932 )
1933 );
1934 });
3dab1dad
YO
1935 if ( state==1 ) {
1936 OP( convert ) = nodetype;
1937 str=STRING(convert);
1938 STR_LEN(convert)=0;
1939 }
de734bd5
A
1940 while (*ch) {
1941 *str++ = *ch++;
1942 STR_LEN(convert)++;
1943 }
1944
8e11feef 1945 } else {
f9049ba1 1946#ifdef DEBUGGING
8e11feef
RGS
1947 if (state>1)
1948 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1949#endif
8e11feef
RGS
1950 break;
1951 }
1952 }
3dab1dad 1953 if (str) {
8e11feef 1954 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1955 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1956 trie->startstate = state;
07be1b83
YO
1957 trie->minlen -= (state - 1);
1958 trie->maxlen -= (state - 1);
1959 DEBUG_r({
1960 regnode *fix = convert;
de734bd5 1961 U32 word = trie->wordcount;
07be1b83
YO
1962 mjd_nodelen++;
1963 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1964 while( ++fix < n ) {
1965 Set_Node_Offset_Length(fix, 0, 0);
1966 }
de734bd5 1967 while (word--) {
2b8b4781 1968 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1969 if (tmp) {
1970 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1971 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1972 else
1973 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1974 }
1975 }
07be1b83 1976 });
8e11feef
RGS
1977 if (trie->maxlen) {
1978 convert = n;
1979 } else {
3dab1dad 1980 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1981 DEBUG_r(optimize= n);
3dab1dad
YO
1982 }
1983 }
1984 }
a5ca303d
YO
1985 if (!jumper)
1986 jumper = last;
3dab1dad 1987 if ( trie->maxlen ) {
8e11feef
RGS
1988 NEXT_OFF( convert ) = (U16)(tail - convert);
1989 ARG_SET( convert, data_slot );
786e8c11
YO
1990 /* Store the offset to the first unabsorbed branch in
1991 jump[0], which is otherwise unused by the jump logic.
1992 We use this when dumping a trie and during optimisation. */
1993 if (trie->jump)
7f69552c 1994 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1995
786e8c11
YO
1996 /* XXXX */
1997 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1998 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1999 {
2000 OP( convert ) = TRIEC;
2001 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2002 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2003 trie->bitmap= NULL;
2004 } else
2005 OP( convert ) = TRIE;
a3621e74 2006
3dab1dad
YO
2007 /* store the type in the flags */
2008 convert->flags = nodetype;
a5ca303d
YO
2009 DEBUG_r({
2010 optimize = convert
2011 + NODE_STEP_REGNODE
2012 + regarglen[ OP( convert ) ];
2013 });
2014 /* XXX We really should free up the resource in trie now,
2015 as we won't use them - (which resources?) dmq */
3dab1dad 2016 }
a3621e74 2017 /* needed for dumping*/
e62cc96a 2018 DEBUG_r(if (optimize) {
07be1b83 2019 regnode *opt = convert;
e62cc96a 2020 while ( ++opt < optimize) {
07be1b83
YO
2021 Set_Node_Offset_Length(opt,0,0);
2022 }
786e8c11
YO
2023 /*
2024 Try to clean up some of the debris left after the
2025 optimisation.
a3621e74 2026 */
786e8c11 2027 while( optimize < jumper ) {
07be1b83 2028 mjd_nodelen += Node_Length((optimize));
a3621e74 2029 OP( optimize ) = OPTIMIZED;
07be1b83 2030 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2031 optimize++;
2032 }
07be1b83 2033 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2034 });
2035 } /* end node insert */
55eed653 2036 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2037#ifdef DEBUGGING
2038 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2039 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2040#else
2041 SvREFCNT_dec(revcharmap);
07be1b83 2042#endif
786e8c11
YO
2043 return trie->jump
2044 ? MADE_JUMP_TRIE
2045 : trie->startstate>1
2046 ? MADE_EXACT_TRIE
2047 : MADE_TRIE;
2048}
2049
2050STATIC void
2051S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2052{
2053/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2054
2055 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2056 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2057 ISBN 0-201-10088-6
2058
2059 We find the fail state for each state in the trie, this state is the longest proper
2060 suffix of the current states 'word' that is also a proper prefix of another word in our
2061 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2062 the DFA not to have to restart after its tried and failed a word at a given point, it
2063 simply continues as though it had been matching the other word in the first place.
2064 Consider
2065 'abcdgu'=~/abcdefg|cdgu/
2066 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2067 fail, which would bring use to the state representing 'd' in the second word where we would
2068 try 'g' and succeed, prodceding to match 'cdgu'.
2069 */
2070 /* add a fail transition */
3251b653
NC
2071 const U32 trie_offset = ARG(source);
2072 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2073 U32 *q;
2074 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2075 const U32 numstates = trie->statecount;
786e8c11
YO
2076 const U32 ubound = trie->lasttrans + ucharcount;
2077 U32 q_read = 0;
2078 U32 q_write = 0;
2079 U32 charid;
2080 U32 base = trie->states[ 1 ].trans.base;
2081 U32 *fail;
2082 reg_ac_data *aho;
2083 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2084 GET_RE_DEBUG_FLAGS_DECL;
2085#ifndef DEBUGGING
2086 PERL_UNUSED_ARG(depth);
2087#endif
2088
2089
2090 ARG_SET( stclass, data_slot );
446bd890 2091 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2092 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2093 aho->trie=trie_offset;
446bd890
NC
2094 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2095 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2096 Newxz( q, numstates, U32);
446bd890 2097 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2098 aho->refcount = 1;
2099 fail = aho->fail;
2100 /* initialize fail[0..1] to be 1 so that we always have
2101 a valid final fail state */
2102 fail[ 0 ] = fail[ 1 ] = 1;
2103
2104 for ( charid = 0; charid < ucharcount ; charid++ ) {
2105 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2106 if ( newstate ) {
2107 q[ q_write ] = newstate;
2108 /* set to point at the root */
2109 fail[ q[ q_write++ ] ]=1;
2110 }
2111 }
2112 while ( q_read < q_write) {
2113 const U32 cur = q[ q_read++ % numstates ];
2114 base = trie->states[ cur ].trans.base;
2115
2116 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2117 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2118 if (ch_state) {
2119 U32 fail_state = cur;
2120 U32 fail_base;
2121 do {
2122 fail_state = fail[ fail_state ];
2123 fail_base = aho->states[ fail_state ].trans.base;
2124 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2125
2126 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2127 fail[ ch_state ] = fail_state;
2128 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2129 {
2130 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2131 }
2132 q[ q_write++ % numstates] = ch_state;
2133 }
2134 }
2135 }
2136 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2137 when we fail in state 1, this allows us to use the
2138 charclass scan to find a valid start char. This is based on the principle
2139 that theres a good chance the string being searched contains lots of stuff
2140 that cant be a start char.
2141 */
2142 fail[ 0 ] = fail[ 1 ] = 0;
2143 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2144 PerlIO_printf(Perl_debug_log,
2145 "%*sStclass Failtable (%"UVuf" states): 0",
2146 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2147 );
786e8c11
YO
2148 for( q_read=1; q_read<numstates; q_read++ ) {
2149 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2150 }
2151 PerlIO_printf(Perl_debug_log, "\n");
2152 });
2153 Safefree(q);
2154 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2155}
2156
786e8c11 2157
a3621e74 2158/*
5d1c421c
JH
2159 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2160 * These need to be revisited when a newer toolchain becomes available.
2161 */
2162#if defined(__sparc64__) && defined(__GNUC__)
2163# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2164# undef SPARC64_GCC_WORKAROUND
2165# define SPARC64_GCC_WORKAROUND 1
2166# endif
2167#endif
2168
07be1b83 2169#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2170 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2171 SV * const mysv=sv_newmortal(); \
2172 regnode *Next = regnext(scan); \
2173 regprop(RExC_rx, mysv, scan); \
7f69552c 2174 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2175 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2176 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2177 }});
07be1b83 2178
1de06328
YO
2179
2180
2181
2182
07be1b83
YO
2183#define JOIN_EXACT(scan,min,flags) \
2184 if (PL_regkind[OP(scan)] == EXACT) \
2185 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2186
be8e71aa 2187STATIC U32
07be1b83
YO
2188S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2189 /* Merge several consecutive EXACTish nodes into one. */
2190 regnode *n = regnext(scan);
2191 U32 stringok = 1;
2192 regnode *next = scan + NODE_SZ_STR(scan);
2193 U32 merged = 0;
2194 U32 stopnow = 0;
2195#ifdef DEBUGGING
2196 regnode *stop = scan;
72f13be8 2197 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2198#else
d47053eb
RGS
2199 PERL_UNUSED_ARG(depth);
2200#endif
2201#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2202 PERL_UNUSED_ARG(flags);
2203 PERL_UNUSED_ARG(val);
07be1b83 2204#endif
07be1b83
YO
2205 DEBUG_PEEP("join",scan,depth);
2206
2207 /* Skip NOTHING, merge EXACT*. */
2208 while (n &&
2209 ( PL_regkind[OP(n)] == NOTHING ||
2210 (stringok && (OP(n) == OP(scan))))
2211 && NEXT_OFF(n)
2212 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2213
2214 if (OP(n) == TAIL || n > next)
2215 stringok = 0;
2216 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2217 DEBUG_PEEP("skip:",n,depth);
2218 NEXT_OFF(scan) += NEXT_OFF(n);
2219 next = n + NODE_STEP_REGNODE;
2220#ifdef DEBUGGING
2221 if (stringok)
2222 stop = n;
2223#endif
2224 n = regnext(n);
2225 }
2226 else if (stringok) {
786e8c11 2227 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2228 regnode * const nnext = regnext(n);
2229
2230 DEBUG_PEEP("merg",n,depth);
2231
2232 merged++;
2233 if (oldl + STR_LEN(n) > U8_MAX)
2234 break;
2235 NEXT_OFF(scan) += NEXT_OFF(n);
2236 STR_LEN(scan) += STR_LEN(n);
2237 next = n + NODE_SZ_STR(n);
2238 /* Now we can overwrite *n : */
2239 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2240#ifdef DEBUGGING
2241 stop = next - 1;
2242#endif
2243 n = nnext;
2244 if (stopnow) break;
2245 }
2246
d47053eb
RGS
2247#ifdef EXPERIMENTAL_INPLACESCAN
2248 if (flags && !NEXT_OFF(n)) {
2249 DEBUG_PEEP("atch", val, depth);
2250 if (reg_off_by_arg[OP(n)]) {
2251 ARG_SET(n, val - n);
2252 }
2253 else {
2254 NEXT_OFF(n) = val - n;
2255 }
2256 stopnow = 1;
2257 }
07be1b83
YO
2258#endif
2259 }
2260
2261 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2262 /*
2263 Two problematic code points in Unicode casefolding of EXACT nodes:
2264
2265 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2266 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2267
2268 which casefold to
2269
2270 Unicode UTF-8
2271
2272 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2273 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2274
2275 This means that in case-insensitive matching (or "loose matching",
2276 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2277 length of the above casefolded versions) can match a target string
2278 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2279 This would rather mess up the minimum length computation.
2280
2281 What we'll do is to look for the tail four bytes, and then peek
2282 at the preceding two bytes to see whether we need to decrease
2283 the minimum length by four (six minus two).
2284
2285 Thanks to the design of UTF-8, there cannot be false matches:
2286 A sequence of valid UTF-8 bytes cannot be a subsequence of
2287 another valid sequence of UTF-8 bytes.
2288
2289 */
2290 char * const s0 = STRING(scan), *s, *t;
2291 char * const s1 = s0 + STR_LEN(scan) - 1;
2292 char * const s2 = s1 - 4;
e294cc5d
JH
2293#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2294 const char t0[] = "\xaf\x49\xaf\x42";
2295#else
07be1b83 2296 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2297#endif
07be1b83
YO
2298 const char * const t1 = t0 + 3;
2299
2300 for (s = s0 + 2;
2301 s < s2 && (t = ninstr(s, s1, t0, t1));
2302 s = t + 4) {
e294cc5d
JH
2303#ifdef EBCDIC
2304 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2305 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2306#else
07be1b83
YO
2307 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2308 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2309#endif
07be1b83
YO
2310 *min -= 4;
2311 }
2312 }
2313
2314#ifdef DEBUGGING
2315 /* Allow dumping */
2316 n = scan + NODE_SZ_STR(scan);
2317 while (n <= stop) {
2318 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2319 OP(n) = OPTIMIZED;
2320 NEXT_OFF(n) = 0;
2321 }
2322 n++;
2323 }
2324#endif
2325 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2326 return stopnow;
2327}
2328
653099ff
GS
2329/* REx optimizer. Converts nodes into quickier variants "in place".
2330 Finds fixed substrings. */
2331
a0288114 2332/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2333 to the position after last scanned or to NULL. */
2334
40d049e4
YO
2335#define INIT_AND_WITHP \
2336 assert(!and_withp); \
2337 Newx(and_withp,1,struct regnode_charclass_class); \
2338 SAVEFREEPV(and_withp)
07be1b83 2339
b515a41d
YO
2340/* this is a chain of data about sub patterns we are processing that
2341 need to be handled seperately/specially in study_chunk. Its so
2342 we can simulate recursion without losing state. */
2343struct scan_frame;
2344typedef struct scan_frame {
2345 regnode *last; /* last node to process in this frame */
2346 regnode *next; /* next node to process when last is reached */
2347 struct scan_frame *prev; /*previous frame*/
2348 I32 stop; /* what stopparen do we use */
2349} scan_frame;
2350
76e3520e 2351STATIC I32
40d049e4 2352S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2353 I32 *minlenp, I32 *deltap,
40d049e4
YO
2354 regnode *last,
2355 scan_data_t *data,
2356 I32 stopparen,
2357 U8* recursed,
2358 struct regnode_charclass_class *and_withp,
2359 U32 flags, U32 depth)
c277df42
IZ
2360 /* scanp: Start here (read-write). */
2361 /* deltap: Write maxlen-minlen here. */
2362 /* last: Stop before this one. */
40d049e4
YO
2363 /* data: string data about the pattern */
2364 /* stopparen: treat close N as END */
2365 /* recursed: which subroutines have we recursed into */
2366 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2367{
97aff369 2368 dVAR;
c277df42
IZ
2369 I32 min = 0, pars = 0, code;
2370 regnode *scan = *scanp, *next;
2371 I32 delta = 0;
2372 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2373 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2374 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2375 scan_data_t data_fake;
a3621e74 2376 SV *re_trie_maxbuff = NULL;
786e8c11 2377 regnode *first_non_open = scan;
e2e6a0f1 2378 I32 stopmin = I32_MAX;
8aa23a47
YO
2379 scan_frame *frame = NULL;
2380
a3621e74 2381 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2382
13a24bad 2383#ifdef DEBUGGING
40d049e4 2384 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2385#endif
40d049e4 2386
786e8c11 2387 if ( depth == 0 ) {
40d049e4 2388 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2389 first_non_open=regnext(first_non_open);
2390 }
2391
b81d288d 2392
8aa23a47
YO
2393 fake_study_recurse:
2394 while ( scan && OP(scan) != END && scan < last ){
2395 /* Peephole optimizer: */
2396 DEBUG_STUDYDATA(data,depth);
2397 DEBUG_PEEP("Peep",scan,depth);
2398 JOIN_EXACT(scan,&min,0);
2399
2400 /* Follow the next-chain of the current node and optimize
2401 away all the NOTHINGs from it. */
2402 if (OP(scan) != CURLYX) {
2403 const int max = (reg_off_by_arg[OP(scan)]
2404 ? I32_MAX
2405 /* I32 may be smaller than U16 on CRAYs! */
2406 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2407 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2408 int noff;
2409 regnode *n = scan;
2410
2411 /* Skip NOTHING and LONGJMP. */
2412 while ((n = regnext(n))
2413 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2414 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2415 && off + noff < max)
2416 off += noff;
2417 if (reg_off_by_arg[OP(scan)])
2418 ARG(scan) = off;
2419 else
2420 NEXT_OFF(scan) = off;
2421 }
a3621e74 2422
c277df42 2423
8aa23a47
YO
2424
2425 /* The principal pseudo-switch. Cannot be a switch, since we
2426 look into several different things. */
2427 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2428 || OP(scan) == IFTHEN) {
2429 next = regnext(scan);
2430 code = OP(scan);
2431 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2432
2433 if (OP(next) == code || code == IFTHEN) {
2434 /* NOTE - There is similar code to this block below for handling
2435 TRIE nodes on a re-study. If you change stuff here check there
2436 too. */
2437 I32 max1 = 0, min1 = I32_MAX, num = 0;
2438 struct regnode_charclass_class accum;
2439 regnode * const startbranch=scan;
2440
2441 if (flags & SCF_DO_SUBSTR)
2442 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2443 if (flags & SCF_DO_STCLASS)
2444 cl_init_zero(pRExC_state, &accum);
2445
2446 while (OP(scan) == code) {
2447 I32 deltanext, minnext, f = 0, fake;
2448 struct regnode_charclass_class this_class;
2449
2450 num++;
2451 data_fake.flags = 0;
2452 if (data) {
2453 data_fake.whilem_c = data->whilem_c;
2454 data_fake.last_closep = data->last_closep;
2455 }
2456 else
2457 data_fake.last_closep = &fake;
58e23c8d
YO
2458
2459 data_fake.pos_delta = delta;
8aa23a47
YO
2460 next = regnext(scan);
2461 scan = NEXTOPER(scan);
2462 if (code != BRANCH)
c277df42 2463 scan = NEXTOPER(scan);
8aa23a47
YO
2464 if (flags & SCF_DO_STCLASS) {
2465 cl_init(pRExC_state, &this_class);
2466 data_fake.start_class = &this_class;
2467 f = SCF_DO_STCLASS_AND;
58e23c8d 2468 }
8aa23a47
YO
2469 if (flags & SCF_WHILEM_VISITED_POS)
2470 f |= SCF_WHILEM_VISITED_POS;
2471
2472 /* we suppose the run is continuous, last=next...*/
2473 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2474 next, &data_fake,
2475 stopparen, recursed, NULL, f,depth+1);
2476 if (min1 > minnext)
2477 min1 = minnext;
2478 if (max1 < minnext + deltanext)
2479 max1 = minnext + deltanext;
2480 if (deltanext == I32_MAX)
2481 is_inf = is_inf_internal = 1;
2482 scan = next;
2483 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2484 pars++;
2485 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2486 if ( stopmin > minnext)
2487 stopmin = min + min1;
2488 flags &= ~SCF_DO_SUBSTR;
2489 if (data)
2490 data->flags |= SCF_SEEN_ACCEPT;
2491 }
2492 if (data) {
2493 if (data_fake.flags & SF_HAS_EVAL)
2494 data->flags |= SF_HAS_EVAL;
2495 data->whilem_c = data_fake.whilem_c;
3dab1dad 2496 }
8aa23a47
YO
2497 if (flags & SCF_DO_STCLASS)
2498 cl_or(pRExC_state, &accum, &this_class);
2499 }
2500 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2501 min1 = 0;
2502 if (flags & SCF_DO_SUBSTR) {
2503 data->pos_min += min1;
2504 data->pos_delta += max1 - min1;
2505 if (max1 != min1 || is_inf)
2506 data->longest = &(data->longest_float);
2507 }
2508 min += min1;
2509 delta += max1 - min1;
2510 if (flags & SCF_DO_STCLASS_OR) {
2511 cl_or(pRExC_state, data->start_class, &accum);
2512 if (min1) {
2513 cl_and(data->start_class, and_withp);
2514 flags &= ~SCF_DO_STCLASS;
653099ff 2515 }
8aa23a47
YO
2516 }
2517 else if (flags & SCF_DO_STCLASS_AND) {
2518 if (min1) {
2519 cl_and(data->start_class, &accum);
2520 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2521 }
8aa23a47
YO
2522 else {
2523 /* Switch to OR mode: cache the old value of
2524 * data->start_class */
2525 INIT_AND_WITHP;
2526 StructCopy(data->start_class, and_withp,
2527 struct regnode_charclass_class);
2528 flags &= ~SCF_DO_STCLASS_AND;
2529 StructCopy(&accum, data->start_class,
2530 struct regnode_charclass_class);
2531 flags |= SCF_DO_STCLASS_OR;
2532 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2533 }
8aa23a47 2534 }
a3621e74 2535
8aa23a47
YO
2536 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2537 /* demq.
a3621e74 2538
8aa23a47
YO
2539 Assuming this was/is a branch we are dealing with: 'scan' now
2540 points at the item that follows the branch sequence, whatever
2541 it is. We now start at the beginning of the sequence and look
2542 for subsequences of
a3621e74 2543
8aa23a47
YO
2544 BRANCH->EXACT=>x1
2545 BRANCH->EXACT=>x2
2546 tail
a3621e74 2547
8aa23a47 2548 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2549
8aa23a47
YO
2550 If we can find such a subseqence we need to turn the first
2551 element into a trie and then add the subsequent branch exact
2552 strings to the trie.
a3621e74 2553
8aa23a47 2554 We have two cases
a3621e74 2555
8aa23a47 2556 1. patterns where the whole set of branch can be converted.
a3621e74 2557
8aa23a47 2558 2. patterns where only a subset can be converted.
a3621e74 2559
8aa23a47
YO
2560 In case 1 we can replace the whole set with a single regop
2561 for the trie. In case 2 we need to keep the start and end
2562 branchs so
a3621e74 2563
8aa23a47
YO
2564 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2565 becomes BRANCH TRIE; BRANCH X;
786e8c11 2566
8aa23a47
YO
2567 There is an additional case, that being where there is a
2568 common prefix, which gets split out into an EXACT like node
2569 preceding the TRIE node.
a3621e74 2570
8aa23a47
YO
2571 If x(1..n)==tail then we can do a simple trie, if not we make
2572 a "jump" trie, such that when we match the appropriate word
2573 we "jump" to the appopriate tail node. Essentailly we turn
2574 a nested if into a case structure of sorts.
b515a41d 2575
8aa23a47
YO
2576 */
2577
2578 int made=0;
2579 if (!re_trie_maxbuff) {
2580 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2581 if (!SvIOK(re_trie_maxbuff))
2582 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2583 }
2584 if ( SvIV(re_trie_maxbuff)>=0 ) {
2585 regnode *cur;
2586 regnode *first = (regnode *)NULL;
2587 regnode *last = (regnode *)NULL;
2588 regnode *tail = scan;
2589 U8 optype = 0;
2590 U32 count=0;
a3621e74
YO
2591
2592#ifdef DEBUGGING
8aa23a47 2593 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2594#endif
8aa23a47
YO
2595 /* var tail is used because there may be a TAIL
2596 regop in the way. Ie, the exacts will point to the
2597 thing following the TAIL, but the last branch will
2598 point at the TAIL. So we advance tail. If we
2599 have nested (?:) we may have to move through several
2600 tails.
2601 */
2602
2603 while ( OP( tail ) == TAIL ) {
2604 /* this is the TAIL generated by (?:) */
2605 tail = regnext( tail );
2606 }
a3621e74 2607
8aa23a47
YO
2608
2609 DEBUG_OPTIMISE_r({
2610 regprop(RExC_rx, mysv, tail );
2611 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2612 (int)depth * 2 + 2, "",
2613 "Looking for TRIE'able sequences. Tail node is: ",
2614 SvPV_nolen_const( mysv )
2615 );
2616 });
2617
2618 /*
2619
2620 step through the branches, cur represents each
2621 branch, noper is the first thing to be matched
2622 as part of that branch and noper_next is the
2623 regnext() of that node. if noper is an EXACT
2624 and noper_next is the same as scan (our current
2625 position in the regex) then the EXACT branch is
2626 a possible optimization target. Once we have
2627 two or more consequetive such branches we can
2628 create a trie of the EXACT's contents and stich
2629 it in place. If the sequence represents all of
2630 the branches we eliminate the whole thing and
2631 replace it with a single TRIE. If it is a
2632 subsequence then we need to stitch it in. This
2633 means the first branch has to remain, and needs
2634 to be repointed at the item on the branch chain
2635 following the last branch optimized. This could
2636 be either a BRANCH, in which case the
2637 subsequence is internal, or it could be the
2638 item following the branch sequence in which
2639 case the subsequence is at the end.
2640
2641 */
2642
2643 /* dont use tail as the end marker for this traverse */
2644 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2645 regnode * const noper = NEXTOPER( cur );
b515a41d 2646#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2647 regnode * const noper_next = regnext( noper );
b515a41d
YO
2648#endif
2649
8aa23a47
YO
2650 DEBUG_OPTIMISE_r({
2651 regprop(RExC_rx, mysv, cur);
2652 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2653 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2654
2655 regprop(RExC_rx, mysv, noper);
2656 PerlIO_printf( Perl_debug_log, " -> %s",
2657 SvPV_nolen_const(mysv));
2658
2659 if ( noper_next ) {
2660 regprop(RExC_rx, mysv, noper_next );
2661 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2662 SvPV_nolen_const(mysv));
2663 }
2664 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2665 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2666 });
2667 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2668 : PL_regkind[ OP( noper ) ] == EXACT )
2669 || OP(noper) == NOTHING )
786e8c11 2670#ifdef NOJUMPTRIE
8aa23a47 2671 && noper_next == tail
786e8c11 2672#endif
8aa23a47
YO
2673 && count < U16_MAX)
2674 {
2675 count++;
2676 if ( !first || optype == NOTHING ) {
2677 if (!first) first = cur;
2678 optype = OP( noper );
2679 } else {
2680 last = cur;
2681 }
2682 } else {
2683 if ( last ) {
2684 make_trie( pRExC_state,
2685 startbranch, first, cur, tail, count,
2686 optype, depth+1 );
2687 }
2688 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2689#ifdef NOJUMPTRIE
8aa23a47 2690 && noper_next == tail
786e8c11 2691#endif
8aa23a47
YO
2692 ){
2693 count = 1;
2694 first = cur;
2695 optype = OP( noper );
2696 } else {
2697 count = 0;
2698 first = NULL;
2699 optype = 0;
2700 }
2701 last = NULL;
2702 }
2703 }
2704 DEBUG_OPTIMISE_r({
2705 regprop(RExC_rx, mysv, cur);
2706 PerlIO_printf( Perl_debug_log,
2707 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2708 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2709
2710 });
2711 if ( last ) {
2712 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2713#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2714 if ( ((made == MADE_EXACT_TRIE &&
2715 startbranch == first)
2716 || ( first_non_open == first )) &&
2717 depth==0 ) {
2718 flags |= SCF_TRIE_RESTUDY;
2719 if ( startbranch == first
2720 && scan == tail )
2721 {
2722 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2723 }
2724 }
3dab1dad 2725#endif
8aa23a47
YO
2726 }
2727 }
2728
2729 } /* do trie */
2730
653099ff 2731 }
8aa23a47
YO
2732 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2733 scan = NEXTOPER(NEXTOPER(scan));
2734 } else /* single branch is optimized. */
2735 scan = NEXTOPER(scan);
2736 continue;
2737 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2738 scan_frame *newframe = NULL;
2739 I32 paren;
2740 regnode *start;
2741 regnode *end;
2742
2743 if (OP(scan) != SUSPEND) {
2744 /* set the pointer */
2745 if (OP(scan) == GOSUB) {
2746 paren = ARG(scan);
2747 RExC_recurse[ARG2L(scan)] = scan;
2748 start = RExC_open_parens[paren-1];
2749 end = RExC_close_parens[paren-1];
2750 } else {
2751 paren = 0;
f8fc2ecf 2752 start = RExC_rxi->program + 1;
8aa23a47
YO
2753 end = RExC_opend;
2754 }
2755 if (!recursed) {
2756 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2757 SAVEFREEPV(recursed);
2758 }
2759 if (!PAREN_TEST(recursed,paren+1)) {
2760 PAREN_SET(recursed,paren+1);
2761 Newx(newframe,1,scan_frame);
2762 } else {
2763 if (flags & SCF_DO_SUBSTR) {
2764 scan_commit(pRExC_state,data,minlenp);
2765 data->longest = &(data->longest_float);
2766 }
2767 is_inf = is_inf_internal = 1;
2768 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2769 cl_anything(pRExC_state, data->start_class);
2770 flags &= ~SCF_DO_STCLASS;
2771 }
2772 } else {
2773 Newx(newframe,1,scan_frame);
2774 paren = stopparen;
2775 start = scan+2;
2776 end = regnext(scan);
2777 }
2778 if (newframe) {
2779 assert(start);
2780 assert(end);
2781 SAVEFREEPV(newframe);
2782 newframe->next = regnext(scan);
2783 newframe->last = last;
2784 newframe->stop = stopparen;
2785 newframe->prev = frame;
2786
2787 frame = newframe;
2788 scan = start;
2789 stopparen = paren;
2790 last = end;
2791
2792 continue;
2793 }
2794 }
2795 else if (OP(scan) == EXACT) {
2796 I32 l = STR_LEN(scan);
2797 UV uc;
2798 if (UTF) {
2799 const U8 * const s = (U8*)STRING(scan);
2800 l = utf8_length(s, s + l);
2801 uc = utf8_to_uvchr(s, NULL);
2802 } else {
2803 uc = *((U8*)STRING(scan));
2804 }
2805 min += l;
2806 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2807 /* The code below prefers earlier match for fixed
2808 offset, later match for variable offset. */
2809 if (data->last_end == -1) { /* Update the start info. */
2810 data->last_start_min = data->pos_min;
2811 data->last_start_max = is_inf
2812 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2813 }
8aa23a47
YO
2814 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2815 if (UTF)
2816 SvUTF8_on(data->last_found);
2817 {
2818 SV * const sv = data->last_found;
2819 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2820 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2821 if (mg && mg->mg_len >= 0)
2822 mg->mg_len += utf8_length((U8*)STRING(scan),
2823 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2824 }
8aa23a47
YO
2825 data->last_end = data->pos_min + l;
2826 data->pos_min += l; /* As in the first entry. */
2827 data->flags &= ~SF_BEFORE_EOL;
2828 }
2829 if (flags & SCF_DO_STCLASS_AND) {
2830 /* Check whether it is compatible with what we know already! */
2831 int compat = 1;
2832
2833 if (uc >= 0x100 ||
2834 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2835 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2836 && (!(data->start_class->flags & ANYOF_FOLD)
2837 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2838 )
2839 compat = 0;
2840 ANYOF_CLASS_ZERO(data->start_class);
2841 ANYOF_BITMAP_ZERO(data->start_class);
2842 if (compat)
2843 ANYOF_BITMAP_SET(data->start_class, uc);
2844 data->start_class->flags &= ~ANYOF_EOS;
2845 if (uc < 0x100)
2846 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2847 }
2848 else if (flags & SCF_DO_STCLASS_OR) {
2849 /* false positive possible if the class is case-folded */
2850 if (uc < 0x100)
2851 ANYOF_BITMAP_SET(data->start_class, uc);
2852 else
2853 data->start_class->flags |= ANYOF_UNICODE_ALL;
2854 data->start_class->flags &= ~ANYOF_EOS;
2855 cl_and(data->start_class, and_withp);
2856 }
2857 flags &= ~SCF_DO_STCLASS;
2858 }
2859 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2860 I32 l = STR_LEN(scan);
2861 UV uc = *((U8*)STRING(scan));
2862
2863 /* Search for fixed substrings supports EXACT only. */
2864 if (flags & SCF_DO_SUBSTR) {
2865 assert(data);
2866 scan_commit(pRExC_state, data, minlenp);
2867 }
2868 if (UTF) {
2869 const U8 * const s = (U8 *)STRING(scan);
2870 l = utf8_length(s, s + l);
2871 uc = utf8_to_uvchr(s, NULL);
2872 }
2873 min += l;
2874 if (flags & SCF_DO_SUBSTR)
2875 data->pos_min += l;
2876 if (flags & SCF_DO_STCLASS_AND) {
2877 /* Check whether it is compatible with what we know already! */
2878 int compat = 1;
2879
2880 if (uc >= 0x100 ||
2881 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2882 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2883 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2884 compat = 0;
2885 ANYOF_CLASS_ZERO(data->start_class);
2886 ANYOF_BITMAP_ZERO(data->start_class);
2887 if (compat) {
2888 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2889 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2890 data->start_class->flags |= ANYOF_FOLD;
2891 if (OP(scan) == EXACTFL)
2892 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2893 }
8aa23a47
YO
2894 }
2895 else if (flags & SCF_DO_STCLASS_OR) {
2896 if (data->start_class->flags & ANYOF_FOLD) {
2897 /* false positive possible if the class is case-folded.
2898 Assume that the locale settings are the same... */
1aa99e6b
IH
2899 if (uc < 0x100)
2900 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2901 data->start_class->flags &= ~ANYOF_EOS;
2902 }
8aa23a47 2903 cl_and(data->start_class, and_withp);
653099ff 2904 }
8aa23a47
YO
2905 flags &= ~SCF_DO_STCLASS;
2906 }
2907 else if (strchr((const char*)PL_varies,OP(scan))) {
2908 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2909 I32 f = flags, pos_before = 0;
2910 regnode * const oscan = scan;
2911 struct regnode_charclass_class this_class;
2912 struct regnode_charclass_class *oclass = NULL;
2913 I32 next_is_eval = 0;
2914
2915 switch (PL_regkind[OP(scan)]) {
2916 case WHILEM: /* End of (?:...)* . */
2917 scan = NEXTOPER(scan);
2918 goto finish;
2919 case PLUS:
2920 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2921 next = NEXTOPER(scan);
2922 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2923 mincount = 1;
2924 maxcount = REG_INFTY;
2925 next = regnext(scan);
2926 scan = NEXTOPER(scan);
2927 goto do_curly;
2928 }
2929 }
2930 if (flags & SCF_DO_SUBSTR)
2931 data->pos_min++;
2932 min++;
2933 /* Fall through. */
2934 case STAR:
2935 if (flags & SCF_DO_STCLASS) {
2936 mincount = 0;
2937 maxcount = REG_INFTY;
2938 next = regnext(scan);
2939 scan = NEXTOPER(scan);
2940 goto do_curly;
2941 }
2942 is_inf = is_inf_internal = 1;
2943 scan = regnext(scan);
c277df42 2944 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2945 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2946 data->longest = &(data->longest_float);
c277df42 2947 }
8aa23a47
YO
2948 goto optimize_curly_tail;
2949 case CURLY:
2950 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2951 && (scan->flags == stopparen))
2952 {
2953 mincount = 1;
2954 maxcount = 1;
2955 } else {
2956 mincount = ARG1(scan);
2957 maxcount = ARG2(scan);
653099ff 2958 }
8aa23a47
YO
2959 next = regnext(scan);
2960 if (OP(scan) == CURLYX) {
2961 I32 lp = (data ? *(data->last_closep) : 0);
2962 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2963 }
8aa23a47
YO
2964 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2965 next_is_eval = (OP(scan) == EVAL);
2966 do_curly:
2967 if (flags & SCF_DO_SUBSTR) {
2968 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2969 pos_before = data->pos_min;
b45f050a 2970 }
8aa23a47
YO
2971 if (data) {
2972 fl = data->flags;
2973 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2974 if (is_inf)
2975 data->flags |= SF_IS_INF;
2976 }
2977 if (flags & SCF_DO_STCLASS) {
2978 cl_init(pRExC_state, &this_class);
2979 oclass = data->start_class;
2980 data->start_class = &this_class;
2981 f |= SCF_DO_STCLASS_AND;
2982 f &= ~SCF_DO_STCLASS_OR;
2983 }
2984 /* These are the cases when once a subexpression
2985 fails at a particular position, it cannot succeed
2986 even after backtracking at the enclosing scope.
2987
2988 XXXX what if minimal match and we are at the
2989 initial run of {n,m}? */
2990 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2991 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2992
8aa23a47
YO
2993 /* This will finish on WHILEM, setting scan, or on NULL: */
2994 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2995 last, data, stopparen, recursed, NULL,
2996 (mincount == 0
2997 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2998
8aa23a47
YO
2999 if (flags & SCF_DO_STCLASS)
3000 data->start_class = oclass;
3001 if (mincount == 0 || minnext == 0) {
3002 if (flags & SCF_DO_STCLASS_OR) {
3003 cl_or(pRExC_state, data->start_class, &this_class);
3004 }
3005 else if (flags & SCF_DO_STCLASS_AND) {
3006 /* Switch to OR mode: cache the old value of
3007 * data->start_class */
3008 INIT_AND_WITHP;
3009 StructCopy(data->start_class, and_withp,
3010 struct regnode_charclass_class);
3011 flags &= ~SCF_DO_STCLASS_AND;
3012 StructCopy(&this_class, data->start_class,
3013 struct regnode_charclass_class);
3014 flags |= SCF_DO_STCLASS_OR;
3015 data->start_class->flags |= ANYOF_EOS;
3016 }
3017 } else { /* Non-zero len */
3018 if (flags & SCF_DO_STCLASS_OR) {
3019 cl_or(pRExC_state, data->start_class, &this_class);
3020 cl_and(data->start_class, and_withp);
3021 }
3022 else if (flags & SCF_DO_STCLASS_AND)
3023 cl_and(data->start_class, &this_class);
3024 flags &= ~SCF_DO_STCLASS;
3025 }
3026 if (!scan) /* It was not CURLYX, but CURLY. */
3027 scan = next;
3028 if ( /* ? quantifier ok, except for (?{ ... }) */
3029 (next_is_eval || !(mincount == 0 && maxcount == 1))
3030 && (minnext == 0) && (deltanext == 0)
3031 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3032 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3033 && ckWARN(WARN_REGEXP))
3034 {
3035 vWARN(RExC_parse,
3036 "Quantifier unexpected on zero-length expression");
3037 }
3038
3039 min += minnext * mincount;
3040 is_inf_internal |= ((maxcount == REG_INFTY
3041 && (minnext + deltanext) > 0)
3042 || deltanext == I32_MAX);
3043 is_inf |= is_inf_internal;
3044 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3045
3046 /* Try powerful optimization CURLYX => CURLYN. */
3047 if ( OP(oscan) == CURLYX && data
3048 && data->flags & SF_IN_PAR
3049 && !(data->flags & SF_HAS_EVAL)
3050 && !deltanext && minnext == 1 ) {
3051 /* Try to optimize to CURLYN. */
3052 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3053 regnode * const nxt1 = nxt;
497b47a8 3054#ifdef DEBUGGING
8aa23a47 3055 regnode *nxt2;
497b47a8 3056#endif
c277df42 3057
8aa23a47
YO
3058 /* Skip open. */
3059 nxt = regnext(nxt);
3060 if (!strchr((const char*)PL_simple,OP(nxt))
3061 && !(PL_regkind[OP(nxt)] == EXACT
3062 && STR_LEN(nxt) == 1))
3063 goto nogo;
497b47a8 3064#ifdef DEBUGGING
8aa23a47 3065 nxt2 = nxt;
497b47a8 3066#endif
8aa23a47
YO
3067 nxt = regnext(nxt);
3068 if (OP(nxt) != CLOSE)
3069 goto nogo;
3070 if (RExC_open_parens) {
3071 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3072 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3073 }
3074 /* Now we know that nxt2 is the only contents: */
3075 oscan->flags = (U8)ARG(nxt);
3076 OP(oscan) = CURLYN;
3077 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3078
c277df42 3079#ifdef DEBUGGING
8aa23a47
YO
3080 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3081 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3082 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3083 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3084 OP(nxt + 1) = OPTIMIZED; /* was count. */
3085 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3086#endif
8aa23a47
YO
3087 }
3088 nogo:
3089
3090 /* Try optimization CURLYX => CURLYM. */
3091 if ( OP(oscan) == CURLYX && data
3092 && !(data->flags & SF_HAS_PAR)
3093 && !(data->flags & SF_HAS_EVAL)
3094 && !deltanext /* atom is fixed width */
3095 && minnext != 0 /* CURLYM can't handle zero width */
3096 ) {
3097 /* XXXX How to optimize if data == 0? */
3098 /* Optimize to a simpler form. */
3099 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3100 regnode *nxt2;
3101
3102 OP(oscan) = CURLYM;
3103 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3104 && (OP(nxt2) != WHILEM))
3105 nxt = nxt2;
3106 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3107 /* Need to optimize away parenths. */
3108 if (data->flags & SF_IN_PAR) {
3109 /* Set the parenth number. */
3110 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3111
3112 if (OP(nxt) != CLOSE)
3113 FAIL("Panic opt close");
3114 oscan->flags = (U8)ARG(nxt);
3115 if (RExC_open_parens) {
3116 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3117 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3118 }
8aa23a47
YO
3119 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3120 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3121
c277df42 3122#ifdef DEBUGGING
8aa23a47
YO
3123 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3124 OP(nxt + 1) = OPTIMIZED; /* was count. */
3125 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3126 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3127#endif
c277df42 3128#if 0
8aa23a47
YO
3129 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3130 regnode *nnxt = regnext(nxt1);
3131
3132 if (nnxt == nxt) {
3133 if (reg_off_by_arg[OP(nxt1)])
3134 ARG_SET(nxt1, nxt2 - nxt1);
3135 else if (nxt2 - nxt1 < U16_MAX)
3136 NEXT_OFF(nxt1) = nxt2 - nxt1;
3137 else
3138 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3139 }
8aa23a47 3140 nxt1 = nnxt;
c277df42 3141 }
5d1c421c 3142#endif
8aa23a47
YO
3143 /* Optimize again: */
3144 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3145 NULL, stopparen, recursed, NULL, 0,depth+1);
3146 }
3147 else
3148 oscan->flags = 0;
3149 }
3150 else if ((OP(oscan) == CURLYX)
3151 && (flags & SCF_WHILEM_VISITED_POS)
3152 /* See the comment on a similar expression above.
3153 However, this time it not a subexpression
3154 we care about, but the expression itself. */
3155 && (maxcount == REG_INFTY)
3156 && data && ++data->whilem_c < 16) {
3157 /* This stays as CURLYX, we can put the count/of pair. */
3158 /* Find WHILEM (as in regexec.c) */
3159 regnode *nxt = oscan + NEXT_OFF(oscan);
3160
3161 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3162 nxt += ARG(nxt);
3163 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3164 | (RExC_whilem_seen << 4)); /* On WHILEM */
3165 }
3166 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3167 pars++;
3168 if (flags & SCF_DO_SUBSTR) {
3169 SV *last_str = NULL;
3170 int counted = mincount != 0;
a0ed51b3 3171
8aa23a47
YO
3172 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3173#if defined(SPARC64_GCC_WORKAROUND)
3174 I32 b = 0;
3175 STRLEN l = 0;
3176 const char *s = NULL;
3177 I32 old = 0;
b515a41d 3178
8aa23a47
YO
3179 if (pos_before >= data->last_start_min)
3180 b = pos_before;
3181 else
3182 b = data->last_start_min;
b515a41d 3183
8aa23a47
YO
3184 l = 0;
3185 s = SvPV_const(data->last_found, l);
3186 old = b - data->last_start_min;
3187
3188#else
3189 I32 b = pos_before >= data->last_start_min
3190 ? pos_before : data->last_start_min;
3191 STRLEN l;
3192 const char * const s = SvPV_const(data->last_found, l);
3193 I32 old = b - data->last_start_min;
3194#endif
3195
3196 if (UTF)
3197 old = utf8_hop((U8*)s, old) - (U8*)s;
3198
3199 l -= old;
3200 /* Get the added string: */
3201 last_str = newSVpvn(s + old, l);
3202 if (UTF)
3203 SvUTF8_on(last_str);
3204 if (deltanext == 0 && pos_before == b) {
3205 /* What was added is a constant string */
3206 if (mincount > 1) {
3207 SvGROW(last_str, (mincount * l) + 1);
3208 repeatcpy(SvPVX(last_str) + l,
3209 SvPVX_const(last_str), l, mincount - 1);
3210 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3211 /* Add additional parts. */
3212 SvCUR_set(data->last_found,
3213 SvCUR(data->last_found) - l);
3214 sv_catsv(data->last_found, last_str);
3215 {
3216 SV * sv = data->last_found;
3217 MAGIC *mg =
3218 SvUTF8(sv) && SvMAGICAL(sv) ?
3219 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3220 if (mg && mg->mg_len >= 0)
3221 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3222 }
8aa23a47 3223 data->last_end += l * (mincount - 1);
b515a41d 3224 }
8aa23a47
YO
3225 } else {
3226 /* start offset must point into the last copy */
3227 data->last_start_min += minnext * (mincount - 1);
3228 data->last_start_max += is_inf ? I32_MAX
3229 : (maxcount - 1) * (minnext + data->pos_delta);
3230 }
c277df42 3231 }
8aa23a47
YO
3232 /* It is counted once already... */
3233 data->pos_min += minnext * (mincount - counted);
3234 data->pos_delta += - counted * deltanext +
3235 (minnext + deltanext) * maxcount - minnext * mincount;
3236 if (mincount != maxcount) {
3237 /* Cannot extend fixed substrings found inside
3238 the group. */
3239 scan_commit(pRExC_state,data,minlenp);
3240 if (mincount && last_str) {
3241 SV * const sv = data->last_found;
3242 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3243 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3244
3245 if (mg)
3246 mg->mg_len = -1;
3247 sv_setsv(sv, last_str);
3248 data->last_end = data->pos_min;
3249 data->last_start_min =
3250 data->pos_min - CHR_SVLEN(last_str);
3251 data->last_start_max = is_inf
3252 ? I32_MAX
3253 : data->pos_min + data->pos_delta
3254 - CHR_SVLEN(last_str);
3255 }
3256 data->longest = &(data->longest_float);
3257 }
3258 SvREFCNT_dec(last_str);
c277df42 3259 }
8aa23a47
YO
3260 if (data && (fl & SF_HAS_EVAL))
3261 data->flags |= SF_HAS_EVAL;
3262 optimize_curly_tail:
3263 if (OP(oscan) != CURLYX) {
3264 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3265 && NEXT_OFF(next))
3266 NEXT_OFF(oscan) += NEXT_OFF(next);
3267 }
3268 continue;
3269 default: /* REF and CLUMP only? */
3270 if (flags & SCF_DO_SUBSTR) {
3271 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3272 data->longest = &(data->longest_float);
3273 }
3274 is_inf = is_inf_internal = 1;
3275 if (flags & SCF_DO_STCLASS_OR)
3276 cl_anything(pRExC_state, data->start_class);
3277 flags &= ~SCF_DO_STCLASS;
3278 break;
c277df42 3279 }
8aa23a47
YO
3280 }
3281 else if (strchr((const char*)PL_simple,OP(scan))) {
3282 int value = 0;
653099ff 3283
8aa23a47
YO
3284 if (flags & SCF_DO_SUBSTR) {
3285 scan_commit(pRExC_state,data,minlenp);
3286 data->pos_min++;
3287 }
3288 min++;
3289 if (flags & SCF_DO_STCLASS) {
3290 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3291
8aa23a47
YO
3292 /* Some of the logic below assumes that switching
3293 locale on will only add false positives. */
3294 switch (PL_regkind[OP(scan)]) {
3295 case SANY:
3296 default:
3297 do_default:
3298 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3299 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3300 cl_anything(pRExC_state, data->start_class);
3301 break;
3302 case REG_ANY:
3303 if (OP(scan) == SANY)
3304 goto do_default;
3305 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3306 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3307 || (data->start_class->flags & ANYOF_CLASS));
3308 cl_anything(pRExC_state, data->start_class);
653099ff 3309 }
8aa23a47
YO
3310 if (flags & SCF_DO_STCLASS_AND || !value)
3311 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3312 break;
3313 case ANYOF:
3314 if (flags & SCF_DO_STCLASS_AND)
3315 cl_and(data->start_class,
3316 (struct regnode_charclass_class*)scan);
653099ff 3317 else
8aa23a47
YO
3318 cl_or(pRExC_state, data->start_class,
3319 (struct regnode_charclass_class*)scan);
3320 break;
3321 case ALNUM:
3322 if (flags & SCF_DO_STCLASS_AND) {
3323 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3324 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3325 for (value = 0; value < 256; value++)
3326 if (!isALNUM(value))
3327 ANYOF_BITMAP_CLEAR(data->start_class, value);
3328 }
653099ff 3329 }
8aa23a47
YO
3330 else {
3331 if (data->start_class->flags & ANYOF_LOCALE)
3332 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3333 else {
3334 for (value = 0; value < 256; value++)
3335 if (isALNUM(value))
3336 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3337 }
8aa23a47
YO
3338 }
3339 break;
3340 case ALNUML:
3341 if (flags & SCF_DO_STCLASS_AND) {
3342 if (data->start_class->flags & ANYOF_LOCALE)
3343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3344 }
3345 else {
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3347 data->start_class->flags |= ANYOF_LOCALE;
3348 }
3349 break;
3350 case NALNUM:
3351 if (flags & SCF_DO_STCLASS_AND) {
3352 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3354 for (value = 0; value < 256; value++)
3355 if (isALNUM(value))
3356 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3357 }
3358 }
8aa23a47
YO
3359 else {
3360 if (data->start_class->flags & ANYOF_LOCALE)
3361 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3362 else {
3363 for (value = 0; value < 256; value++)
3364 if (!isALNUM(value))
3365 ANYOF_BITMAP_SET(data->start_class, value);
3366 }
653099ff 3367 }
8aa23a47
YO
3368 break;
3369 case NALNUML:
3370 if (flags & SCF_DO_STCLASS_AND) {
3371 if (data->start_class->flags & ANYOF_LOCALE)
3372 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3373 }
8aa23a47
YO
3374 else {
3375 data->start_class->flags |= ANYOF_LOCALE;
3376 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3377 }
3378 break;
3379 case SPACE:
3380 if (flags & SCF_DO_STCLASS_AND) {
3381 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3383 for (value = 0; value < 256; value++)
3384 if (!isSPACE(value))
3385 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3386 }
3387 }
8aa23a47
YO
3388 else {
3389 if (data->start_class->flags & ANYOF_LOCALE)
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3391 else {
3392 for (value = 0; value < 256; value++)
3393 if (isSPACE(value))
3394 ANYOF_BITMAP_SET(data->start_class, value);
3395 }
653099ff 3396 }
8aa23a47
YO
3397 break;
3398 case SPACEL:
3399 if (flags & SCF_DO_STCLASS_AND) {
3400 if (data->start_class->flags & ANYOF_LOCALE)
3401 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3402 }
3403 else {
3404 data->start_class->flags |= ANYOF_LOCALE;
3405 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3406 }
3407 break;
3408 case NSPACE:
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3412 for (value = 0; value < 256; value++)
3413 if (isSPACE(value))
3414 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3415 }
8aa23a47
YO
3416 }
3417 else {
3418 if (data->start_class->flags & ANYOF_LOCALE)
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3420 else {
3421 for (value = 0; value < 256; value++)
3422 if (!isSPACE(value))
3423 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3424 }
3425 }
8aa23a47
YO
3426 break;
3427 case NSPACEL:
3428 if (flags & SCF_DO_STCLASS_AND) {
3429 if (data->start_class->flags & ANYOF_LOCALE) {
3430 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3431 for (value = 0; value < 256; value++)
3432 if (!isSPACE(value))
3433 ANYOF_BITMAP_CLEAR(data->start_class, value);
3434 }
653099ff 3435 }
8aa23a47
YO
3436 else {
3437 data->start_class->flags |= ANYOF_LOCALE;
3438 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3439 }
3440 break;
3441 case DIGIT:
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3444 for (value = 0; value < 256; value++)
3445 if (!isDIGIT(value))
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3447 }
3448 else {
3449 if (data->start_class->flags & ANYOF_LOCALE)
3450 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3451 else {
3452 for (value = 0; value < 256; value++)
3453 if (isDIGIT(value))
3454 ANYOF_BITMAP_SET(data->start_class, value);
3455 }
3456 }
3457 break;
3458 case NDIGIT:
3459 if (flags & SCF_DO_STCLASS_AND) {
3460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3461 for (value = 0; value < 256; value++)
3462 if (isDIGIT(value))
3463 ANYOF_BITMAP_CLEAR(data->start_class, value);
3464 }
3465 else {
3466 if (data->start_class->flags & ANYOF_LOCALE)
3467 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3468 else {
3469 for (value = 0; value < 256; value++)
3470 if (!isDIGIT(value))
3471 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3472 }
3473 }
8aa23a47
YO
3474 break;
3475 }
3476 if (flags & SCF_DO_STCLASS_OR)
3477 cl_and(data->start_class, and_withp);
3478 flags &= ~SCF_DO_STCLASS;
3479 }
3480 }
3481 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3482 data->flags |= (OP(scan) == MEOL
3483 ? SF_BEFORE_MEOL
3484 : SF_BEFORE_SEOL);
3485 }
3486 else if ( PL_regkind[OP(scan)] == BRANCHJ
3487 /* Lookbehind, or need to calculate parens/evals/stclass: */
3488 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3489 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3490 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3491 || OP(scan) == UNLESSM )
3492 {
3493 /* Negative Lookahead/lookbehind
3494 In this case we can't do fixed string optimisation.
3495 */
1de06328 3496
8aa23a47
YO
3497 I32 deltanext, minnext, fake = 0;
3498 regnode *nscan;
3499 struct regnode_charclass_class intrnl;
3500 int f = 0;
1de06328 3501
8aa23a47
YO
3502 data_fake.flags = 0;
3503 if (data) {
3504 data_fake.whilem_c = data->whilem_c;
3505 data_fake.last_closep = data->last_closep;
c277df42 3506 }
8aa23a47
YO
3507 else
3508 data_fake.last_closep = &fake;
58e23c8d 3509 data_fake.pos_delta = delta;
8aa23a47
YO
3510 if ( flags & SCF_DO_STCLASS && !scan->flags
3511 && OP(scan) == IFMATCH ) { /* Lookahead */
3512 cl_init(pRExC_state, &intrnl);
3513 data_fake.start_class = &intrnl;
3514 f |= SCF_DO_STCLASS_AND;
3515 }
3516 if (flags & SCF_WHILEM_VISITED_POS)
3517 f |= SCF_WHILEM_VISITED_POS;
3518 next = regnext(scan);
3519 nscan = NEXTOPER(NEXTOPER(scan));
3520 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3521 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3522 if (scan->flags) {
3523 if (deltanext) {
58e23c8d 3524 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3525 }
3526 else if (minnext > (I32)U8_MAX) {
58e23c8d 3527 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3528 }
3529 scan->flags = (U8)minnext;
3530 }
3531 if (data) {
3532 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3533 pars++;
3534 if (data_fake.flags & SF_HAS_EVAL)
3535 data->flags |= SF_HAS_EVAL;
3536 data->whilem_c = data_fake.whilem_c;
3537 }
3538 if (f & SCF_DO_STCLASS_AND) {
3539 const int was = (data->start_class->flags & ANYOF_EOS);
3540
3541 cl_and(data->start_class, &intrnl);
3542 if (was)
3543 data->start_class->flags |= ANYOF_EOS;
3544 }
cb434fcc 3545 }
8aa23a47
YO
3546#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3547 else {
3548 /* Positive Lookahead/lookbehind
3549 In this case we can do fixed string optimisation,
3550 but we must be careful about it. Note in the case of
3551 lookbehind the positions will be offset by the minimum
3552 length of the pattern, something we won't know about
3553 until after the recurse.
3554 */
3555 I32 deltanext, fake = 0;
3556 regnode *nscan;
3557 struct regnode_charclass_class intrnl;
3558 int f = 0;
3559 /* We use SAVEFREEPV so that when the full compile
3560 is finished perl will clean up the allocated
3561 minlens when its all done. This was we don't
3562 have to worry about freeing them when we know
3563 they wont be used, which would be a pain.
3564 */
3565 I32 *minnextp;
3566 Newx( minnextp, 1, I32 );
3567 SAVEFREEPV(minnextp);
3568
3569 if (data) {
3570 StructCopy(data, &data_fake, scan_data_t);
3571 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3572 f |= SCF_DO_SUBSTR;
3573 if (scan->flags)
3574 scan_commit(pRExC_state, &data_fake,minlenp);
3575 data_fake.last_found=newSVsv(data->last_found);
3576 }
3577 }
3578 else
3579 data_fake.last_closep = &fake;
3580 data_fake.flags = 0;
58e23c8d 3581 data_fake.pos_delta = delta;
8aa23a47
YO
3582 if (is_inf)
3583 data_fake.flags |= SF_IS_INF;
3584 if ( flags & SCF_DO_STCLASS && !scan->flags
3585 && OP(scan) == IFMATCH ) { /* Lookahead */
3586 cl_init(pRExC_state, &intrnl);
3587 data_fake.start_class = &intrnl;
3588 f |= SCF_DO_STCLASS_AND;
3589 }
3590 if (flags & SCF_WHILEM_VISITED_POS)
3591 f |= SCF_WHILEM_VISITED_POS;
3592 next = regnext(scan);
3593 nscan = NEXTOPER(NEXTOPER(scan));
3594
3595 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3596 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3597 if (scan->flags) {
3598 if (deltanext) {
58e23c8d 3599 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3600 }
3601 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3602 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3603 }
3604 scan->flags = (U8)*minnextp;
3605 }
3606
3607 *minnextp += min;
3608
3609 if (f & SCF_DO_STCLASS_AND) {
3610 const int was = (data->start_class->flags & ANYOF_EOS);
3611
3612 cl_and(data->start_class, &intrnl);
3613 if (was)
3614 data->start_class->flags |= ANYOF_EOS;
3615 }
3616 if (data) {
3617 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3618 pars++;
3619 if (data_fake.flags & SF_HAS_EVAL)
3620 data->flags |= SF_HAS_EVAL;
3621 data->whilem_c = data_fake.whilem_c;
3622 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3623 if (RExC_rx->minlen<*minnextp)
3624 RExC_rx->minlen=*minnextp;
3625 scan_commit(pRExC_state, &data_fake, minnextp);
3626 SvREFCNT_dec(data_fake.last_found);
3627
3628 if ( data_fake.minlen_fixed != minlenp )
3629 {
3630 data->offset_fixed= data_fake.offset_fixed;
3631 data->minlen_fixed= data_fake.minlen_fixed;
3632 data->lookbehind_fixed+= scan->flags;
3633 }
3634 if ( data_fake.minlen_float != minlenp )
3635 {
3636 data->minlen_float= data_fake.minlen_float;
3637 data->offset_float_min=data_fake.offset_float_min;
3638 data->offset_float_max=data_fake.offset_float_max;
3639 data->lookbehind_float+= scan->flags;
3640 }
3641 }
3642 }
3643
3644
40d049e4 3645 }
8aa23a47
YO
3646#endif
3647 }
3648 else if (OP(scan) == OPEN) {
3649 if (stopparen != (I32)ARG(scan))
3650 pars++;
3651 }
3652 else if (OP(scan) == CLOSE) {
3653 if (stopparen == (I32)ARG(scan)) {
3654 break;
3655 }
3656 if ((I32)ARG(scan) == is_par) {
3657 next = regnext(scan);
b515a41d 3658
8aa23a47
YO
3659 if ( next && (OP(next) != WHILEM) && next < last)
3660 is_par = 0; /* Disable optimization */
40d049e4 3661 }
8aa23a47
YO
3662 if (data)
3663 *(data->last_closep) = ARG(scan);
3664 }
3665 else if (OP(scan) == EVAL) {
c277df42
IZ
3666 if (data)
3667 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3668 }
3669 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3670 if (flags & SCF_DO_SUBSTR) {
3671 scan_commit(pRExC_state,data,minlenp);
3672 flags &= ~SCF_DO_SUBSTR;
40d049e4 3673 }
8aa23a47
YO
3674 if (data && OP(scan)==ACCEPT) {
3675 data->flags |= SCF_SEEN_ACCEPT;
3676 if (stopmin > min)
3677 stopmin = min;
e2e6a0f1 3678 }
8aa23a47
YO
3679 }
3680 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3681 {
0f5d15d6 3682 if (flags & SCF_DO_SUBSTR) {
1de06328 3683 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3684 data->longest = &(data->longest_float);
3685 }
3686 is_inf = is_inf_internal = 1;
653099ff 3687 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3688 cl_anything(pRExC_state, data->start_class);
96776eda 3689 flags &= ~SCF_DO_STCLASS;
8aa23a47 3690 }
58e23c8d 3691 else if (OP(scan) == GPOS) {
bbe252da 3692 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3693 !(delta || is_inf || (data && data->pos_delta)))
3694 {
bbe252da
YO
3695 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3696 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3697 if (RExC_rx->gofs < (U32)min)
3698 RExC_rx->gofs = min;
3699 } else {
bbe252da 3700 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3701 RExC_rx->gofs = 0;
3702 }
3703 }
786e8c11 3704#ifdef TRIE_STUDY_OPT
40d049e4 3705#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3706 else if (PL_regkind[OP(scan)] == TRIE) {
3707 /* NOTE - There is similar code to this block above for handling
3708 BRANCH nodes on the initial study. If you change stuff here
3709 check there too. */
3710 regnode *trie_node= scan;
3711 regnode *tail= regnext(scan);
f8fc2ecf 3712 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3713 I32 max1 = 0, min1 = I32_MAX;
3714 struct regnode_charclass_class accum;
3715
3716 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3717 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3718 if (flags & SCF_DO_STCLASS)
3719 cl_init_zero(pRExC_state, &accum);
3720
3721 if (!trie->jump) {
3722 min1= trie->minlen;
3723 max1= trie->maxlen;
3724 } else {
3725 const regnode *nextbranch= NULL;
3726 U32 word;
3727
3728 for ( word=1 ; word <= trie->wordcount ; word++)
3729 {
3730 I32 deltanext=0, minnext=0, f = 0, fake;
3731 struct regnode_charclass_class this_class;
3732
3733 data_fake.flags = 0;
3734 if (data) {
3735 data_fake.whilem_c = data->whilem_c;
3736 data_fake.last_closep = data->last_closep;
3737 }
3738 else
3739 data_fake.last_closep = &fake;
58e23c8d 3740 data_fake.pos_delta = delta;
8aa23a47
YO
3741 if (flags & SCF_DO_STCLASS) {
3742 cl_init(pRExC_state, &this_class);
3743 data_fake.start_class = &this_class;
3744 f = SCF_DO_STCLASS_AND;
3745 }
3746 if (flags & SCF_WHILEM_VISITED_POS)
3747 f |= SCF_WHILEM_VISITED_POS;
3748
3749 if (trie->jump[word]) {
3750 if (!nextbranch)
3751 nextbranch = trie_node + trie->jump[0];
3752 scan= trie_node + trie->jump[word];
3753 /* We go from the jump point to the branch that follows
3754 it. Note this means we need the vestigal unused branches
3755 even though they arent otherwise used.
3756 */
3757 minnext = study_chunk(pRExC_state, &scan, minlenp,
3758 &deltanext, (regnode *)nextbranch, &data_fake,
3759 stopparen, recursed, NULL, f,depth+1);
3760 }
3761 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3762 nextbranch= regnext((regnode*)nextbranch);
3763
3764 if (min1 > (I32)(minnext + trie->minlen))
3765 min1 = minnext + trie->minlen;
3766 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3767 max1 = minnext + deltanext + trie->maxlen;
3768 if (deltanext == I32_MAX)
3769 is_inf = is_inf_internal = 1;
3770
3771 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3772 pars++;
3773 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3774 if ( stopmin > min + min1)
3775 stopmin = min + min1;
3776 flags &= ~SCF_DO_SUBSTR;
3777 if (data)
3778 data->flags |= SCF_SEEN_ACCEPT;
3779 }
3780 if (data) {
3781 if (data_fake.flags & SF_HAS_EVAL)
3782 data->flags |= SF_HAS_EVAL;
3783 data->whilem_c = data_fake.whilem_c;
3784 }
3785 if (flags & SCF_DO_STCLASS)
3786 cl_or(pRExC_state, &accum, &this_class);
3787 }
3788 }
3789 if (flags & SCF_DO_SUBSTR) {
3790 data->pos_min += min1;
3791 data->pos_delta += max1 - min1;
3792 if (max1 != min1 || is_inf)
3793 data->longest = &(data->longest_float);
3794 }
3795 min += min1;
3796 delta += max1 - min1;
3797 if (flags & SCF_DO_STCLASS_OR) {
3798 cl_or(pRExC_state, data->start_class, &accum);
3799 if (min1) {
3800 cl_and(data->start_class, and_withp);
3801 flags &= ~SCF_DO_STCLASS;
3802 }
3803 }
3804 else if (flags & SCF_DO_STCLASS_AND) {
3805 if (min1) {
3806 cl_and(data->start_class, &accum);
3807 flags &= ~SCF_DO_STCLASS;
3808 }
3809 else {
3810 /* Switch to OR mode: cache the old value of
3811 * data->start_class */
3812 INIT_AND_WITHP;
3813 StructCopy(data->start_class, and_withp,
3814 struct regnode_charclass_class);
3815 flags &= ~SCF_DO_STCLASS_AND;
3816 StructCopy(&accum, data->start_class,
3817 struct regnode_charclass_class);
3818 flags |= SCF_DO_STCLASS_OR;
3819 data->start_class->flags |= ANYOF_EOS;
3820 }
3821 }
3822 scan= tail;
3823 continue;
3824 }
786e8c11 3825#else
8aa23a47 3826 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3827 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3828 U8*bang=NULL;
3829
3830 min += trie->minlen;
3831 delta += (trie->maxlen - trie->minlen);
3832 flags &= ~SCF_DO_STCLASS; /* xxx */
3833 if (flags & SCF_DO_SUBSTR) {
3834 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3835 data->pos_min += trie->minlen;
3836 data->pos_delta += (trie->maxlen - trie->minlen);
3837 if (trie->maxlen != trie->minlen)
3838 data->longest = &(data->longest_float);
3839 }
3840 if (trie->jump) /* no more substrings -- for now /grr*/
3841 flags &= ~SCF_DO_SUBSTR;
b515a41d 3842 }
8aa23a47
YO
3843#endif /* old or new */
3844#endif /* TRIE_STUDY_OPT */
3845 /* Else: zero-length, ignore. */
3846 scan = regnext(scan);
3847 }
3848 if (frame) {
3849 last = frame->last;
3850 scan = frame->next;
3851 stopparen = frame->stop;
3852 frame = frame->prev;
3853 goto fake_study_recurse;
c277df42
IZ
3854 }
3855
3856 finish:
8aa23a47
YO
3857 assert(!frame);
3858
c277df42 3859 *scanp = scan;
aca2d497 3860 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3861 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3862 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3863 if (is_par > (I32)U8_MAX)
c277df42
IZ
3864 is_par = 0;
3865 if (is_par && pars==1 && data) {
3866 data->flags |= SF_IN_PAR;
3867 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3868 }
3869 else if (pars && data) {
c277df42
IZ
3870 data->flags |= SF_HAS_PAR;
3871 data->flags &= ~SF_IN_PAR;
3872 }
653099ff 3873 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3874 cl_and(data->start_class, and_withp);
786e8c11
YO
3875 if (flags & SCF_TRIE_RESTUDY)
3876 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3877
3878 DEBUG_STUDYDATA(data,depth);
3879
e2e6a0f1 3880 return min < stopmin ? min : stopmin;
c277df42
IZ
3881}
3882
2eccd3b2
NC
3883STATIC U32
3884S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3885{
4a4e7719
NC
3886 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3887
3888 Renewc(RExC_rxi->data,
3889 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3890 char, struct reg_data);
3891 if(count)
f8fc2ecf 3892 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3893 else
f8fc2ecf 3894 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3895 RExC_rxi->data->count = count + n;
3896 Copy(s, RExC_rxi->data->what + count, n, U8);
3897 return count;
c277df42
IZ
3898}
3899
76234dfb 3900#ifndef PERL_IN_XSUB_RE
d88dccdf 3901void
864dbfa3 3902Perl_reginitcolors(pTHX)
d88dccdf 3903{
97aff369 3904 dVAR;
1df70142 3905 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3906 if (s) {
1df70142
AL
3907 char *t = savepv(s);
3908 int i = 0;
3909 PL_colors[0] = t;
d88dccdf 3910 while (++i < 6) {
1df70142
AL
3911 t = strchr(t, '\t');
3912 if (t) {
3913 *t = '\0';
3914 PL_colors[i] = ++t;
d88dccdf
IZ
3915 }
3916 else
1df70142 3917 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3918 }
3919 } else {
1df70142 3920 int i = 0;
b81d288d 3921 while (i < 6)
06b5626a 3922 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3923 }
3924 PL_colorset = 1;
3925}
76234dfb 3926#endif
8615cb43 3927
07be1b83 3928
786e8c11
YO
3929#ifdef TRIE_STUDY_OPT
3930#define CHECK_RESTUDY_GOTO \
3931 if ( \
3932 (data.flags & SCF_TRIE_RESTUDY) \
3933 && ! restudied++ \
3934 ) goto reStudy
3935#else
3936#define CHECK_RESTUDY_GOTO
3937#endif
f9f4320a 3938
a687059c 3939/*
e50aee73 3940 - pregcomp - compile a regular expression into internal code
a687059c
LW
3941 *
3942 * We can't allocate space until we know how big the compiled form will be,
3943 * but we can't compile it (and thus know how big it is) until we've got a
3944 * place to put the code. So we cheat: we compile it twice, once with code
3945 * generation turned off and size counting turned on, and once "for real".
3946 * This also means that we don't allocate space until we are sure that the
3947 * thing really will compile successfully, and we never have to move the
3948 * code and thus invalidate pointers into it. (Note that it has to be in
3949 * one piece because free() must be able to free it all.) [NB: not true in perl]
3950 *
3951 * Beware that the optimization-preparation code in here knows about some
3952 * of the structure of the compiled regexp. [I'll say.]
3953 */
b9b4dddf
YO
3954
3955
3956
f9f4320a 3957#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3958#define RE_ENGINE_PTR &PL_core_reg_engine
3959#else
f9f4320a
YO
3960extern const struct regexp_engine my_reg_engine;
3961#define RE_ENGINE_PTR &my_reg_engine
3962#endif
b9b4dddf
YO
3963/* these make a few things look better, to avoid indentation */
3964#define BEGIN_BLOCK {
f9f4320a
YO
3965#define END_BLOCK }
3966
a687059c 3967regexp *
864dbfa3 3968Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3969{
97aff369 3970 dVAR;
f9f4320a
YO
3971 GET_RE_DEBUG_FLAGS_DECL;
3972 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3973#ifndef PERL_IN_XSUB_RE
3974 BEGIN_BLOCK
f9f4320a
YO
3975 /* Dispatch a request to compile a regexp to correct
3976 regexp engine. */
3977 HV * const table = GvHV(PL_hintgv);
3978 if (table) {
3979 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3980 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3981 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3982 DEBUG_COMPILE_r({
8d8756e7 3983 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3984 SvIV(*ptr));
3985 });
f2f78491 3986 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3987 }
b9b4dddf
YO
3988 }
3989 END_BLOCK
3990#endif
3991 BEGIN_BLOCK
a0d0e21e 3992 register regexp *r;
f8fc2ecf 3993 register regexp_internal *ri;
c277df42 3994 regnode *scan;
c277df42 3995 regnode *first;
a0d0e21e 3996 I32 flags;
a0d0e21e
LW
3997 I32 minlen = 0;
3998 I32 sawplus = 0;
3999 I32 sawopen = 0;
2c2d71f5 4000 scan_data_t data;
830247a4 4001 RExC_state_t RExC_state;
be8e71aa 4002 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4003#ifdef TRIE_STUDY_OPT
4004 int restudied= 0;
4005 RExC_state_t copyRExC_state;
4006#endif
a0d0e21e 4007 if (exp == NULL)
c277df42 4008 FAIL("NULL regexp argument");
a0d0e21e 4009
a5961de5 4010 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4011
5cfc7842 4012 RExC_precomp = exp;
a3621e74 4013 DEBUG_COMPILE_r({
ab3bbdeb
YO
4014 SV *dsv= sv_newmortal();
4015 RE_PV_QUOTED_DECL(s, RExC_utf8,
4016 dsv, RExC_precomp, (xend - exp), 60);
4017 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4018 PL_colors[4],PL_colors[5],s);
a5961de5 4019 });
e2509266 4020 RExC_flags = pm->op_pmflags;
830247a4 4021 RExC_sawback = 0;
bbce6d69 4022
830247a4
IZ
4023 RExC_seen = 0;
4024 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4025 RExC_seen_evals = 0;
4026 RExC_extralen = 0;
c277df42 4027
bbce6d69 4028 /* First pass: determine size, legality. */
830247a4 4029 RExC_parse = exp;
fac92740 4030 RExC_start = exp;
830247a4
IZ
4031 RExC_end = xend;
4032 RExC_naughty = 0;
4033 RExC_npar = 1;
c74340f9 4034 RExC_cpar = 1;
e2e6a0f1 4035 RExC_nestroot = 0;
830247a4
IZ
4036 RExC_size = 0L;
4037 RExC_emit = &PL_regdummy;
4038 RExC_whilem_seen = 0;
fc8cd66c 4039 RExC_charnames = NULL;
40d049e4
YO
4040 RExC_open_parens = NULL;
4041 RExC_close_parens = NULL;
4042 RExC_opend = NULL;
81714fb9 4043 RExC_paren_names = NULL;
40d049e4
YO
4044 RExC_recurse = NULL;
4045 RExC_recurse_count = 0;
81714fb9 4046
85ddcde9
JH
4047#if 0 /* REGC() is (currently) a NOP at the first pass.
4048 * Clever compilers notice this and complain. --jhi */
830247a4 4049 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4050#endif
3dab1dad
YO
4051 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4052 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4053 RExC_precomp = NULL;
a0d0e21e
LW
4054 return(NULL);
4055 }
07be1b83 4056 DEBUG_PARSE_r({
81714fb9
YO
4057 PerlIO_printf(Perl_debug_log,
4058 "Required size %"IVdf" nodes\n"
4059 "Starting second pass (creation)\n",
4060 (IV)RExC_size);
07be1b83
YO
4061 RExC_lastnum=0;
4062 RExC_lastparse=NULL;
4063 });
c277df42
IZ
4064 /* Small enough for pointer-storage convention?
4065 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4066 if (RExC_size >= 0x10000L && RExC_extralen)
4067 RExC_size += RExC_extralen;
c277df42 4068 else
830247a4
IZ
4069 RExC_extralen = 0;
4070 if (RExC_whilem_seen > 15)
4071 RExC_whilem_seen = 15;
a0d0e21e 4072
e2e6a0f1
YO
4073#ifdef DEBUGGING
4074 /* Make room for a sentinel value at the end of the program */
4075 RExC_size++;
4076#endif
4077
f9f4320a
YO
4078 /* Allocate space and zero-initialize. Note, the two step process
4079 of zeroing when in debug mode, thus anything assigned has to
4080 happen after that */
f8fc2ecf
YO
4081 Newxz(r, 1, regexp);
4082 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4083 char, regexp_internal);
4084 if ( r == NULL || ri == NULL )
b45f050a 4085 FAIL("Regexp out of space");
0f79a09d
GS
4086#ifdef DEBUGGING
4087 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4088 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4089#else
f8fc2ecf
YO
4090 /* bulk initialize base fields with 0. */
4091 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4092#endif
58e23c8d
YO
4093
4094 /* non-zero initialization begins here */
f8fc2ecf 4095 RXi_SET( r, ri );
f9f4320a 4096 r->engine= RE_ENGINE_PTR;
c277df42 4097 r->refcnt = 1;
bbce6d69 4098 r->prelen = xend - exp;
5cfc7842 4099 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4100 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4101 r->intflags = 0;
830247a4 4102 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4103
6bda09f9 4104 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4105 Newxz(RExC_open_parens, RExC_npar,regnode *);
4106 SAVEFREEPV(RExC_open_parens);
4107 Newxz(RExC_close_parens,RExC_npar,regnode *);
4108 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4109 }
4110
4111 /* Useful during FAIL. */
f8fc2ecf
YO
4112 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4113 if (ri->offsets) {
4114 ri->offsets[0] = RExC_size;
fac92740 4115 }
a3621e74 4116 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4117 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4118 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4119 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4120
830247a4 4121 RExC_rx = r;
f8fc2ecf 4122 RExC_rxi = ri;
bbce6d69 4123
4124 /* Second pass: emit code. */
e2509266 4125 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4126 RExC_parse = exp;
4127 RExC_end = xend;
4128 RExC_naughty = 0;
4129 RExC_npar = 1;
c74340f9 4130 RExC_cpar = 1;
f8fc2ecf
YO
4131 RExC_emit_start = ri->program;
4132 RExC_emit = ri->program;
e2e6a0f1
YO
4133#ifdef DEBUGGING
4134 /* put a sentinal on the end of the program so we can check for
4135 overwrites */
f8fc2ecf 4136 ri->program[RExC_size].type = 255;
e2e6a0f1 4137#endif
2cd61cdb 4138 /* Store the count of eval-groups for security checks: */
786e8c11 4139 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4140 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4141 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4142 return(NULL);
6bda09f9 4143
07be1b83
YO
4144 /* XXXX To minimize changes to RE engine we always allocate
4145 3-units-long substrs field. */
4146 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4147 if (RExC_recurse_count) {
4148 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4149 SAVEFREEPV(RExC_recurse);
4150 }
a0d0e21e 4151
07be1b83 4152reStudy:
1de06328 4153 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4154 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4155
07be1b83
YO
4156#ifdef TRIE_STUDY_OPT
4157 if ( restudied ) {
5d458dd8 4158 U32 seen=RExC_seen;
07be1b83 4159 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4160
4161 RExC_state = copyRExC_state;
4162 if (seen & REG_TOP_LEVEL_BRANCHES)
4163 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4164 else
4165 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4166 if (data.last_found) {
07be1b83 4167 SvREFCNT_dec(data.longest_fixed);
07be1b83 4168 SvREFCNT_dec(data.longest_float);
07be1b83 4169 SvREFCNT_dec(data.last_found);
1de06328 4170 }
40d049e4 4171 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4172 } else {
40d049e4 4173 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4174 copyRExC_state = RExC_state;
07be1b83 4175 }
40d049e4
YO
4176#else
4177 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4178#endif
fc8cd66c 4179
a0d0e21e 4180 /* Dig out information for optimizations. */
bbe252da 4181 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4182 pm->op_pmflags = RExC_flags;
a0ed51b3 4183 if (UTF)
bbe252da 4184 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4185 ri->regstclass = NULL;
830247a4 4186 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4187 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4188 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4189
1de06328
YO
4190 /* testing for BRANCH here tells us whether there is "must appear"
4191 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4192 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4193 I32 fake;
c5254dd6 4194 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4195 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4196 int stclass_flag;
07be1b83 4197 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4198
4199 first = scan;
c277df42 4200 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4201 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4202 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4203 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4204 /* for now we can't handle lookbehind IFMATCH*/
4205 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4206 (OP(first) == PLUS) ||
4207 (OP(first) == MINMOD) ||
653099ff 4208 /* An {n,m} with n>0 */
07be1b83
YO
4209 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4210 {
786e8c11 4211
a0d0e21e
LW
4212 if (OP(first) == PLUS)
4213 sawplus = 1;
4214 else
3dab1dad 4215 first += regarglen[OP(first)];
07be1b83
YO
4216 if (OP(first) == IFMATCH) {
4217 first = NEXTOPER(first);
4218 first += EXTRA_STEP_2ARGS;
7c167cea 4219 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4220 first = NEXTOPER(first);
a687059c
LW
4221 }
4222
a0d0e21e
LW
4223 /* Starting-point info. */
4224 again:
786e8c11 4225 DEBUG_PEEP("first:",first,0);
07be1b83 4226 /* Ignore EXACT as we deal with it later. */
3dab1dad 4227 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4228 if (OP(first) == EXACT)
6f207bd3 4229 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4230 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4231 ri->regstclass = first;
b3c9acc1 4232 }
07be1b83 4233#ifdef TRIE_STCLASS
786e8c11 4234 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4235 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4236 {
786e8c11 4237 regnode *trie_op;
07be1b83 4238 /* this can happen only on restudy */
786e8c11 4239 if ( OP(first) == TRIE ) {
446bd890
NC
4240 struct regnode_1 *trieop =
4241 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4242 StructCopy(first,trieop,struct regnode_1);
4243 trie_op=(regnode *)trieop;
4244 } else {
446bd890
NC
4245 struct regnode_charclass *trieop =
4246 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4247 StructCopy(first,trieop,struct regnode_charclass);
4248 trie_op=(regnode *)trieop;
4249 }
1de06328 4250 OP(trie_op)+=2;
786e8c11 4251 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4252 ri->regstclass = trie_op;
07be1b83
YO
4253 }
4254#endif
bfed75c6 4255 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4256 ri->regstclass = first;
3dab1dad
YO
4257 else if (PL_regkind[OP(first)] == BOUND ||
4258 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4259 ri->regstclass = first;
3dab1dad 4260 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4261 r->extflags |= (OP(first) == MBOL
4262 ? RXf_ANCH_MBOL
cad2e5aa 4263 : (OP(first) == SBOL
bbe252da
YO
4264 ? RXf_ANCH_SBOL
4265 : RXf_ANCH_BOL));
a0d0e21e 4266 first = NEXTOPER(first);
774d564b 4267 goto again;
4268 }
4269 else if (OP(first) == GPOS) {
bbe252da 4270 r->extflags |= RXf_ANCH_GPOS;
774d564b 4271 first = NEXTOPER(first);
4272 goto again;
a0d0e21e 4273 }
cf2a2b69
YO
4274 else if ((!sawopen || !RExC_sawback) &&
4275 (OP(first) == STAR &&
3dab1dad 4276 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4277 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4278 {
4279 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4280 const int type =
4281 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4282 ? RXf_ANCH_MBOL
4283 : RXf_ANCH_SBOL;
4284 r->extflags |= type;
4285 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4286 first = NEXTOPER(first);
774d564b 4287 goto again;
a0d0e21e 4288 }
b81d288d 4289 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4290 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4291 /* x+ must match at the 1st pos of run of x's */
bbe252da 4292 r->intflags |= PREGf_SKIP;
a0d0e21e 4293
c277df42 4294 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4295#ifdef TRIE_STUDY_OPT
81714fb9 4296 DEBUG_PARSE_r(
be8e71aa
YO
4297 if (!restudied)
4298 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4299 (IV)(first - scan + 1))
4300 );
4301#else
81714fb9 4302 DEBUG_PARSE_r(
be8e71aa
YO
4303 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4304 (IV)(first - scan + 1))
4305 );
4306#endif
4307
4308
a0d0e21e
LW
4309 /*
4310 * If there's something expensive in the r.e., find the
4311 * longest literal string that must appear and make it the
4312 * regmust. Resolve ties in favor of later strings, since
4313 * the regstart check works with the beginning of the r.e.
4314 * and avoiding duplication strengthens checking. Not a
4315 * strong reason, but sufficient in the absence of others.
4316 * [Now we resolve ties in favor of the earlier string if
c277df42 4317 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4318 * earlier string may buy us something the later one won't.]
4319 */
de8c5301 4320
396482e1
GA
4321 data.longest_fixed = newSVpvs("");
4322 data.longest_float = newSVpvs("");
4323 data.last_found = newSVpvs("");
c277df42
IZ
4324 data.longest = &(data.longest_fixed);
4325 first = scan;
f8fc2ecf 4326 if (!ri->regstclass) {
830247a4 4327 cl_init(pRExC_state, &ch_class);
653099ff
GS
4328 data.start_class = &ch_class;
4329 stclass_flag = SCF_DO_STCLASS_AND;
4330 } else /* XXXX Check for BOUND? */
4331 stclass_flag = 0;
cb434fcc 4332 data.last_closep = &last_close;
de8c5301 4333
1de06328 4334 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4335 &data, -1, NULL, NULL,
4336 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4337
07be1b83 4338
786e8c11
YO
4339 CHECK_RESTUDY_GOTO;
4340
4341
830247a4 4342 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4343 && data.last_start_min == 0 && data.last_end > 0
830247a4 4344 && !RExC_seen_zerolen
bbe252da
YO
4345 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4346 r->extflags |= RXf_CHECK_ALL;
1de06328 4347 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4348 SvREFCNT_dec(data.last_found);
4349
1de06328
YO
4350 /* Note that code very similar to this but for anchored string
4351 follows immediately below, changes may need to be made to both.
4352 Be careful.
4353 */
a0ed51b3 4354 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4355 if (longest_float_length
c277df42
IZ
4356 || (data.flags & SF_FL_BEFORE_EOL
4357 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4358 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4359 {
1182767e 4360 I32 t,ml;
cf93c79d 4361
1de06328 4362 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4363 && data.offset_fixed == data.offset_float_min
4364 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4365 goto remove_float; /* As in (a)+. */
4366
1de06328
YO
4367 /* copy the information about the longest float from the reg_scan_data
4368 over to the program. */
33b8afdf
JH
4369 if (SvUTF8(data.longest_float)) {
4370 r->float_utf8 = data.longest_float;
c445ea15 4371 r->float_substr = NULL;
33b8afdf
JH
4372 } else {
4373 r->float_substr = data.longest_float;
c445ea15 4374 r->float_utf8 = NULL;
33b8afdf 4375 }
1de06328
YO
4376 /* float_end_shift is how many chars that must be matched that
4377 follow this item. We calculate it ahead of time as once the
4378 lookbehind offset is added in we lose the ability to correctly
4379 calculate it.*/
4380 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4381 : (I32)longest_float_length;
1de06328
YO
4382 r->float_end_shift = ml - data.offset_float_min
4383 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4384 + data.lookbehind_float;
4385 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4386 r->float_max_offset = data.offset_float_max;
1182767e 4387 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4388 r->float_max_offset -= data.lookbehind_float;
4389
cf93c79d
IZ
4390 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4391 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4392 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4393 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4394 }
4395 else {
aca2d497 4396 remove_float:
c445ea15 4397 r->float_substr = r->float_utf8 = NULL;
c277df42 4398 SvREFCNT_dec(data.longest_float);
c5254dd6 4399 longest_float_length = 0;
a0d0e21e 4400 }
c277df42 4401
1de06328
YO
4402 /* Note that code very similar to this but for floating string
4403 is immediately above, changes may need to be made to both.
4404 Be careful.
4405 */
a0ed51b3 4406 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4407 if (longest_fixed_length
c277df42
IZ
4408 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4409 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4410 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4411 {
1182767e 4412 I32 t,ml;
cf93c79d 4413
1de06328
YO
4414 /* copy the information about the longest fixed
4415 from the reg_scan_data over to the program. */
33b8afdf
JH
4416 if (SvUTF8(data.longest_fixed)) {
4417 r->anchored_utf8 = data.longest_fixed;
c445ea15 4418 r->anchored_substr = NULL;
33b8afdf
JH
4419 } else {
4420 r->anchored_substr = data.longest_fixed;
c445ea15 4421 r->anchored_utf8 = NULL;
33b8afdf 4422 }
1de06328
YO
4423 /* fixed_end_shift is how many chars that must be matched that
4424 follow this item. We calculate it ahead of time as once the
4425 lookbehind offset is added in we lose the ability to correctly
4426 calculate it.*/
4427 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4428 : (I32)longest_fixed_length;
1de06328
YO
4429 r->anchored_end_shift = ml - data.offset_fixed
4430 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4431 + data.lookbehind_fixed;
4432 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4433
cf93c79d
IZ
4434 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4435 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4436 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4437 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4438 }
4439 else {
c445ea15 4440 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4441 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4442 longest_fixed_length = 0;
a0d0e21e 4443 }
f8fc2ecf
YO
4444 if (ri->regstclass
4445 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4446 ri->regstclass = NULL;
33b8afdf
JH
4447 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4448 && stclass_flag
653099ff 4449 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4450 && !cl_is_anything(data.start_class))
4451 {
2eccd3b2 4452 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4453
f8fc2ecf 4454 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4455 struct regnode_charclass_class);
4456 StructCopy(data.start_class,
f8fc2ecf 4457 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4458 struct regnode_charclass_class);
f8fc2ecf 4459 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4460 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4461 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4462 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4463 PerlIO_printf(Perl_debug_log,
a0288114 4464 "synthetic stclass \"%s\".\n",
3f7c398e 4465 SvPVX_const(sv));});
653099ff 4466 }
c277df42
IZ
4467
4468 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4469 if (longest_fixed_length > longest_float_length) {
1de06328 4470 r->check_end_shift = r->anchored_end_shift;
c277df42 4471 r->check_substr = r->anchored_substr;
33b8afdf 4472 r->check_utf8 = r->anchored_utf8;
c277df42 4473 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4474 if (r->extflags & RXf_ANCH_SINGLE)
4475 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4476 }
4477 else {
1de06328 4478 r->check_end_shift = r->float_end_shift;
c277df42 4479 r->check_substr = r->float_substr;
33b8afdf 4480 r->check_utf8 = r->float_utf8;
1de06328
YO
4481 r->check_offset_min = r->float_min_offset;
4482 r->check_offset_max = r->float_max_offset;
a0d0e21e 4483 }
30382c73
IZ
4484 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4485 This should be changed ASAP! */
bbe252da
YO
4486 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4487 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4488 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4489 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4490 }
1de06328
YO
4491 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4492 if ( (STRLEN)minlen < longest_float_length )
4493 minlen= longest_float_length;
4494 if ( (STRLEN)minlen < longest_fixed_length )
4495 minlen= longest_fixed_length;
4496 */
a0ed51b3
LW
4497 }
4498 else {
c277df42
IZ
4499 /* Several toplevels. Best we can is to set minlen. */
4500 I32 fake;
653099ff 4501 struct regnode_charclass_class ch_class;
cb434fcc 4502 I32 last_close = 0;
c277df42 4503
5d458dd8 4504 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4505
f8fc2ecf 4506 scan = ri->program + 1;
830247a4 4507 cl_init(pRExC_state, &ch_class);
653099ff 4508 data.start_class = &ch_class;
cb434fcc 4509 data.last_closep = &last_close;
07be1b83 4510
de8c5301 4511
1de06328 4512 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4513 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4514
786e8c11 4515 CHECK_RESTUDY_GOTO;
07be1b83 4516
33b8afdf 4517 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4518 = r->float_substr = r->float_utf8 = NULL;
653099ff 4519 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4520 && !cl_is_anything(data.start_class))
4521 {
2eccd3b2 4522 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4523
f8fc2ecf 4524 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4525 struct regnode_charclass_class);
4526 StructCopy(data.start_class,
f8fc2ecf 4527 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4528 struct regnode_charclass_class);
f8fc2ecf 4529 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4530 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4531 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4532 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4533 PerlIO_printf(Perl_debug_log,
a0288114 4534 "synthetic stclass \"%s\".\n",
3f7c398e 4535 SvPVX_const(sv));});
653099ff 4536 }
a0d0e21e
LW
4537 }
4538
1de06328
YO
4539 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4540 the "real" pattern. */
cf9788e3
RGS
4541 DEBUG_OPTIMISE_r({
4542 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4543 (IV)minlen, (IV)r->minlen);
cf9788e3 4544 });
de8c5301 4545 r->minlenret = minlen;
1de06328
YO
4546 if (r->minlen < minlen)
4547 r->minlen = minlen;
4548
b81d288d 4549 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4550 r->extflags |= RXf_GPOS_SEEN;
830247a4 4551 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4552 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4553 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4554 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4555 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4556 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4557 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4558 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4559 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4560 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4561 if (RExC_paren_names)
4562 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4563 else
4564 r->paren_names = NULL;
4565
40d049e4
YO
4566 if (RExC_recurse_count) {
4567 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4568 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4569 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4570 }
4571 }
a02a5408
JC
4572 Newxz(r->startp, RExC_npar, I32);
4573 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4574 /* assume we don't need to swap parens around before we match */
4575
be8e71aa
YO
4576 DEBUG_DUMP_r({
4577 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4578 regdump(r);
4579 });
f8fc2ecf
YO
4580 DEBUG_OFFSETS_r(if (ri->offsets) {
4581 const U32 len = ri->offsets[0];
8e9a8a48
YO
4582 U32 i;
4583 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4584 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4585 for (i = 1; i <= len; i++) {
f8fc2ecf 4586 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4587 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4588 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4589 }
4590 PerlIO_printf(Perl_debug_log, "\n");
4591 });
a0d0e21e 4592 return(r);
f9f4320a 4593 END_BLOCK
a687059c
LW
4594}
4595
f9f4320a
YO
4596#undef CORE_ONLY_BLOCK
4597#undef END_BLOCK
4598#undef RE_ENGINE_PTR
3dab1dad 4599
9af228c6 4600#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4601SV*
4602Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4603{
4604 I32 parno = 0; /* no match */
4605 if (PL_curpm) {
4606 const REGEXP * const rx = PM_GETRE(PL_curpm);
4607 if (rx && rx->paren_names) {
4608 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4609 if (he_str) {
4610 IV i;
4611 SV* sv_dat=HeVAL(he_str);
4612 I32 *nums=(I32*)SvPVX(sv_dat);
4613 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4614 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4615 rx->endp[nums[i]] != -1)
4616 {
4617 parno = nums[i];
4618 break;
4619 }
4620 }
4621 }
4622 }
4623 }
4624 if ( !parno ) {
4625 return 0;
4626 } else {
4627 GV *gv_paren;
4628 SV *sv= sv_newmortal();
4629 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4630 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4631 return GvSVn(gv_paren);
4632 }
4633}
9af228c6 4634#endif
0a4db386 4635
894be9b7 4636/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4637 * If flags is REG_RSN_RETURN_NULL returns null.
4638 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4639 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4640 * to the parsed name as looked up in the RExC_paren_names hash.
4641 * If there is an error throws a vFAIL().. type exception.
894be9b7 4642 */
0a4db386
YO
4643
4644#define REG_RSN_RETURN_NULL 0
4645#define REG_RSN_RETURN_NAME 1
4646#define REG_RSN_RETURN_DATA 2
4647
894be9b7
YO
4648STATIC SV*
4649S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4650 char *name_start = RExC_parse;
0a4db386 4651 if ( UTF ) {
894be9b7 4652 STRLEN numlen;
0a4db386
YO
4653 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4654 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4655 {
4656 RExC_parse += numlen;
4657 }
4658 } else {
4659 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4660 RExC_parse++;
4661 }
0a4db386
YO
4662 if ( flags ) {
4663 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4664 (int)(RExC_parse - name_start)));
894be9b7 4665 if (UTF)
0a4db386
YO
4666 SvUTF8_on(sv_name);
4667 if ( flags == REG_RSN_RETURN_NAME)
4668 return sv_name;
4669 else if (flags==REG_RSN_RETURN_DATA) {
4670 HE *he_str = NULL;
4671 SV *sv_dat = NULL;
4672 if ( ! sv_name ) /* should not happen*/
4673 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4674 if (RExC_paren_names)
4675 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4676 if ( he_str )
4677 sv_dat = HeVAL(he_str);
4678 if ( ! sv_dat )
4679 vFAIL("Reference to nonexistent named group");
4680 return sv_dat;
4681 }
4682 else {
4683 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4684 }
4685 /* NOT REACHED */
894be9b7 4686 }
0a4db386 4687 return NULL;
894be9b7
YO
4688}
4689
3dab1dad
YO
4690#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4691 int rem=(int)(RExC_end - RExC_parse); \
4692 int cut; \
4693 int num; \
4694 int iscut=0; \
4695 if (rem>10) { \
4696 rem=10; \
4697 iscut=1; \
4698 } \
4699 cut=10-rem; \
4700 if (RExC_lastparse!=RExC_parse) \
4701 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4702 rem, RExC_parse, \
4703 cut + 4, \
4704 iscut ? "..." : "<" \
4705 ); \
4706 else \
4707 PerlIO_printf(Perl_debug_log,"%16s",""); \
4708 \
4709 if (SIZE_ONLY) \
4710 num=RExC_size; \
4711 else \
4712 num=REG_NODE_NUM(RExC_emit); \
4713 if (RExC_lastnum!=num) \
0a4db386 4714 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4715 else \
0a4db386 4716 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4717 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4718 (int)((depth*2)), "", \
3dab1dad
YO
4719 (funcname) \
4720 ); \
4721 RExC_lastnum=num; \
4722 RExC_lastparse=RExC_parse; \
4723})
4724
07be1b83
YO
4725
4726
3dab1dad
YO
4727#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4728 DEBUG_PARSE_MSG((funcname)); \
4729 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4730})
6bda09f9
YO
4731#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4732 DEBUG_PARSE_MSG((funcname)); \
4733 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4734})
a687059c
LW
4735/*
4736 - reg - regular expression, i.e. main body or parenthesized thing
4737 *
4738 * Caller must absorb opening parenthesis.
4739 *
4740 * Combining parenthesis handling with the base level of regular expression
4741 * is a trifle forced, but the need to tie the tails of the branches to what
4742 * follows makes it hard to avoid.
4743 */
07be1b83
YO
4744#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4745#ifdef DEBUGGING
4746#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4747#else
4748#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4749#endif
3dab1dad 4750
e2e6a0f1
YO
4751/* this idea is borrowed from STR_WITH_LEN in handy.h */
4752#define CHECK_WORD(s,v,l) \
4753 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4754
76e3520e 4755STATIC regnode *
3dab1dad 4756S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4757 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4758{
27da23d5 4759 dVAR;
c277df42
IZ
4760 register regnode *ret; /* Will be the head of the group. */
4761 register regnode *br;
4762 register regnode *lastbr;
cbbf8932 4763 register regnode *ender = NULL;
a0d0e21e 4764 register I32 parno = 0;
cbbf8932
AL
4765 I32 flags;
4766 const I32 oregflags = RExC_flags;
6136c704
AL
4767 bool have_branch = 0;
4768 bool is_open = 0;
9d1d55b5
JP
4769
4770 /* for (?g), (?gc), and (?o) warnings; warning
4771 about (?c) will warn about (?g) -- japhy */
4772
6136c704
AL
4773#define WASTED_O 0x01
4774#define WASTED_G 0x02
4775#define WASTED_C 0x04
4776#define WASTED_GC (0x02|0x04)
cbbf8932 4777 I32 wastedflags = 0x00;
9d1d55b5 4778
fac92740 4779 char * parse_start = RExC_parse; /* MJD */
a28509cc 4780 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4781
3dab1dad
YO
4782 GET_RE_DEBUG_FLAGS_DECL;
4783 DEBUG_PARSE("reg ");
4784
4785
821b33a5 4786 *flagp = 0; /* Tentatively. */
a0d0e21e 4787
9d1d55b5 4788
a0d0e21e
LW
4789 /* Make an OPEN node, if parenthesized. */
4790 if (paren) {
e2e6a0f1
YO
4791 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4792 char *start_verb = RExC_parse;
4793 STRLEN verb_len = 0;
4794 char *start_arg = NULL;
4795 unsigned char op = 0;
4796 int argok = 1;
4797 int internal_argval = 0; /* internal_argval is only useful if !argok */
4798 while ( *RExC_parse && *RExC_parse != ')' ) {
4799 if ( *RExC_parse == ':' ) {
4800 start_arg = RExC_parse + 1;
4801 break;
4802 }
4803 RExC_parse++;
4804 }
4805 ++start_verb;
4806 verb_len = RExC_parse - start_verb;
4807 if ( start_arg ) {
4808 RExC_parse++;
4809 while ( *RExC_parse && *RExC_parse != ')' )
4810 RExC_parse++;
4811 if ( *RExC_parse != ')' )
4812 vFAIL("Unterminated verb pattern argument");
4813 if ( RExC_parse == start_arg )
4814 start_arg = NULL;
4815 } else {
4816 if ( *RExC_parse != ')' )
4817 vFAIL("Unterminated verb pattern");
4818 }
5d458dd8 4819
e2e6a0f1
YO
4820 switch ( *start_verb ) {
4821 case 'A': /* (*ACCEPT) */
4822 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4823 op = ACCEPT;
4824 internal_argval = RExC_nestroot;
4825 }
4826 break;
4827 case 'C': /* (*COMMIT) */
4828 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4829 op = COMMIT;
e2e6a0f1
YO
4830 break;
4831 case 'F': /* (*FAIL) */
4832 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4833 op = OPFAIL;
4834 argok = 0;
4835 }
4836 break;
5d458dd8
YO
4837 case ':': /* (*:NAME) */
4838 case 'M': /* (*MARK:NAME) */
4839 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4840 op = MARKPOINT;
5d458dd8
YO
4841 argok = -1;
4842 }
4843 break;
4844 case 'P': /* (*PRUNE) */
4845 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4846 op = PRUNE;
e2e6a0f1 4847 break;
5d458dd8
YO
4848 case 'S': /* (*SKIP) */
4849 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4850 op = SKIP;
4851 break;
4852 case 'T': /* (*THEN) */
4853 /* [19:06] <TimToady> :: is then */
4854 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4855 op = CUTGROUP;
4856 RExC_seen |= REG_SEEN_CUTGROUP;
4857 }
e2e6a0f1
YO
4858 break;
4859 }
4860 if ( ! op ) {
4861 RExC_parse++;
4862 vFAIL3("Unknown verb pattern '%.*s'",
4863 verb_len, start_verb);
4864 }
4865 if ( argok ) {
4866 if ( start_arg && internal_argval ) {
4867 vFAIL3("Verb pattern '%.*s' may not have an argument",
4868 verb_len, start_verb);
4869 } else if ( argok < 0 && !start_arg ) {
4870 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4871 verb_len, start_verb);
4872 } else {
4873 ret = reganode(pRExC_state, op, internal_argval);
4874 if ( ! internal_argval && ! SIZE_ONLY ) {
4875 if (start_arg) {
4876 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4877 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4878 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4879 ret->flags = 0;
4880 } else {
4881 ret->flags = 1;
4882 }
4883 }
4884 }
4885 if (!internal_argval)
4886 RExC_seen |= REG_SEEN_VERBARG;
4887 } else if ( start_arg ) {
4888 vFAIL3("Verb pattern '%.*s' may not have an argument",
4889 verb_len, start_verb);
4890 } else {
4891 ret = reg_node(pRExC_state, op);
4892 }
4893 nextchar(pRExC_state);
4894 return ret;
4895 } else
fac92740 4896 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4897 U32 posflags = 0, negflags = 0;
4898 U32 *flagsp = &posflags;
6136c704 4899 bool is_logical = 0;
a28509cc 4900 const char * const seqstart = RExC_parse;
ca9dfc88 4901
830247a4
IZ
4902 RExC_parse++;
4903 paren = *RExC_parse++;
c277df42 4904 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4905 switch (paren) {
894be9b7 4906
fac92740 4907 case '<': /* (?<...) */
b81d288d 4908 if (*RExC_parse == '!')
c277df42 4909 paren = ',';
0a4db386
YO
4910 else if (*RExC_parse != '=')
4911 { /* (?<...>) */
81714fb9 4912 char *name_start;
894be9b7 4913 SV *svname;
81714fb9
YO
4914 paren= '>';
4915 case '\'': /* (?'...') */
4916 name_start= RExC_parse;
0a4db386
YO
4917 svname = reg_scan_name(pRExC_state,
4918 SIZE_ONLY ? /* reverse test from the others */
4919 REG_RSN_RETURN_NAME :
4920 REG_RSN_RETURN_NULL);
81714fb9
YO
4921 if (RExC_parse == name_start)
4922 goto unknown;
4923 if (*RExC_parse != paren)
4924 vFAIL2("Sequence (?%c... not terminated",
4925 paren=='>' ? '<' : paren);
4926 if (SIZE_ONLY) {
e62cc96a
YO
4927 HE *he_str;
4928 SV *sv_dat = NULL;
894be9b7
YO
4929 if (!svname) /* shouldnt happen */
4930 Perl_croak(aTHX_
4931 "panic: reg_scan_name returned NULL");
81714fb9
YO
4932 if (!RExC_paren_names) {
4933 RExC_paren_names= newHV();
4934 sv_2mortal((SV*)RExC_paren_names);
4935 }
4936 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4937 if ( he_str )
81714fb9 4938 sv_dat = HeVAL(he_str);
e62cc96a 4939 if ( ! sv_dat ) {
81714fb9 4940 /* croak baby croak */
e62cc96a
YO
4941 Perl_croak(aTHX_
4942 "panic: paren_name hash element allocation failed");
4943 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4944 IV count=SvIV(sv_dat);
4945 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4946 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4947 pv[count]=RExC_npar;
4948 SvIVX(sv_dat)++;
4949 } else {
4950 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4951 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4952 SvIOK_on(sv_dat);
4953 SvIVX(sv_dat)= 1;
e62cc96a
YO
4954 }
4955
81714fb9
YO
4956 /*sv_dump(sv_dat);*/
4957 }
4958 nextchar(pRExC_state);
4959 paren = 1;
4960 goto capturing_parens;
4961 }
4962 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4963 RExC_parse++;
fac92740
MJD
4964 case '=': /* (?=...) */
4965 case '!': /* (?!...) */
830247a4 4966 RExC_seen_zerolen++;
e2e6a0f1
YO
4967 if (*RExC_parse == ')') {
4968 ret=reg_node(pRExC_state, OPFAIL);
4969 nextchar(pRExC_state);
4970 return ret;
4971 }
fac92740
MJD
4972 case ':': /* (?:...) */
4973 case '>': /* (?>...) */
a0d0e21e 4974 break;
fac92740
MJD
4975 case '$': /* (?$...) */
4976 case '@': /* (?@...) */
8615cb43 4977 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4978 break;
fac92740 4979 case '#': /* (?#...) */
830247a4
IZ
4980 while (*RExC_parse && *RExC_parse != ')')
4981 RExC_parse++;
4982 if (*RExC_parse != ')')
c277df42 4983 FAIL("Sequence (?#... not terminated");
830247a4 4984 nextchar(pRExC_state);
a0d0e21e
LW
4985 *flagp = TRYAGAIN;
4986 return NULL;
894be9b7
YO
4987 case '0' : /* (?0) */
4988 case 'R' : /* (?R) */
4989 if (*RExC_parse != ')')
6bda09f9 4990 FAIL("Sequence (?R) not terminated");
1a147d38 4991 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4992 nextchar(pRExC_state);
4993 return ret;
4994 /*notreached*/
894be9b7
YO
4995 { /* named and numeric backreferences */
4996 I32 num;
4997 char * parse_start;
4998 case '&': /* (?&NAME) */
4999 parse_start = RExC_parse - 1;
5000 {
0a4db386
YO
5001 SV *sv_dat = reg_scan_name(pRExC_state,
5002 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5003 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5004 }
5005 goto gen_recurse_regop;
5006 /* NOT REACHED */
542fa716
YO
5007 case '+':
5008 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5009 RExC_parse++;
5010 vFAIL("Illegal pattern");
5011 }
5012 goto parse_recursion;
5013 /* NOT REACHED*/
5014 case '-': /* (?-1) */
5015 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5016 RExC_parse--; /* rewind to let it be handled later */
5017 goto parse_flags;
5018 }
5019 /*FALLTHROUGH */
6bda09f9
YO
5020 case '1': case '2': case '3': case '4': /* (?1) */
5021 case '5': case '6': case '7': case '8': case '9':
5022 RExC_parse--;
542fa716 5023 parse_recursion:
894be9b7
YO
5024 num = atoi(RExC_parse);
5025 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5026 if (*RExC_parse == '-')
5027 RExC_parse++;
6bda09f9
YO
5028 while (isDIGIT(*RExC_parse))
5029 RExC_parse++;
5030 if (*RExC_parse!=')')
5031 vFAIL("Expecting close bracket");
894be9b7
YO
5032
5033 gen_recurse_regop:
542fa716
YO
5034 if ( paren == '-' ) {
5035 /*
5036 Diagram of capture buffer numbering.
5037 Top line is the normal capture buffer numbers
5038 Botton line is the negative indexing as from
5039 the X (the (?-2))
5040
5041 + 1 2 3 4 5 X 6 7
5042 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5043 - 5 4 3 2 1 X x x
5044
5045 */
5046 num = RExC_npar + num;
5047 if (num < 1) {
5048 RExC_parse++;
5049 vFAIL("Reference to nonexistent group");
5050 }
5051 } else if ( paren == '+' ) {
5052 num = RExC_npar + num - 1;
5053 }
5054
1a147d38 5055 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5056 if (!SIZE_ONLY) {
5057 if (num > (I32)RExC_rx->nparens) {
5058 RExC_parse++;
5059 vFAIL("Reference to nonexistent group");
5060 }
40d049e4 5061 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5062 RExC_emit++;
226de585 5063 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5064 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5065 } else {
6bda09f9 5066 RExC_size++;
6bda09f9 5067 }
0a4db386 5068 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5069 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5070 Set_Node_Offset(ret, parse_start); /* MJD */
5071
6bda09f9
YO
5072 nextchar(pRExC_state);
5073 return ret;
894be9b7
YO
5074 } /* named and numeric backreferences */
5075 /* NOT REACHED */
5076
fac92740 5077 case 'p': /* (?p...) */
9014280d 5078 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5079 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5080 /* FALL THROUGH*/
fac92740 5081 case '?': /* (??...) */
6136c704 5082 is_logical = 1;
438a3801
YST
5083 if (*RExC_parse != '{')
5084 goto unknown;
830247a4 5085 paren = *RExC_parse++;
0f5d15d6 5086 /* FALL THROUGH */
fac92740 5087 case '{': /* (?{...}) */
c277df42 5088 {
2eccd3b2
NC
5089 I32 count = 1;
5090 U32 n = 0;
c277df42 5091 char c;
830247a4 5092 char *s = RExC_parse;
c277df42 5093
830247a4
IZ
5094 RExC_seen_zerolen++;
5095 RExC_seen |= REG_SEEN_EVAL;
5096 while (count && (c = *RExC_parse)) {
6136c704
AL
5097 if (c == '\\') {
5098 if (RExC_parse[1])
5099 RExC_parse++;
5100 }
b81d288d 5101 else if (c == '{')
c277df42 5102 count++;
b81d288d 5103 else if (c == '}')
c277df42 5104 count--;
830247a4 5105 RExC_parse++;
c277df42 5106 }
6136c704 5107 if (*RExC_parse != ')') {
b81d288d 5108 RExC_parse = s;
b45f050a
JF
5109 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5110 }
c277df42 5111 if (!SIZE_ONLY) {
f3548bdc 5112 PAD *pad;
6136c704
AL
5113 OP_4tree *sop, *rop;
5114 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5115
569233ed
SB
5116 ENTER;
5117 Perl_save_re_context(aTHX);
f3548bdc 5118 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5119 sop->op_private |= OPpREFCOUNTED;
5120 /* re_dup will OpREFCNT_inc */
5121 OpREFCNT_set(sop, 1);
569233ed 5122 LEAVE;
c277df42 5123
830247a4 5124 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5125 RExC_rxi->data->data[n] = (void*)rop;
5126 RExC_rxi->data->data[n+1] = (void*)sop;
5127 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5128 SvREFCNT_dec(sv);
a0ed51b3 5129 }
e24b16f9 5130 else { /* First pass */
830247a4 5131 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5132 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5133 /* No compiled RE interpolated, has runtime
5134 components ===> unsafe. */
5135 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5136 if (PL_tainting && PL_tainted)
cc6b7395 5137 FAIL("Eval-group in insecure regular expression");
54df2634 5138#if PERL_VERSION > 8
923e4eb5 5139 if (IN_PERL_COMPILETIME)
b5c19bd7 5140 PL_cv_has_eval = 1;
54df2634 5141#endif
c277df42 5142 }
b5c19bd7 5143
830247a4 5144 nextchar(pRExC_state);
6136c704 5145 if (is_logical) {
830247a4 5146 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5147 if (!SIZE_ONLY)
5148 ret->flags = 2;
3dab1dad 5149 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5150 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5151 return ret;
5152 }
ccb2c380
MP
5153 ret = reganode(pRExC_state, EVAL, n);
5154 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5155 Set_Node_Offset(ret, parse_start);
5156 return ret;
c277df42 5157 }
fac92740 5158 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5159 {
0a4db386 5160 int is_define= 0;
fac92740 5161 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5162 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5163 || RExC_parse[1] == '<'
830247a4 5164 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5165 I32 flag;
5166
830247a4 5167 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5168 if (!SIZE_ONLY)
5169 ret->flags = 1;
3dab1dad 5170 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5171 goto insert_if;
b81d288d 5172 }
a0ed51b3 5173 }
0a4db386
YO
5174 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5175 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5176 {
5177 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5178 char *name_start= RExC_parse++;
2eccd3b2 5179 U32 num = 0;
0a4db386
YO
5180 SV *sv_dat=reg_scan_name(pRExC_state,
5181 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5182 if (RExC_parse == name_start || *RExC_parse != ch)
5183 vFAIL2("Sequence (?(%c... not terminated",
5184 (ch == '>' ? '<' : ch));
5185 RExC_parse++;
5186 if (!SIZE_ONLY) {
5187 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5188 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5189 SvREFCNT_inc(sv_dat);
5190 }
5191 ret = reganode(pRExC_state,NGROUPP,num);
5192 goto insert_if_check_paren;
5193 }
5194 else if (RExC_parse[0] == 'D' &&
5195 RExC_parse[1] == 'E' &&
5196 RExC_parse[2] == 'F' &&
5197 RExC_parse[3] == 'I' &&
5198 RExC_parse[4] == 'N' &&
5199 RExC_parse[5] == 'E')
5200 {
5201 ret = reganode(pRExC_state,DEFINEP,0);
5202 RExC_parse +=6 ;
5203 is_define = 1;
5204 goto insert_if_check_paren;
5205 }
5206 else if (RExC_parse[0] == 'R') {
5207 RExC_parse++;
5208 parno = 0;
5209 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5210 parno = atoi(RExC_parse++);
5211 while (isDIGIT(*RExC_parse))
5212 RExC_parse++;
5213 } else if (RExC_parse[0] == '&') {
5214 SV *sv_dat;
5215 RExC_parse++;
5216 sv_dat = reg_scan_name(pRExC_state,
5217 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5218 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5219 }
1a147d38 5220 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5221 goto insert_if_check_paren;
5222 }
830247a4 5223 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5224 /* (?(1)...) */
6136c704 5225 char c;
830247a4 5226 parno = atoi(RExC_parse++);
c277df42 5227
830247a4
IZ
5228 while (isDIGIT(*RExC_parse))
5229 RExC_parse++;
fac92740 5230 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5231
0a4db386 5232 insert_if_check_paren:
830247a4 5233 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5234 vFAIL("Switch condition not recognized");
c277df42 5235 insert_if:
3dab1dad
YO
5236 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5237 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5238 if (br == NULL)
830247a4 5239 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5240 else
3dab1dad 5241 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5242 c = *nextchar(pRExC_state);
d1b80229
IZ
5243 if (flags&HASWIDTH)
5244 *flagp |= HASWIDTH;
c277df42 5245 if (c == '|') {
0a4db386
YO
5246 if (is_define)
5247 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5248 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5249 regbranch(pRExC_state, &flags, 1,depth+1);
5250 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5251 if (flags&HASWIDTH)
5252 *flagp |= HASWIDTH;
830247a4 5253 c = *nextchar(pRExC_state);
a0ed51b3
LW
5254 }
5255 else
c277df42
IZ
5256 lastbr = NULL;
5257 if (c != ')')
8615cb43 5258 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5259 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5260 REGTAIL(pRExC_state, br, ender);
c277df42 5261 if (lastbr) {
3dab1dad
YO
5262 REGTAIL(pRExC_state, lastbr, ender);
5263 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5264 }
5265 else
3dab1dad 5266 REGTAIL(pRExC_state, ret, ender);
c277df42 5267 return ret;
a0ed51b3
LW
5268 }
5269 else {
830247a4 5270 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5271 }
5272 }
1b1626e4 5273 case 0:
830247a4 5274 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5275 vFAIL("Sequence (? incomplete");
1b1626e4 5276 break;
a0d0e21e 5277 default:
830247a4 5278 --RExC_parse;
fac92740 5279 parse_flags: /* (?i) */
830247a4 5280 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5281 /* (?g), (?gc) and (?o) are useless here
5282 and must be globally applied -- japhy */
5283
5284 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5285 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5286 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5287 if (! (wastedflags & wflagbit) ) {
5288 wastedflags |= wflagbit;
5289 vWARN5(
5290 RExC_parse + 1,
5291 "Useless (%s%c) - %suse /%c modifier",
5292 flagsp == &negflags ? "?-" : "?",
5293 *RExC_parse,
5294 flagsp == &negflags ? "don't " : "",
5295 *RExC_parse
5296 );
5297 }
5298 }
5299 }
5300 else if (*RExC_parse == 'c') {
5301 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5302 if (! (wastedflags & WASTED_C) ) {
5303 wastedflags |= WASTED_GC;
9d1d55b5
JP
5304 vWARN3(
5305 RExC_parse + 1,
5306 "Useless (%sc) - %suse /gc modifier",
5307 flagsp == &negflags ? "?-" : "?",
5308 flagsp == &negflags ? "don't " : ""
5309 );
5310 }
5311 }
5312 }
5313 else { pmflag(flagsp, *RExC_parse); }
5314
830247a4 5315 ++RExC_parse;
ca9dfc88 5316 }
830247a4 5317 if (*RExC_parse == '-') {
ca9dfc88 5318 flagsp = &negflags;
9d1d55b5 5319 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5320 ++RExC_parse;
ca9dfc88 5321 goto parse_flags;
48c036b1 5322 }
e2509266
JH
5323 RExC_flags |= posflags;
5324 RExC_flags &= ~negflags;
830247a4
IZ
5325 if (*RExC_parse == ':') {
5326 RExC_parse++;
ca9dfc88
IZ
5327 paren = ':';
5328 break;
5329 }
c277df42 5330 unknown:
830247a4
IZ
5331 if (*RExC_parse != ')') {
5332 RExC_parse++;
5333 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5334 }
830247a4 5335 nextchar(pRExC_state);
a0d0e21e
LW
5336 *flagp = TRYAGAIN;
5337 return NULL;
5338 }
5339 }
fac92740 5340 else { /* (...) */
81714fb9 5341 capturing_parens:
830247a4
IZ
5342 parno = RExC_npar;
5343 RExC_npar++;
e2e6a0f1 5344
830247a4 5345 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5346 if (!SIZE_ONLY ){
5347 if (!RExC_nestroot)
5348 RExC_nestroot = parno;
5349 if (RExC_seen & REG_SEEN_RECURSE) {
5350 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5351 "Setting open paren #%"IVdf" to %d\n",
5352 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5353 RExC_open_parens[parno-1]= ret;
5354 }
6bda09f9 5355 }
fac92740
MJD
5356 Set_Node_Length(ret, 1); /* MJD */
5357 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5358 is_open = 1;
a0d0e21e 5359 }
a0ed51b3 5360 }
fac92740 5361 else /* ! paren */
a0d0e21e
LW
5362 ret = NULL;
5363
5364 /* Pick up the branches, linking them together. */
fac92740 5365 parse_start = RExC_parse; /* MJD */
3dab1dad 5366 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5367 /* branch_len = (paren != 0); */
2af232bd 5368
a0d0e21e
LW
5369 if (br == NULL)
5370 return(NULL);
830247a4
IZ
5371 if (*RExC_parse == '|') {
5372 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5373 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5374 }
fac92740 5375 else { /* MJD */
6bda09f9 5376 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5377 Set_Node_Length(br, paren != 0);
5378 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5379 }
c277df42
IZ
5380 have_branch = 1;
5381 if (SIZE_ONLY)
830247a4 5382 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5383 }
5384 else if (paren == ':') {
c277df42
IZ
5385 *flagp |= flags&SIMPLE;
5386 }
6136c704 5387 if (is_open) { /* Starts with OPEN. */
3dab1dad 5388 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5389 }
5390 else if (paren != '?') /* Not Conditional */
a0d0e21e 5391 ret = br;
32a0ca98 5392 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5393 lastbr = br;
830247a4
IZ
5394 while (*RExC_parse == '|') {
5395 if (!SIZE_ONLY && RExC_extralen) {
5396 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5397 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5398 }
5399 if (SIZE_ONLY)
830247a4
IZ
5400 RExC_extralen += 2; /* Account for LONGJMP. */
5401 nextchar(pRExC_state);
3dab1dad 5402 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5403
a687059c 5404 if (br == NULL)
a0d0e21e 5405 return(NULL);
3dab1dad 5406 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5407 lastbr = br;
821b33a5
IZ
5408 if (flags&HASWIDTH)
5409 *flagp |= HASWIDTH;
a687059c 5410 *flagp |= flags&SPSTART;
a0d0e21e
LW
5411 }
5412
c277df42
IZ
5413 if (have_branch || paren != ':') {
5414 /* Make a closing node, and hook it on the end. */
5415 switch (paren) {
5416 case ':':
830247a4 5417 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5418 break;
5419 case 1:
c74340f9 5420 RExC_cpar++;
830247a4 5421 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5422 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5423 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5424 "Setting close paren #%"IVdf" to %d\n",
5425 (IV)parno, REG_NODE_NUM(ender)));
5426 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5427 if (RExC_nestroot == parno)
5428 RExC_nestroot = 0;
40d049e4 5429 }
fac92740
MJD
5430 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5431 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5432 break;
5433 case '<':
c277df42
IZ
5434 case ',':
5435 case '=':
5436 case '!':
c277df42 5437 *flagp &= ~HASWIDTH;
821b33a5
IZ
5438 /* FALL THROUGH */
5439 case '>':
830247a4 5440 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5441 break;
5442 case 0:
830247a4 5443 ender = reg_node(pRExC_state, END);
40d049e4
YO
5444 if (!SIZE_ONLY) {
5445 assert(!RExC_opend); /* there can only be one! */
5446 RExC_opend = ender;
5447 }
c277df42
IZ
5448 break;
5449 }
eaf3ca90 5450 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5451
9674d46a 5452 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5453 if (depth==1)
5454 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5455
c277df42 5456 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5457 for (br = ret; br; br = regnext(br)) {
5458 const U8 op = PL_regkind[OP(br)];
5459 if (op == BRANCH) {
07be1b83 5460 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5461 }
5462 else if (op == BRANCHJ) {
07be1b83 5463 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5464 }
c277df42
IZ
5465 }
5466 }
a0d0e21e 5467 }
c277df42
IZ
5468
5469 {
e1ec3a88
AL
5470 const char *p;
5471 static const char parens[] = "=!<,>";
c277df42
IZ
5472
5473 if (paren && (p = strchr(parens, paren))) {
eb160463 5474 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5475 int flag = (p - parens) > 1;
5476
5477 if (paren == '>')
5478 node = SUSPEND, flag = 0;
6bda09f9 5479 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5480 Set_Node_Cur_Length(ret);
5481 Set_Node_Offset(ret, parse_start + 1);
c277df42 5482 ret->flags = flag;
07be1b83 5483 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5484 }
a0d0e21e
LW
5485 }
5486
5487 /* Check for proper termination. */
ce3e6498 5488 if (paren) {
e2509266 5489 RExC_flags = oregflags;
830247a4
IZ
5490 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5491 RExC_parse = oregcomp_parse;
380a0633 5492 vFAIL("Unmatched (");
ce3e6498 5493 }
a0ed51b3 5494 }
830247a4
IZ
5495 else if (!paren && RExC_parse < RExC_end) {
5496 if (*RExC_parse == ')') {
5497 RExC_parse++;
380a0633 5498 vFAIL("Unmatched )");
a0ed51b3
LW
5499 }
5500 else
b45f050a 5501 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5502 /* NOTREACHED */
5503 }
a687059c 5504
a0d0e21e 5505 return(ret);
a687059c
LW
5506}
5507
5508/*
5509 - regbranch - one alternative of an | operator
5510 *
5511 * Implements the concatenation operator.
5512 */
76e3520e 5513STATIC regnode *
3dab1dad 5514S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5515{
97aff369 5516 dVAR;
c277df42
IZ
5517 register regnode *ret;
5518 register regnode *chain = NULL;
5519 register regnode *latest;
5520 I32 flags = 0, c = 0;
3dab1dad
YO
5521 GET_RE_DEBUG_FLAGS_DECL;
5522 DEBUG_PARSE("brnc");
b81d288d 5523 if (first)
c277df42
IZ
5524 ret = NULL;
5525 else {
b81d288d 5526 if (!SIZE_ONLY && RExC_extralen)
830247a4 5527 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5528 else {
830247a4 5529 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5530 Set_Node_Length(ret, 1);
5531 }
c277df42
IZ
5532 }
5533
b81d288d 5534 if (!first && SIZE_ONLY)
830247a4 5535 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5536
c277df42 5537 *flagp = WORST; /* Tentatively. */
a0d0e21e 5538
830247a4
IZ
5539 RExC_parse--;
5540 nextchar(pRExC_state);
5541 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5542 flags &= ~TRYAGAIN;
3dab1dad 5543 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5544 if (latest == NULL) {
5545 if (flags & TRYAGAIN)
5546 continue;
5547 return(NULL);
a0ed51b3
LW
5548 }
5549 else if (ret == NULL)
c277df42 5550 ret = latest;
a0d0e21e 5551 *flagp |= flags&HASWIDTH;
c277df42 5552 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5553 *flagp |= flags&SPSTART;
5554 else {
830247a4 5555 RExC_naughty++;
3dab1dad 5556 REGTAIL(pRExC_state, chain, latest);
a687059c 5557 }
a0d0e21e 5558 chain = latest;
c277df42
IZ
5559 c++;
5560 }
5561 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5562 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5563 if (ret == NULL)
5564 ret = chain;
5565 }
5566 if (c == 1) {
5567 *flagp |= flags&SIMPLE;
a0d0e21e 5568 }
a687059c 5569
d4c19fe8 5570 return ret;
a687059c
LW
5571}
5572
5573/*
5574 - regpiece - something followed by possible [*+?]
5575 *
5576 * Note that the branching code sequences used for ? and the general cases
5577 * of * and + are somewhat optimized: they use the same NOTHING node as
5578 * both the endmarker for their branch list and the body of the last branch.
5579 * It might seem that this node could be dispensed with entirely, but the
5580 * endmarker role is not redundant.
5581 */
76e3520e 5582STATIC regnode *
3dab1dad 5583S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5584{
97aff369 5585 dVAR;
c277df42 5586 register regnode *ret;
a0d0e21e
LW
5587 register char op;
5588 register char *next;
5589 I32 flags;
1df70142 5590 const char * const origparse = RExC_parse;
a0d0e21e 5591 I32 min;
c277df42 5592 I32 max = REG_INFTY;
fac92740 5593 char *parse_start;
10edeb5d 5594 const char *maxpos = NULL;
3dab1dad
YO
5595 GET_RE_DEBUG_FLAGS_DECL;
5596 DEBUG_PARSE("piec");
a0d0e21e 5597
3dab1dad 5598 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5599 if (ret == NULL) {
5600 if (flags & TRYAGAIN)
5601 *flagp |= TRYAGAIN;
5602 return(NULL);
5603 }
5604
830247a4 5605 op = *RExC_parse;
a0d0e21e 5606
830247a4 5607 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5608 maxpos = NULL;
fac92740 5609 parse_start = RExC_parse; /* MJD */
830247a4 5610 next = RExC_parse + 1;
a0d0e21e
LW
5611 while (isDIGIT(*next) || *next == ',') {
5612 if (*next == ',') {
5613 if (maxpos)
5614 break;
5615 else
5616 maxpos = next;
a687059c 5617 }
a0d0e21e
LW
5618 next++;
5619 }
5620 if (*next == '}') { /* got one */
5621 if (!maxpos)
5622 maxpos = next;
830247a4
IZ
5623 RExC_parse++;
5624 min = atoi(RExC_parse);
a0d0e21e
LW
5625 if (*maxpos == ',')
5626 maxpos++;
5627 else
830247a4 5628 maxpos = RExC_parse;
a0d0e21e
LW
5629 max = atoi(maxpos);
5630 if (!max && *maxpos != '0')
c277df42
IZ
5631 max = REG_INFTY; /* meaning "infinity" */
5632 else if (max >= REG_INFTY)
8615cb43 5633 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5634 RExC_parse = next;
5635 nextchar(pRExC_state);
a0d0e21e
LW
5636
5637 do_curly:
5638 if ((flags&SIMPLE)) {
830247a4 5639 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5640 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5641 Set_Node_Offset(ret, parse_start+1); /* MJD */
5642 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5643 }
5644 else {
3dab1dad 5645 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5646
5647 w->flags = 0;
3dab1dad 5648 REGTAIL(pRExC_state, ret, w);
830247a4 5649 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5650 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5651 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5652 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5653 }
6bda09f9 5654 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5655 /* MJD hk */
5656 Set_Node_Offset(ret, parse_start+1);
2af232bd 5657 Set_Node_Length(ret,
fac92740 5658 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5659
830247a4 5660 if (!SIZE_ONLY && RExC_extralen)
c277df42 5661 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5662 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5663 if (SIZE_ONLY)
830247a4
IZ
5664 RExC_whilem_seen++, RExC_extralen += 3;
5665 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5666 }
c277df42 5667 ret->flags = 0;
a0d0e21e
LW
5668
5669 if (min > 0)
821b33a5
IZ
5670 *flagp = WORST;
5671 if (max > 0)
5672 *flagp |= HASWIDTH;
a0d0e21e 5673 if (max && max < min)
8615cb43 5674 vFAIL("Can't do {n,m} with n > m");
c277df42 5675 if (!SIZE_ONLY) {
eb160463
GS
5676 ARG1_SET(ret, (U16)min);
5677 ARG2_SET(ret, (U16)max);
a687059c 5678 }
a687059c 5679
a0d0e21e 5680 goto nest_check;
a687059c 5681 }
a0d0e21e 5682 }
a687059c 5683
a0d0e21e
LW
5684 if (!ISMULT1(op)) {
5685 *flagp = flags;
a687059c 5686 return(ret);
a0d0e21e 5687 }
bb20fd44 5688
c277df42 5689#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5690
5691 /* if this is reinstated, don't forget to put this back into perldiag:
5692
5693 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5694
5695 (F) The part of the regexp subject to either the * or + quantifier
5696 could match an empty string. The {#} shows in the regular
5697 expression about where the problem was discovered.
5698
5699 */
5700
bb20fd44 5701 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5702 vFAIL("Regexp *+ operand could be empty");
b81d288d 5703#endif
bb20fd44 5704
fac92740 5705 parse_start = RExC_parse;
830247a4 5706 nextchar(pRExC_state);
a0d0e21e 5707
821b33a5 5708 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5709
5710 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5711 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5712 ret->flags = 0;
830247a4 5713 RExC_naughty += 4;
a0d0e21e
LW
5714 }
5715 else if (op == '*') {
5716 min = 0;
5717 goto do_curly;
a0ed51b3
LW
5718 }
5719 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5720 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5721 ret->flags = 0;
830247a4 5722 RExC_naughty += 3;
a0d0e21e
LW
5723 }
5724 else if (op == '+') {
5725 min = 1;
5726 goto do_curly;
a0ed51b3
LW
5727 }
5728 else if (op == '?') {
a0d0e21e
LW
5729 min = 0; max = 1;
5730 goto do_curly;
5731 }
5732 nest_check:
041457d9 5733 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5734 vWARN3(RExC_parse,
b45f050a 5735 "%.*s matches null string many times",
afd78fd5 5736 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5737 origparse);
a0d0e21e
LW
5738 }
5739
b9b4dddf 5740 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5741 nextchar(pRExC_state);
6bda09f9 5742 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5743 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5744 }
b9b4dddf
YO
5745#ifndef REG_ALLOW_MINMOD_SUSPEND
5746 else
5747#endif
5748 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5749 regnode *ender;
5750 nextchar(pRExC_state);
5751 ender = reg_node(pRExC_state, SUCCEED);
5752 REGTAIL(pRExC_state, ret, ender);
5753 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5754 ret->flags = 0;
5755 ender = reg_node(pRExC_state, TAIL);
5756 REGTAIL(pRExC_state, ret, ender);
5757 /*ret= ender;*/
5758 }
5759
5760 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5761 RExC_parse++;
b45f050a
JF
5762 vFAIL("Nested quantifiers");
5763 }
a0d0e21e
LW
5764
5765 return(ret);
a687059c
LW
5766}
5767
fc8cd66c
YO
5768
5769/* reg_namedseq(pRExC_state,UVp)
5770
5771 This is expected to be called by a parser routine that has
5772 recognized'\N' and needs to handle the rest. RExC_parse is
5773 expected to point at the first char following the N at the time
5774 of the call.
5775
5776 If valuep is non-null then it is assumed that we are parsing inside
5777 of a charclass definition and the first codepoint in the resolved
5778 string is returned via *valuep and the routine will return NULL.
5779 In this mode if a multichar string is returned from the charnames
5780 handler a warning will be issued, and only the first char in the
5781 sequence will be examined. If the string returned is zero length
5782 then the value of *valuep is undefined and NON-NULL will
5783 be returned to indicate failure. (This will NOT be a valid pointer
5784 to a regnode.)
5785
5786 If value is null then it is assumed that we are parsing normal text
5787 and inserts a new EXACT node into the program containing the resolved
5788 string and returns a pointer to the new node. If the string is
5789 zerolength a NOTHING node is emitted.
5790
5791 On success RExC_parse is set to the char following the endbrace.
5792 Parsing failures will generate a fatal errorvia vFAIL(...)
5793
5794 NOTE: We cache all results from the charnames handler locally in
5795 the RExC_charnames hash (created on first use) to prevent a charnames
5796 handler from playing silly-buggers and returning a short string and
5797 then a long string for a given pattern. Since the regexp program
5798 size is calculated during an initial parse this would result
5799 in a buffer overrun so we cache to prevent the charname result from
5800 changing during the course of the parse.
5801
5802 */
5803STATIC regnode *
5804S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5805{
5806 char * name; /* start of the content of the name */
5807 char * endbrace; /* endbrace following the name */
5808 SV *sv_str = NULL;
5809 SV *sv_name = NULL;
5810 STRLEN len; /* this has various purposes throughout the code */
5811 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5812 regnode *ret = NULL;
5813
5814 if (*RExC_parse != '{') {
5815 vFAIL("Missing braces on \\N{}");
5816 }
5817 name = RExC_parse+1;
5818 endbrace = strchr(RExC_parse, '}');
5819 if ( ! endbrace ) {
5820 RExC_parse++;
5821 vFAIL("Missing right brace on \\N{}");
5822 }
5823 RExC_parse = endbrace + 1;
5824
5825
5826 /* RExC_parse points at the beginning brace,
5827 endbrace points at the last */
5828 if ( name[0]=='U' && name[1]=='+' ) {
5829 /* its a "unicode hex" notation {U+89AB} */
5830 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5831 | PERL_SCAN_DISALLOW_PREFIX
5832 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5833 UV cp;
196f1508 5834 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5835 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5836 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5837 cp = 0xFFFD;
5838 }
5839 if (cp > 0xff)
5840 RExC_utf8 = 1;
5841 if ( valuep ) {
5842 *valuep = cp;
5843 return NULL;
5844 }
5845 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5846 } else {
5847 /* fetch the charnames handler for this scope */
5848 HV * const table = GvHV(PL_hintgv);
5849 SV **cvp= table ?
5850 hv_fetchs(table, "charnames", FALSE) :
5851 NULL;
5852 SV *cv= cvp ? *cvp : NULL;
5853 HE *he_str;
5854 int count;
5855 /* create an SV with the name as argument */
5856 sv_name = newSVpvn(name, endbrace - name);
5857
5858 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5859 vFAIL2("Constant(\\N{%s}) unknown: "
5860 "(possibly a missing \"use charnames ...\")",
5861 SvPVX(sv_name));
5862 }
5863 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5864 vFAIL2("Constant(\\N{%s}): "
5865 "$^H{charnames} is not defined",SvPVX(sv_name));
5866 }
5867
5868
5869
5870 if (!RExC_charnames) {
5871 /* make sure our cache is allocated */
5872 RExC_charnames = newHV();
6bda09f9 5873 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5874 }
5875 /* see if we have looked this one up before */
5876 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5877 if ( he_str ) {
5878 sv_str = HeVAL(he_str);
5879 cached = 1;
5880 } else {
5881 dSP ;
5882
5883 ENTER ;
5884 SAVETMPS ;
5885 PUSHMARK(SP) ;
5886
5887 XPUSHs(sv_name);
5888
5889 PUTBACK ;
5890
5891 count= call_sv(cv, G_SCALAR);
5892
5893 if (count == 1) { /* XXXX is this right? dmq */
5894 sv_str = POPs;
5895 SvREFCNT_inc_simple_void(sv_str);
5896 }
5897
5898 SPAGAIN ;
5899 PUTBACK ;
5900 FREETMPS ;
5901 LEAVE ;
5902
5903 if ( !sv_str || !SvOK(sv_str) ) {
5904 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5905 "did not return a defined value",SvPVX(sv_name));
5906 }
5907 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5908 cached = 1;
5909 }
5910 }
5911 if (valuep) {
5912 char *p = SvPV(sv_str, len);
5913 if (len) {
5914 STRLEN numlen = 1;
5915 if ( SvUTF8(sv_str) ) {
196f1508 5916 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5917 if (*valuep > 0x7F)
5918 RExC_utf8 = 1;
5919 /* XXXX
5920 We have to turn on utf8 for high bit chars otherwise
5921 we get failures with
5922
5923 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5924 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5925
5926 This is different from what \x{} would do with the same
5927 codepoint, where the condition is > 0xFF.
5928 - dmq
5929 */
5930
5931
5932 } else {
5933 *valuep = (UV)*p;
5934 /* warn if we havent used the whole string? */
5935 }
5936 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5937 vWARN2(RExC_parse,
5938 "Ignoring excess chars from \\N{%s} in character class",
5939 SvPVX(sv_name)
5940 );
5941 }
5942 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5943 vWARN2(RExC_parse,
5944 "Ignoring zero length \\N{%s} in character class",
5945 SvPVX(sv_name)
5946 );
5947 }
5948 if (sv_name)
5949 SvREFCNT_dec(sv_name);
5950 if (!cached)
5951 SvREFCNT_dec(sv_str);
5952 return len ? NULL : (regnode *)&len;
5953 } else if(SvCUR(sv_str)) {
5954
5955 char *s;
5956 char *p, *pend;
5957 STRLEN charlen = 1;
5958 char * parse_start = name-3; /* needed for the offsets */
5959 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5960
5961 ret = reg_node(pRExC_state,
5962 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5963 s= STRING(ret);
5964
5965 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5966 sv_utf8_upgrade(sv_str);
5967 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5968 RExC_utf8= 1;
5969 }
5970
5971 p = SvPV(sv_str, len);
5972 pend = p + len;
5973 /* len is the length written, charlen is the size the char read */
5974 for ( len = 0; p < pend; p += charlen ) {
5975 if (UTF) {
196f1508 5976 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5977 if (FOLD) {
5978 STRLEN foldlen,numlen;
5979 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5980 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5981 /* Emit all the Unicode characters. */
5982
5983 for (foldbuf = tmpbuf;
5984 foldlen;
5985 foldlen -= numlen)
5986 {
5987 uvc = utf8_to_uvchr(foldbuf, &numlen);
5988 if (numlen > 0) {
5989 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5990 s += unilen;
5991 len += unilen;
5992 /* In EBCDIC the numlen
5993 * and unilen can differ. */
5994 foldbuf += numlen;
5995 if (numlen >= foldlen)
5996 break;
5997 }
5998 else
5999 break; /* "Can't happen." */
6000 }
6001 } else {
6002 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6003 if (unilen > 0) {
6004 s += unilen;
6005 len += unilen;
6006 }
6007 }
6008 } else {
6009 len++;
6010 REGC(*p, s++);
6011 }
6012 }
6013 if (SIZE_ONLY) {
6014 RExC_size += STR_SZ(len);
6015 } else {
6016 STR_LEN(ret) = len;
6017 RExC_emit += STR_SZ(len);
6018 }
6019 Set_Node_Cur_Length(ret); /* MJD */
6020 RExC_parse--;
6021 nextchar(pRExC_state);
6022 } else {
6023 ret = reg_node(pRExC_state,NOTHING);
6024 }
6025 if (!cached) {
6026 SvREFCNT_dec(sv_str);
6027 }
6028 if (sv_name) {
6029 SvREFCNT_dec(sv_name);
6030 }
6031 return ret;
6032
6033}
6034
6035
9e08bc66
TS
6036/*
6037 * reg_recode
6038 *
6039 * It returns the code point in utf8 for the value in *encp.
6040 * value: a code value in the source encoding
6041 * encp: a pointer to an Encode object
6042 *
6043 * If the result from Encode is not a single character,
6044 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6045 */
6046STATIC UV
6047S_reg_recode(pTHX_ const char value, SV **encp)
6048{
6049 STRLEN numlen = 1;
6050 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6051 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6052 : SvPVX(sv);
6053 const STRLEN newlen = SvCUR(sv);
6054 UV uv = UNICODE_REPLACEMENT;
6055
6056 if (newlen)
6057 uv = SvUTF8(sv)
6058 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6059 : *(U8*)s;
6060
6061 if (!newlen || numlen != newlen) {
6062 uv = UNICODE_REPLACEMENT;
6063 if (encp)
6064 *encp = NULL;
6065 }
6066 return uv;
6067}
6068
fc8cd66c 6069
a687059c
LW
6070/*
6071 - regatom - the lowest level
6072 *
6073 * Optimization: gobbles an entire sequence of ordinary characters so that
6074 * it can turn them into a single node, which is smaller to store and
6075 * faster to run. Backslashed characters are exceptions, each becoming a
6076 * separate node; the code is simpler that way and it's not worth fixing.
6077 *
7f6f358c
YO
6078 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6079 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6080 */
76e3520e 6081STATIC regnode *
3dab1dad 6082S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6083{
97aff369 6084 dVAR;
cbbf8932 6085 register regnode *ret = NULL;
a0d0e21e 6086 I32 flags;
45948336 6087 char *parse_start = RExC_parse;
3dab1dad
YO
6088 GET_RE_DEBUG_FLAGS_DECL;
6089 DEBUG_PARSE("atom");
a0d0e21e
LW
6090 *flagp = WORST; /* Tentatively. */
6091
6092tryagain:
830247a4 6093 switch (*RExC_parse) {
a0d0e21e 6094 case '^':
830247a4
IZ
6095 RExC_seen_zerolen++;
6096 nextchar(pRExC_state);
bbe252da 6097 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6098 ret = reg_node(pRExC_state, MBOL);
bbe252da 6099 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6100 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6101 else
830247a4 6102 ret = reg_node(pRExC_state, BOL);
fac92740 6103 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6104 break;
6105 case '$':
830247a4 6106 nextchar(pRExC_state);
b81d288d 6107 if (*RExC_parse)
830247a4 6108 RExC_seen_zerolen++;
bbe252da 6109 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6110 ret = reg_node(pRExC_state, MEOL);
bbe252da 6111 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6112 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6113 else
830247a4 6114 ret = reg_node(pRExC_state, EOL);
fac92740 6115 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6116 break;
6117 case '.':
830247a4 6118 nextchar(pRExC_state);
bbe252da 6119 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6120 ret = reg_node(pRExC_state, SANY);
6121 else
6122 ret = reg_node(pRExC_state, REG_ANY);
6123 *flagp |= HASWIDTH|SIMPLE;
830247a4 6124 RExC_naughty++;
fac92740 6125 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6126 break;
6127 case '[':
b45f050a 6128 {
3dab1dad
YO
6129 char * const oregcomp_parse = ++RExC_parse;
6130 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6131 if (*RExC_parse != ']') {
6132 RExC_parse = oregcomp_parse;
b45f050a
JF
6133 vFAIL("Unmatched [");
6134 }
830247a4 6135 nextchar(pRExC_state);
a0d0e21e 6136 *flagp |= HASWIDTH|SIMPLE;
fac92740 6137 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6138 break;
b45f050a 6139 }
a0d0e21e 6140 case '(':
830247a4 6141 nextchar(pRExC_state);
3dab1dad 6142 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6143 if (ret == NULL) {
bf93d4cc 6144 if (flags & TRYAGAIN) {
830247a4 6145 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6146 /* Make parent create an empty node if needed. */
6147 *flagp |= TRYAGAIN;
6148 return(NULL);
6149 }
a0d0e21e 6150 goto tryagain;
bf93d4cc 6151 }
a0d0e21e
LW
6152 return(NULL);
6153 }
c277df42 6154 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6155 break;
6156 case '|':
6157 case ')':
6158 if (flags & TRYAGAIN) {
6159 *flagp |= TRYAGAIN;
6160 return NULL;
6161 }
b45f050a 6162 vFAIL("Internal urp");
a0d0e21e
LW
6163 /* Supposed to be caught earlier. */
6164 break;
85afd4ae 6165 case '{':
830247a4
IZ
6166 if (!regcurly(RExC_parse)) {
6167 RExC_parse++;
85afd4ae
CS
6168 goto defchar;
6169 }
6170 /* FALL THROUGH */
a0d0e21e
LW
6171 case '?':
6172 case '+':
6173 case '*':
830247a4 6174 RExC_parse++;
b45f050a 6175 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6176 break;
6177 case '\\':
830247a4 6178 switch (*++RExC_parse) {
a0d0e21e 6179 case 'A':
830247a4
IZ
6180 RExC_seen_zerolen++;
6181 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6182 *flagp |= SIMPLE;
830247a4 6183 nextchar(pRExC_state);
fac92740 6184 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6185 break;
6186 case 'G':
830247a4
IZ
6187 ret = reg_node(pRExC_state, GPOS);
6188 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6189 *flagp |= SIMPLE;
830247a4 6190 nextchar(pRExC_state);
fac92740 6191 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6192 break;
6193 case 'Z':
830247a4 6194 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6195 *flagp |= SIMPLE;
a1917ab9 6196 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6197 nextchar(pRExC_state);
a0d0e21e 6198 break;
b85d18e9 6199 case 'z':
830247a4 6200 ret = reg_node(pRExC_state, EOS);
b85d18e9 6201 *flagp |= SIMPLE;
830247a4
IZ
6202 RExC_seen_zerolen++; /* Do not optimize RE away */
6203 nextchar(pRExC_state);
fac92740 6204 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6205 break;
4a2d328f 6206 case 'C':
f33976b4
DB
6207 ret = reg_node(pRExC_state, CANY);
6208 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6209 *flagp |= HASWIDTH|SIMPLE;
830247a4 6210 nextchar(pRExC_state);
fac92740 6211 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6212 break;
6213 case 'X':
830247a4 6214 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6215 *flagp |= HASWIDTH;
830247a4 6216 nextchar(pRExC_state);
fac92740 6217 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6218 break;
a0d0e21e 6219 case 'w':
eb160463 6220 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6221 *flagp |= HASWIDTH|SIMPLE;
830247a4 6222 nextchar(pRExC_state);
fac92740 6223 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6224 break;
6225 case 'W':
eb160463 6226 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6227 *flagp |= HASWIDTH|SIMPLE;
830247a4 6228 nextchar(pRExC_state);
fac92740 6229 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6230 break;
6231 case 'b':
830247a4
IZ
6232 RExC_seen_zerolen++;
6233 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6234 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6235 *flagp |= SIMPLE;
830247a4 6236 nextchar(pRExC_state);
fac92740 6237 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6238 break;
6239 case 'B':
830247a4
IZ
6240 RExC_seen_zerolen++;
6241 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6242 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6243 *flagp |= SIMPLE;
830247a4 6244 nextchar(pRExC_state);
fac92740 6245 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6246 break;
6247 case 's':
eb160463 6248 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6249 *flagp |= HASWIDTH|SIMPLE;
830247a4 6250 nextchar(pRExC_state);
fac92740 6251 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6252 break;
6253 case 'S':
eb160463 6254 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6255 *flagp |= HASWIDTH|SIMPLE;
830247a4 6256 nextchar(pRExC_state);
fac92740 6257 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6258 break;
6259 case 'd':
ffc61ed2 6260 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6261 *flagp |= HASWIDTH|SIMPLE;
830247a4 6262 nextchar(pRExC_state);
fac92740 6263 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6264 break;
6265 case 'D':
ffc61ed2 6266 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6267 *flagp |= HASWIDTH|SIMPLE;
830247a4 6268 nextchar(pRExC_state);
fac92740 6269 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6270 break;
a14b48bc
LW
6271 case 'p':
6272 case 'P':
3568d838 6273 {
3dab1dad 6274 char* const oldregxend = RExC_end;
ccb2c380 6275 char* parse_start = RExC_parse - 2;
a14b48bc 6276
830247a4 6277 if (RExC_parse[1] == '{') {
3568d838 6278 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6279 RExC_end = strchr(RExC_parse, '}');
6280 if (!RExC_end) {
3dab1dad 6281 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6282 RExC_parse += 2;
6283 RExC_end = oldregxend;
0da60cf5 6284 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6285 }
830247a4 6286 RExC_end++;
a14b48bc 6287 }
af6f566e 6288 else {
830247a4 6289 RExC_end = RExC_parse + 2;
af6f566e
HS
6290 if (RExC_end > oldregxend)
6291 RExC_end = oldregxend;
6292 }
830247a4 6293 RExC_parse--;
a14b48bc 6294
3dab1dad 6295 ret = regclass(pRExC_state,depth+1);
a14b48bc 6296
830247a4
IZ
6297 RExC_end = oldregxend;
6298 RExC_parse--;
ccb2c380
MP
6299
6300 Set_Node_Offset(ret, parse_start + 2);
6301 Set_Node_Cur_Length(ret);
830247a4 6302 nextchar(pRExC_state);
a14b48bc
LW
6303 *flagp |= HASWIDTH|SIMPLE;
6304 }
6305 break;
fc8cd66c
YO
6306 case 'N':
6307 /* Handle \N{NAME} here and not below because it can be
6308 multicharacter. join_exact() will join them up later on.
6309 Also this makes sure that things like /\N{BLAH}+/ and
6310 \N{BLAH} being multi char Just Happen. dmq*/
6311 ++RExC_parse;
6312 ret= reg_namedseq(pRExC_state, NULL);
6313 break;
0a4db386 6314 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6315 {
6316 char ch= RExC_parse[1];
6317 if (ch != '<' && ch != '\'') {
6318 if (SIZE_ONLY)
6319 vWARN( RExC_parse + 1,
6320 "Possible broken named back reference treated as literal k");
6321 parse_start--;
6322 goto defchar;
6323 } else {
6324 char* name_start = (RExC_parse += 2);
2eccd3b2 6325 U32 num = 0;
0a4db386
YO
6326 SV *sv_dat = reg_scan_name(pRExC_state,
6327 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6328 ch= (ch == '<') ? '>' : '\'';
6329
81714fb9
YO
6330 if (RExC_parse == name_start || *RExC_parse != ch)
6331 vFAIL2("Sequence \\k%c... not terminated",
6332 (ch == '>' ? '<' : ch));
6333
6334 RExC_sawback = 1;
6335 ret = reganode(pRExC_state,
6336 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6337 num);
6338 *flagp |= HASWIDTH;
6339
6340
6341 if (!SIZE_ONLY) {
81714fb9
YO
6342 num = add_data( pRExC_state, 1, "S" );
6343 ARG_SET(ret,num);
f8fc2ecf 6344 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6345 SvREFCNT_inc(sv_dat);
6346 }
6347 /* override incorrect value set in reganode MJD */
6348 Set_Node_Offset(ret, parse_start+1);
6349 Set_Node_Cur_Length(ret); /* MJD */
6350 nextchar(pRExC_state);
6351
6352 }
6353 break;
6354 }
a0d0e21e
LW
6355 case 'n':
6356 case 'r':
6357 case 't':
6358 case 'f':
6359 case 'e':
6360 case 'a':
6361 case 'x':
6362 case 'c':
6363 case '0':
6364 goto defchar;
c74340f9 6365 case 'R':
a0d0e21e
LW
6366 case '1': case '2': case '3': case '4':
6367 case '5': case '6': case '7': case '8': case '9':
6368 {
c74340f9
YO
6369 I32 num;
6370 bool isrel=(*RExC_parse=='R');
6371 if (isrel)
6372 RExC_parse++;
6373 num = atoi(RExC_parse);
6374 if (isrel) {
6375 num = RExC_cpar - num;
6376 if (num < 1)
6377 vFAIL("Reference to nonexistent or unclosed group");
6378 }
830247a4 6379 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6380 goto defchar;
6381 else {
3dab1dad 6382 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6383 while (isDIGIT(*RExC_parse))
6384 RExC_parse++;
b45f050a 6385
c74340f9
YO
6386 if (!SIZE_ONLY) {
6387 if (num > (I32)RExC_rx->nparens)
6388 vFAIL("Reference to nonexistent group");
6389 /* People make this error all the time apparently.
6390 So we cant fail on it, even though we should
6391
6392 else if (num >= RExC_cpar)
6393 vFAIL("Reference to unclosed group will always match");
6394 */
6395 }
830247a4 6396 RExC_sawback = 1;
eb160463
GS
6397 ret = reganode(pRExC_state,
6398 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6399 num);
a0d0e21e 6400 *flagp |= HASWIDTH;
2af232bd 6401
fac92740 6402 /* override incorrect value set in reganode MJD */
2af232bd 6403 Set_Node_Offset(ret, parse_start+1);
fac92740 6404 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6405 RExC_parse--;
6406 nextchar(pRExC_state);
a0d0e21e
LW
6407 }
6408 }
6409 break;
6410 case '\0':
830247a4 6411 if (RExC_parse >= RExC_end)
b45f050a 6412 FAIL("Trailing \\");
a0d0e21e
LW
6413 /* FALL THROUGH */
6414 default:
a0288114 6415 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6416 back into the quick-grab loop below */
45948336 6417 parse_start--;
a0d0e21e
LW
6418 goto defchar;
6419 }
6420 break;
4633a7c4
LW
6421
6422 case '#':
bbe252da 6423 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6424 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6425 RExC_parse++;
830247a4 6426 if (RExC_parse < RExC_end)
4633a7c4
LW
6427 goto tryagain;
6428 }
6429 /* FALL THROUGH */
6430
a0d0e21e 6431 default: {
ba210ebe 6432 register STRLEN len;
58ae7d3f 6433 register UV ender;
a0d0e21e 6434 register char *p;
3dab1dad 6435 char *s;
80aecb99 6436 STRLEN foldlen;
89ebb4a3 6437 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6438
6439 parse_start = RExC_parse - 1;
a0d0e21e 6440
830247a4 6441 RExC_parse++;
a0d0e21e
LW
6442
6443 defchar:
58ae7d3f 6444 ender = 0;
eb160463
GS
6445 ret = reg_node(pRExC_state,
6446 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6447 s = STRING(ret);
830247a4
IZ
6448 for (len = 0, p = RExC_parse - 1;
6449 len < 127 && p < RExC_end;
a0d0e21e
LW
6450 len++)
6451 {
3dab1dad 6452 char * const oldp = p;
5b5a24f7 6453
bbe252da 6454 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6455 p = regwhite(p, RExC_end);
a0d0e21e
LW
6456 switch (*p) {
6457 case '^':
6458 case '$':
6459 case '.':
6460 case '[':
6461 case '(':
6462 case ')':
6463 case '|':
6464 goto loopdone;
6465 case '\\':
6466 switch (*++p) {
6467 case 'A':
1ed8eac0
JF
6468 case 'C':
6469 case 'X':
a0d0e21e
LW
6470 case 'G':
6471 case 'Z':
b85d18e9 6472 case 'z':
a0d0e21e
LW
6473 case 'w':
6474 case 'W':
6475 case 'b':
6476 case 'B':
6477 case 's':
6478 case 'S':
6479 case 'd':
6480 case 'D':
a14b48bc
LW
6481 case 'p':
6482 case 'P':
fc8cd66c 6483 case 'N':
c74340f9 6484 case 'R':
a0d0e21e
LW
6485 --p;
6486 goto loopdone;
6487 case 'n':
6488 ender = '\n';
6489 p++;
a687059c 6490 break;
a0d0e21e
LW
6491 case 'r':
6492 ender = '\r';
6493 p++;
a687059c 6494 break;
a0d0e21e
LW
6495 case 't':
6496 ender = '\t';
6497 p++;
a687059c 6498 break;
a0d0e21e
LW
6499 case 'f':
6500 ender = '\f';
6501 p++;
a687059c 6502 break;
a0d0e21e 6503 case 'e':
c7f1f016 6504 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6505 p++;
a687059c 6506 break;
a0d0e21e 6507 case 'a':
c7f1f016 6508 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6509 p++;
a687059c 6510 break;
a0d0e21e 6511 case 'x':
a0ed51b3 6512 if (*++p == '{') {
1df70142 6513 char* const e = strchr(p, '}');
b81d288d 6514
b45f050a 6515 if (!e) {
830247a4 6516 RExC_parse = p + 1;
b45f050a
JF
6517 vFAIL("Missing right brace on \\x{}");
6518 }
de5f0749 6519 else {
a4c04bdc
NC
6520 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6521 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6522 STRLEN numlen = e - p - 1;
53305cf1 6523 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6524 if (ender > 0xff)
6525 RExC_utf8 = 1;
a0ed51b3
LW
6526 p = e + 1;
6527 }
a0ed51b3
LW
6528 }
6529 else {
a4c04bdc 6530 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6531 STRLEN numlen = 2;
53305cf1 6532 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6533 p += numlen;
6534 }
9e08bc66
TS
6535 if (PL_encoding && ender < 0x100)
6536 goto recode_encoding;
a687059c 6537 break;
a0d0e21e
LW
6538 case 'c':
6539 p++;
bbce6d69 6540 ender = UCHARAT(p++);
6541 ender = toCTRL(ender);
a687059c 6542 break;
a0d0e21e
LW
6543 case '0': case '1': case '2': case '3':case '4':
6544 case '5': case '6': case '7': case '8':case '9':
6545 if (*p == '0' ||
830247a4 6546 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6547 I32 flags = 0;
1df70142 6548 STRLEN numlen = 3;
53305cf1 6549 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6550 p += numlen;
6551 }
6552 else {
6553 --p;
6554 goto loopdone;
a687059c 6555 }
9e08bc66
TS
6556 if (PL_encoding && ender < 0x100)
6557 goto recode_encoding;
6558 break;
6559 recode_encoding:
6560 {
6561 SV* enc = PL_encoding;
6562 ender = reg_recode((const char)(U8)ender, &enc);
6563 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6564 vWARN(p, "Invalid escape in the specified encoding");
6565 RExC_utf8 = 1;
6566 }
a687059c 6567 break;
a0d0e21e 6568 case '\0':
830247a4 6569 if (p >= RExC_end)
b45f050a 6570 FAIL("Trailing \\");
a687059c 6571 /* FALL THROUGH */
a0d0e21e 6572 default:
041457d9 6573 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6574 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6575 goto normal_default;
a0d0e21e
LW
6576 }
6577 break;
a687059c 6578 default:
a0ed51b3 6579 normal_default:
fd400ab9 6580 if (UTF8_IS_START(*p) && UTF) {
1df70142 6581 STRLEN numlen;
5e12f4fb 6582 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6583 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6584 p += numlen;
6585 }
6586 else
6587 ender = *p++;
a0d0e21e 6588 break;
a687059c 6589 }
bbe252da 6590 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6591 p = regwhite(p, RExC_end);
60a8b682
JH
6592 if (UTF && FOLD) {
6593 /* Prime the casefolded buffer. */
ac7e0132 6594 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6595 }
a0d0e21e
LW
6596 if (ISMULT2(p)) { /* Back off on ?+*. */
6597 if (len)
6598 p = oldp;
16ea2a2e 6599 else if (UTF) {
80aecb99 6600 if (FOLD) {
60a8b682 6601 /* Emit all the Unicode characters. */
1df70142 6602 STRLEN numlen;
80aecb99
JH
6603 for (foldbuf = tmpbuf;
6604 foldlen;
6605 foldlen -= numlen) {
6606 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6607 if (numlen > 0) {
71207a34 6608 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6609 s += unilen;
6610 len += unilen;
6611 /* In EBCDIC the numlen
6612 * and unilen can differ. */
9dc45d57 6613 foldbuf += numlen;
47654450
JH
6614 if (numlen >= foldlen)
6615 break;
9dc45d57
JH
6616 }
6617 else
6618 break; /* "Can't happen." */
80aecb99
JH
6619 }
6620 }
6621 else {
71207a34 6622 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6623 if (unilen > 0) {
0ebc6274
JH
6624 s += unilen;
6625 len += unilen;
9dc45d57 6626 }
80aecb99 6627 }
a0ed51b3 6628 }
a0d0e21e
LW
6629 else {
6630 len++;
eb160463 6631 REGC((char)ender, s++);
a0d0e21e
LW
6632 }
6633 break;
a687059c 6634 }
16ea2a2e 6635 if (UTF) {
80aecb99 6636 if (FOLD) {
60a8b682 6637 /* Emit all the Unicode characters. */
1df70142 6638 STRLEN numlen;
80aecb99
JH
6639 for (foldbuf = tmpbuf;
6640 foldlen;
6641 foldlen -= numlen) {
6642 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6643 if (numlen > 0) {
71207a34 6644 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6645 len += unilen;
6646 s += unilen;
6647 /* In EBCDIC the numlen
6648 * and unilen can differ. */
9dc45d57 6649 foldbuf += numlen;
47654450
JH
6650 if (numlen >= foldlen)
6651 break;
9dc45d57
JH
6652 }
6653 else
6654 break;
80aecb99
JH
6655 }
6656 }
6657 else {
71207a34 6658 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6659 if (unilen > 0) {
0ebc6274
JH
6660 s += unilen;
6661 len += unilen;
9dc45d57 6662 }
80aecb99
JH
6663 }
6664 len--;
a0ed51b3
LW
6665 }
6666 else
eb160463 6667 REGC((char)ender, s++);
a0d0e21e
LW
6668 }
6669 loopdone:
830247a4 6670 RExC_parse = p - 1;
fac92740 6671 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6672 nextchar(pRExC_state);
793db0cb
JH
6673 {
6674 /* len is STRLEN which is unsigned, need to copy to signed */
6675 IV iv = len;
6676 if (iv < 0)
6677 vFAIL("Internal disaster");
6678 }
a0d0e21e
LW
6679 if (len > 0)
6680 *flagp |= HASWIDTH;
090f7165 6681 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6682 *flagp |= SIMPLE;
3dab1dad 6683
cd439c50 6684 if (SIZE_ONLY)
830247a4 6685 RExC_size += STR_SZ(len);
3dab1dad
YO
6686 else {
6687 STR_LEN(ret) = len;
830247a4 6688 RExC_emit += STR_SZ(len);
07be1b83 6689 }
3dab1dad 6690 }
a0d0e21e
LW
6691 break;
6692 }
a687059c 6693
a0d0e21e 6694 return(ret);
a687059c
LW
6695}
6696
873ef191 6697STATIC char *
5f66b61c 6698S_regwhite(char *p, const char *e)
5b5a24f7
CS
6699{
6700 while (p < e) {
6701 if (isSPACE(*p))
6702 ++p;
6703 else if (*p == '#') {
6704 do {
6705 p++;
6706 } while (p < e && *p != '\n');
6707 }
6708 else
6709 break;
6710 }
6711 return p;
6712}
6713
b8c5462f
JH
6714/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6715 Character classes ([:foo:]) can also be negated ([:^foo:]).
6716 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6717 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6718 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6719
6720#define POSIXCC_DONE(c) ((c) == ':')
6721#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6722#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6723
b8c5462f 6724STATIC I32
830247a4 6725S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6726{
97aff369 6727 dVAR;
936ed897 6728 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6729
830247a4 6730 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6731 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6732 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6733 const char c = UCHARAT(RExC_parse);
097eb12c 6734 char* const s = RExC_parse++;
b81d288d 6735
9a86a77b 6736 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6737 RExC_parse++;
6738 if (RExC_parse == RExC_end)
620e46c5 6739 /* Grandfather lone [:, [=, [. */
830247a4 6740 RExC_parse = s;
620e46c5 6741 else {
3dab1dad 6742 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6743 assert(*t == c);
6744
9a86a77b 6745 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6746 const char *posixcc = s + 1;
830247a4 6747 RExC_parse++; /* skip over the ending ] */
3dab1dad 6748
b8c5462f 6749 if (*s == ':') {
1df70142
AL
6750 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6751 const I32 skip = t - posixcc;
80916619
NC
6752
6753 /* Initially switch on the length of the name. */
6754 switch (skip) {
6755 case 4:
3dab1dad
YO
6756 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6757 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6758 break;
80916619
NC
6759 case 5:
6760 /* Names all of length 5. */
6761 /* alnum alpha ascii blank cntrl digit graph lower
6762 print punct space upper */
6763 /* Offset 4 gives the best switch position. */
6764 switch (posixcc[4]) {
6765 case 'a':
3dab1dad
YO
6766 if (memEQ(posixcc, "alph", 4)) /* alpha */
6767 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6768 break;
6769 case 'e':
3dab1dad
YO
6770 if (memEQ(posixcc, "spac", 4)) /* space */
6771 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6772 break;
6773 case 'h':
3dab1dad
YO
6774 if (memEQ(posixcc, "grap", 4)) /* graph */
6775 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6776 break;
6777 case 'i':
3dab1dad
YO
6778 if (memEQ(posixcc, "asci", 4)) /* ascii */
6779 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6780 break;
6781 case 'k':
3dab1dad
YO
6782 if (memEQ(posixcc, "blan", 4)) /* blank */
6783 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6784 break;
6785 case 'l':
3dab1dad
YO
6786 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6787 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6788 break;
6789 case 'm':
3dab1dad
YO
6790 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6791 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6792 break;
6793 case 'r':
3dab1dad
YO
6794 if (memEQ(posixcc, "lowe", 4)) /* lower */
6795 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6796 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6797 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6798 break;
6799 case 't':
3dab1dad
YO
6800 if (memEQ(posixcc, "digi", 4)) /* digit */
6801 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6802 else if (memEQ(posixcc, "prin", 4)) /* print */
6803 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6804 else if (memEQ(posixcc, "punc", 4)) /* punct */
6805 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6806 break;
b8c5462f
JH
6807 }
6808 break;
80916619 6809 case 6:
3dab1dad
YO
6810 if (memEQ(posixcc, "xdigit", 6))
6811 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6812 break;
6813 }
80916619
NC
6814
6815 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6816 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6817 t - s - 1, s + 1);
80916619
NC
6818 assert (posixcc[skip] == ':');
6819 assert (posixcc[skip+1] == ']');
b45f050a 6820 } else if (!SIZE_ONLY) {
b8c5462f 6821 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6822
830247a4 6823 /* adjust RExC_parse so the warning shows after
b45f050a 6824 the class closes */
9a86a77b 6825 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6826 RExC_parse++;
b45f050a
JF
6827 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6828 }
b8c5462f
JH
6829 } else {
6830 /* Maternal grandfather:
6831 * "[:" ending in ":" but not in ":]" */
830247a4 6832 RExC_parse = s;
767d463e 6833 }
620e46c5
JH
6834 }
6835 }
6836
b8c5462f
JH
6837 return namedclass;
6838}
6839
6840STATIC void
830247a4 6841S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6842{
97aff369 6843 dVAR;
3dab1dad 6844 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6845 const char *s = RExC_parse;
6846 const char c = *s++;
b8c5462f 6847
3dab1dad 6848 while (isALNUM(*s))
b8c5462f
JH
6849 s++;
6850 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6851 if (ckWARN(WARN_REGEXP))
6852 vWARN3(s+2,
6853 "POSIX syntax [%c %c] belongs inside character classes",
6854 c, c);
b45f050a
JF
6855
6856 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6857 if (POSIXCC_NOTYET(c)) {
830247a4 6858 /* adjust RExC_parse so the error shows after
b45f050a 6859 the class closes */
9a86a77b 6860 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6861 NOOP;
b45f050a
JF
6862 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6863 }
b8c5462f
JH
6864 }
6865 }
620e46c5
JH
6866}
6867
7f6f358c
YO
6868
6869/*
6870 parse a class specification and produce either an ANYOF node that
6871 matches the pattern. If the pattern matches a single char only and
6872 that char is < 256 then we produce an EXACT node instead.
6873*/
76e3520e 6874STATIC regnode *
3dab1dad 6875S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6876{
97aff369 6877 dVAR;
9ef43ace 6878 register UV value = 0;
9a86a77b 6879 register UV nextvalue;
3568d838 6880 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6881 register IV range = 0;
c277df42 6882 register regnode *ret;
ba210ebe 6883 STRLEN numlen;
ffc61ed2 6884 IV namedclass;
cbbf8932 6885 char *rangebegin = NULL;
936ed897 6886 bool need_class = 0;
c445ea15 6887 SV *listsv = NULL;
ffc61ed2 6888 UV n;
9e55ce06 6889 bool optimize_invert = TRUE;
cbbf8932 6890 AV* unicode_alternate = NULL;
1b2d223b
JH
6891#ifdef EBCDIC
6892 UV literal_endpoint = 0;
6893#endif
7f6f358c 6894 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6895
3dab1dad 6896 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6897 case we need to change the emitted regop to an EXACT. */
07be1b83 6898 const char * orig_parse = RExC_parse;
72f13be8 6899 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6900#ifndef DEBUGGING
6901 PERL_UNUSED_ARG(depth);
6902#endif
72f13be8 6903
3dab1dad 6904 DEBUG_PARSE("clas");
7f6f358c
YO
6905
6906 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6907 ret = reganode(pRExC_state, ANYOF, 0);
6908
6909 if (!SIZE_ONLY)
6910 ANYOF_FLAGS(ret) = 0;
6911
9a86a77b 6912 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6913 RExC_naughty++;
6914 RExC_parse++;
6915 if (!SIZE_ONLY)
6916 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6917 }
a0d0e21e 6918
73060fc4 6919 if (SIZE_ONLY) {
830247a4 6920 RExC_size += ANYOF_SKIP;
73060fc4
JH
6921 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6922 }
936ed897 6923 else {
830247a4 6924 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6925 if (FOLD)
6926 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6927 if (LOC)
6928 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6929 ANYOF_BITMAP_ZERO(ret);
396482e1 6930 listsv = newSVpvs("# comment\n");
a0d0e21e 6931 }
b8c5462f 6932
9a86a77b
JH
6933 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6934
b938889d 6935 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6936 checkposixcc(pRExC_state);
b8c5462f 6937
f064b6ad
HS
6938 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6939 if (UCHARAT(RExC_parse) == ']')
6940 goto charclassloop;
ffc61ed2 6941
fc8cd66c 6942parseit:
9a86a77b 6943 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6944
6945 charclassloop:
6946
6947 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6948
73b437c8 6949 if (!range)
830247a4 6950 rangebegin = RExC_parse;
ffc61ed2 6951 if (UTF) {
5e12f4fb 6952 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6953 RExC_end - RExC_parse,
9f7f3913 6954 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6955 RExC_parse += numlen;
6956 }
6957 else
6958 value = UCHARAT(RExC_parse++);
7f6f358c 6959
9a86a77b
JH
6960 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6961 if (value == '[' && POSIXCC(nextvalue))
830247a4 6962 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6963 else if (value == '\\') {
ffc61ed2 6964 if (UTF) {
5e12f4fb 6965 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6966 RExC_end - RExC_parse,
9f7f3913 6967 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6968 RExC_parse += numlen;
6969 }
6970 else
6971 value = UCHARAT(RExC_parse++);
470c3474 6972 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6973 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6974 * be a problem later if we want switch on Unicode.
6975 * A similar issue a little bit later when switching on
6976 * namedclass. --jhi */
ffc61ed2 6977 switch ((I32)value) {
b8c5462f
JH
6978 case 'w': namedclass = ANYOF_ALNUM; break;
6979 case 'W': namedclass = ANYOF_NALNUM; break;
6980 case 's': namedclass = ANYOF_SPACE; break;
6981 case 'S': namedclass = ANYOF_NSPACE; break;
6982 case 'd': namedclass = ANYOF_DIGIT; break;
6983 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6984 case 'N': /* Handle \N{NAME} in class */
6985 {
6986 /* We only pay attention to the first char of
6987 multichar strings being returned. I kinda wonder
6988 if this makes sense as it does change the behaviour
6989 from earlier versions, OTOH that behaviour was broken
6990 as well. */
6991 UV v; /* value is register so we cant & it /grrr */
6992 if (reg_namedseq(pRExC_state, &v)) {
6993 goto parseit;
6994 }
6995 value= v;
6996 }
6997 break;
ffc61ed2
JH
6998 case 'p':
6999 case 'P':
3dab1dad
YO
7000 {
7001 char *e;
af6f566e 7002 if (RExC_parse >= RExC_end)
2a4859cd 7003 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7004 if (*RExC_parse == '{') {
1df70142 7005 const U8 c = (U8)value;
ffc61ed2
JH
7006 e = strchr(RExC_parse++, '}');
7007 if (!e)
0da60cf5 7008 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7009 while (isSPACE(UCHARAT(RExC_parse)))
7010 RExC_parse++;
7011 if (e == RExC_parse)
0da60cf5 7012 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7013 n = e - RExC_parse;
ab13f0c7
JH
7014 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7015 n--;
ffc61ed2
JH
7016 }
7017 else {
7018 e = RExC_parse;
7019 n = 1;
7020 }
7021 if (!SIZE_ONLY) {
ab13f0c7
JH
7022 if (UCHARAT(RExC_parse) == '^') {
7023 RExC_parse++;
7024 n--;
7025 value = value == 'p' ? 'P' : 'p'; /* toggle */
7026 while (isSPACE(UCHARAT(RExC_parse))) {
7027 RExC_parse++;
7028 n--;
7029 }
7030 }
097eb12c
AL
7031 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7032 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7033 }
7034 RExC_parse = e + 1;
7035 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7036 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7037 }
f81125e2 7038 break;
b8c5462f
JH
7039 case 'n': value = '\n'; break;
7040 case 'r': value = '\r'; break;
7041 case 't': value = '\t'; break;
7042 case 'f': value = '\f'; break;
7043 case 'b': value = '\b'; break;
c7f1f016
NIS
7044 case 'e': value = ASCII_TO_NATIVE('\033');break;
7045 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7046 case 'x':
ffc61ed2 7047 if (*RExC_parse == '{') {
a4c04bdc
NC
7048 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7049 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7050 char * const e = strchr(RExC_parse++, '}');
b81d288d 7051 if (!e)
ffc61ed2 7052 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7053
7054 numlen = e - RExC_parse;
7055 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7056 RExC_parse = e + 1;
7057 }
7058 else {
a4c04bdc 7059 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7060 numlen = 2;
7061 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7062 RExC_parse += numlen;
7063 }
9e08bc66
TS
7064 if (PL_encoding && value < 0x100)
7065 goto recode_encoding;
b8c5462f
JH
7066 break;
7067 case 'c':
830247a4 7068 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7069 value = toCTRL(value);
7070 break;
7071 case '0': case '1': case '2': case '3': case '4':
7072 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7073 {
7074 I32 flags = 0;
7075 numlen = 3;
7076 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7077 RExC_parse += numlen;
7078 if (PL_encoding && value < 0x100)
7079 goto recode_encoding;
7080 break;
7081 }
7082 recode_encoding:
7083 {
7084 SV* enc = PL_encoding;
7085 value = reg_recode((const char)(U8)value, &enc);
7086 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7087 vWARN(RExC_parse,
7088 "Invalid escape in the specified encoding");
7089 break;
7090 }
1028017a 7091 default:
041457d9 7092 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7093 vWARN2(RExC_parse,
7094 "Unrecognized escape \\%c in character class passed through",
7095 (int)value);
1028017a 7096 break;
b8c5462f 7097 }
ffc61ed2 7098 } /* end of \blah */
1b2d223b
JH
7099#ifdef EBCDIC
7100 else
7101 literal_endpoint++;
7102#endif
ffc61ed2
JH
7103
7104 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7105
7106 if (!SIZE_ONLY && !need_class)
936ed897 7107 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7108
936ed897 7109 need_class = 1;
ffc61ed2
JH
7110
7111 /* a bad range like a-\d, a-[:digit:] ? */
7112 if (range) {
73b437c8 7113 if (!SIZE_ONLY) {
afd78fd5 7114 if (ckWARN(WARN_REGEXP)) {
097eb12c 7115 const int w =
afd78fd5
JH
7116 RExC_parse >= rangebegin ?
7117 RExC_parse - rangebegin : 0;
830247a4 7118 vWARN4(RExC_parse,
b45f050a 7119 "False [] range \"%*.*s\"",
097eb12c 7120 w, w, rangebegin);
afd78fd5 7121 }
3568d838
JH
7122 if (prevvalue < 256) {
7123 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7124 ANYOF_BITMAP_SET(ret, '-');
7125 }
7126 else {
7127 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7128 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7129 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7130 }
b8c5462f 7131 }
ffc61ed2
JH
7132
7133 range = 0; /* this was not a true range */
73b437c8 7134 }
ffc61ed2 7135
73b437c8 7136 if (!SIZE_ONLY) {
c49a72a9
NC
7137 const char *what = NULL;
7138 char yesno = 0;
7139
3568d838
JH
7140 if (namedclass > OOB_NAMEDCLASS)
7141 optimize_invert = FALSE;
e2962f66
JH
7142 /* Possible truncation here but in some 64-bit environments
7143 * the compiler gets heartburn about switch on 64-bit values.
7144 * A similar issue a little earlier when switching on value.
98f323fa 7145 * --jhi */
e2962f66 7146 switch ((I32)namedclass) {
73b437c8
JH
7147 case ANYOF_ALNUM:
7148 if (LOC)
936ed897 7149 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7150 else {
7151 for (value = 0; value < 256; value++)
7152 if (isALNUM(value))
936ed897 7153 ANYOF_BITMAP_SET(ret, value);
73b437c8 7154 }
c49a72a9
NC
7155 yesno = '+';
7156 what = "Word";
73b437c8
JH
7157 break;
7158 case ANYOF_NALNUM:
7159 if (LOC)
936ed897 7160 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7161 else {
7162 for (value = 0; value < 256; value++)
7163 if (!isALNUM(value))
936ed897 7164 ANYOF_BITMAP_SET(ret, value);
73b437c8 7165 }
c49a72a9
NC
7166 yesno = '!';
7167 what = "Word";
73b437c8 7168 break;
ffc61ed2 7169 case ANYOF_ALNUMC:
73b437c8 7170 if (LOC)
ffc61ed2 7171 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7172 else {
7173 for (value = 0; value < 256; value++)
ffc61ed2 7174 if (isALNUMC(value))
936ed897 7175 ANYOF_BITMAP_SET(ret, value);
73b437c8 7176 }
c49a72a9
NC
7177 yesno = '+';
7178 what = "Alnum";
73b437c8
JH
7179 break;
7180 case ANYOF_NALNUMC:
7181 if (LOC)
936ed897 7182 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7183 else {
7184 for (value = 0; value < 256; value++)
7185 if (!isALNUMC(value))
936ed897 7186 ANYOF_BITMAP_SET(ret, value);
73b437c8 7187 }
c49a72a9
NC
7188 yesno = '!';
7189 what = "Alnum";
73b437c8
JH
7190 break;
7191 case ANYOF_ALPHA:
7192 if (LOC)
936ed897 7193 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7194 else {
7195 for (value = 0; value < 256; value++)
7196 if (isALPHA(value))
936ed897 7197 ANYOF_BITMAP_SET(ret, value);
73b437c8 7198 }
c49a72a9
NC
7199 yesno = '+';
7200 what = "Alpha";
73b437c8
JH
7201 break;
7202 case ANYOF_NALPHA:
7203 if (LOC)
936ed897 7204 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7205 else {
7206 for (value = 0; value < 256; value++)
7207 if (!isALPHA(value))
936ed897 7208 ANYOF_BITMAP_SET(ret, value);
73b437c8 7209 }
c49a72a9
NC
7210 yesno = '!';
7211 what = "Alpha";
73b437c8
JH
7212 break;
7213 case ANYOF_ASCII:
7214 if (LOC)
936ed897 7215 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7216 else {
c7f1f016 7217#ifndef EBCDIC
1ba5c669
JH
7218 for (value = 0; value < 128; value++)
7219 ANYOF_BITMAP_SET(ret, value);
7220#else /* EBCDIC */
ffbc6a93 7221 for (value = 0; value < 256; value++) {
3a3c4447
JH
7222 if (isASCII(value))
7223 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7224 }
1ba5c669 7225#endif /* EBCDIC */
73b437c8 7226 }
c49a72a9
NC
7227 yesno = '+';
7228 what = "ASCII";
73b437c8
JH
7229 break;
7230 case ANYOF_NASCII:
7231 if (LOC)
936ed897 7232 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7233 else {
c7f1f016 7234#ifndef EBCDIC
1ba5c669
JH
7235 for (value = 128; value < 256; value++)
7236 ANYOF_BITMAP_SET(ret, value);
7237#else /* EBCDIC */
ffbc6a93 7238 for (value = 0; value < 256; value++) {
3a3c4447
JH
7239 if (!isASCII(value))
7240 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7241 }
1ba5c669 7242#endif /* EBCDIC */
73b437c8 7243 }
c49a72a9
NC
7244 yesno = '!';
7245 what = "ASCII";
73b437c8 7246 break;
aaa51d5e
JF
7247 case ANYOF_BLANK:
7248 if (LOC)
7249 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7250 else {
7251 for (value = 0; value < 256; value++)
7252 if (isBLANK(value))
7253 ANYOF_BITMAP_SET(ret, value);
7254 }
c49a72a9
NC
7255 yesno = '+';
7256 what = "Blank";
aaa51d5e
JF
7257 break;
7258 case ANYOF_NBLANK:
7259 if (LOC)
7260 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7261 else {
7262 for (value = 0; value < 256; value++)
7263 if (!isBLANK(value))
7264 ANYOF_BITMAP_SET(ret, value);
7265 }
c49a72a9
NC
7266 yesno = '!';
7267 what = "Blank";
aaa51d5e 7268 break;
73b437c8
JH
7269 case ANYOF_CNTRL:
7270 if (LOC)
936ed897 7271 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7272 else {
7273 for (value = 0; value < 256; value++)
7274 if (isCNTRL(value))
936ed897 7275 ANYOF_BITMAP_SET(ret, value);
73b437c8 7276 }
c49a72a9
NC
7277 yesno = '+';
7278 what = "Cntrl";
73b437c8
JH
7279 break;
7280 case ANYOF_NCNTRL:
7281 if (LOC)
936ed897 7282 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7283 else {
7284 for (value = 0; value < 256; value++)
7285 if (!isCNTRL(value))
936ed897 7286 ANYOF_BITMAP_SET(ret, value);
73b437c8 7287 }
c49a72a9
NC
7288 yesno = '!';
7289 what = "Cntrl";
ffc61ed2
JH
7290 break;
7291 case ANYOF_DIGIT:
7292 if (LOC)
7293 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7294 else {
7295 /* consecutive digits assumed */
7296 for (value = '0'; value <= '9'; value++)
7297 ANYOF_BITMAP_SET(ret, value);
7298 }
c49a72a9
NC
7299 yesno = '+';
7300 what = "Digit";
ffc61ed2
JH
7301 break;
7302 case ANYOF_NDIGIT:
7303 if (LOC)
7304 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7305 else {
7306 /* consecutive digits assumed */
7307 for (value = 0; value < '0'; value++)
7308 ANYOF_BITMAP_SET(ret, value);
7309 for (value = '9' + 1; value < 256; value++)
7310 ANYOF_BITMAP_SET(ret, value);
7311 }
c49a72a9
NC
7312 yesno = '!';
7313 what = "Digit";
73b437c8
JH
7314 break;
7315 case ANYOF_GRAPH:
7316 if (LOC)
936ed897 7317 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7318 else {
7319 for (value = 0; value < 256; value++)
7320 if (isGRAPH(value))
936ed897 7321 ANYOF_BITMAP_SET(ret, value);
73b437c8 7322 }
c49a72a9
NC
7323 yesno = '+';
7324 what = "Graph";
73b437c8
JH
7325 break;
7326 case ANYOF_NGRAPH:
7327 if (LOC)
936ed897 7328 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7329 else {
7330 for (value = 0; value < 256; value++)
7331 if (!isGRAPH(value))
936ed897 7332 ANYOF_BITMAP_SET(ret, value);
73b437c8 7333 }
c49a72a9
NC
7334 yesno = '!';
7335 what = "Graph";
73b437c8
JH
7336 break;
7337 case ANYOF_LOWER:
7338 if (LOC)
936ed897 7339 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7340 else {
7341 for (value = 0; value < 256; value++)
7342 if (isLOWER(value))
936ed897 7343 ANYOF_BITMAP_SET(ret, value);
73b437c8 7344 }
c49a72a9
NC
7345 yesno = '+';
7346 what = "Lower";
73b437c8
JH
7347 break;
7348 case ANYOF_NLOWER:
7349 if (LOC)
936ed897 7350 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7351 else {
7352 for (value = 0; value < 256; value++)
7353 if (!isLOWER(value))
936ed897 7354 ANYOF_BITMAP_SET(ret, value);
73b437c8 7355 }
c49a72a9
NC
7356 yesno = '!';
7357 what = "Lower";
73b437c8
JH
7358 break;
7359 case ANYOF_PRINT:
7360 if (LOC)
936ed897 7361 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7362 else {
7363 for (value = 0; value < 256; value++)
7364 if (isPRINT(value))
936ed897 7365 ANYOF_BITMAP_SET(ret, value);
73b437c8 7366 }
c49a72a9
NC
7367 yesno = '+';
7368 what = "Print";
73b437c8
JH
7369 break;
7370 case ANYOF_NPRINT:
7371 if (LOC)
936ed897 7372 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7373 else {
7374 for (value = 0; value < 256; value++)
7375 if (!isPRINT(value))
936ed897 7376 ANYOF_BITMAP_SET(ret, value);
73b437c8 7377 }
c49a72a9
NC
7378 yesno = '!';
7379 what = "Print";
73b437c8 7380 break;
aaa51d5e
JF
7381 case ANYOF_PSXSPC:
7382 if (LOC)
7383 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7384 else {
7385 for (value = 0; value < 256; value++)
7386 if (isPSXSPC(value))
7387 ANYOF_BITMAP_SET(ret, value);
7388 }
c49a72a9
NC
7389 yesno = '+';
7390 what = "Space";
aaa51d5e
JF
7391 break;
7392 case ANYOF_NPSXSPC:
7393 if (LOC)
7394 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7395 else {
7396 for (value = 0; value < 256; value++)
7397 if (!isPSXSPC(value))
7398 ANYOF_BITMAP_SET(ret, value);
7399 }
c49a72a9
NC
7400 yesno = '!';
7401 what = "Space";
aaa51d5e 7402 break;
73b437c8
JH
7403 case ANYOF_PUNCT:
7404 if (LOC)
936ed897 7405 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7406 else {
7407 for (value = 0; value < 256; value++)
7408 if (isPUNCT(value))
936ed897 7409 ANYOF_BITMAP_SET(ret, value);
73b437c8 7410 }
c49a72a9
NC
7411 yesno = '+';
7412 what = "Punct";
73b437c8
JH
7413 break;
7414 case ANYOF_NPUNCT:
7415 if (LOC)
936ed897 7416 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7417 else {
7418 for (value = 0; value < 256; value++)
7419 if (!isPUNCT(value))
936ed897 7420 ANYOF_BITMAP_SET(ret, value);
73b437c8 7421 }
c49a72a9
NC
7422 yesno = '!';
7423 what = "Punct";
ffc61ed2
JH
7424 break;
7425 case ANYOF_SPACE:
7426 if (LOC)
7427 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7428 else {
7429 for (value = 0; value < 256; value++)
7430 if (isSPACE(value))
7431 ANYOF_BITMAP_SET(ret, value);
7432 }
c49a72a9
NC
7433 yesno = '+';
7434 what = "SpacePerl";
ffc61ed2
JH
7435 break;
7436 case ANYOF_NSPACE:
7437 if (LOC)
7438 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7439 else {
7440 for (value = 0; value < 256; value++)
7441 if (!isSPACE(value))
7442 ANYOF_BITMAP_SET(ret, value);
7443 }
c49a72a9
NC
7444 yesno = '!';
7445 what = "SpacePerl";
73b437c8
JH
7446 break;
7447 case ANYOF_UPPER:
7448 if (LOC)
936ed897 7449 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7450 else {
7451 for (value = 0; value < 256; value++)
7452 if (isUPPER(value))
936ed897 7453 ANYOF_BITMAP_SET(ret, value);
73b437c8 7454 }
c49a72a9
NC
7455 yesno = '+';
7456 what = "Upper";
73b437c8
JH
7457 break;
7458 case ANYOF_NUPPER:
7459 if (LOC)
936ed897 7460 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7461 else {
7462 for (value = 0; value < 256; value++)
7463 if (!isUPPER(value))
936ed897 7464 ANYOF_BITMAP_SET(ret, value);
73b437c8 7465 }
c49a72a9
NC
7466 yesno = '!';
7467 what = "Upper";
73b437c8
JH
7468 break;
7469 case ANYOF_XDIGIT:
7470 if (LOC)
936ed897 7471 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7472 else {
7473 for (value = 0; value < 256; value++)
7474 if (isXDIGIT(value))
936ed897 7475 ANYOF_BITMAP_SET(ret, value);
73b437c8 7476 }
c49a72a9
NC
7477 yesno = '+';
7478 what = "XDigit";
73b437c8
JH
7479 break;
7480 case ANYOF_NXDIGIT:
7481 if (LOC)
936ed897 7482 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7483 else {
7484 for (value = 0; value < 256; value++)
7485 if (!isXDIGIT(value))
936ed897 7486 ANYOF_BITMAP_SET(ret, value);
73b437c8 7487 }
c49a72a9
NC
7488 yesno = '!';
7489 what = "XDigit";
73b437c8 7490 break;
f81125e2
JP
7491 case ANYOF_MAX:
7492 /* this is to handle \p and \P */
7493 break;
73b437c8 7494 default:
b45f050a 7495 vFAIL("Invalid [::] class");
73b437c8 7496 break;
b8c5462f 7497 }
c49a72a9
NC
7498 if (what) {
7499 /* Strings such as "+utf8::isWord\n" */
7500 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7501 }
b8c5462f 7502 if (LOC)
936ed897 7503 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7504 continue;
a0d0e21e 7505 }
ffc61ed2
JH
7506 } /* end of namedclass \blah */
7507
a0d0e21e 7508 if (range) {
eb160463 7509 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7510 const int w = RExC_parse - rangebegin;
7511 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7512 range = 0; /* not a valid range */
73b437c8 7513 }
a0d0e21e
LW
7514 }
7515 else {
3568d838 7516 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7517 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7518 RExC_parse[1] != ']') {
7519 RExC_parse++;
ffc61ed2
JH
7520
7521 /* a bad range like \w-, [:word:]- ? */
7522 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7523 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7524 const int w =
afd78fd5
JH
7525 RExC_parse >= rangebegin ?
7526 RExC_parse - rangebegin : 0;
830247a4 7527 vWARN4(RExC_parse,
b45f050a 7528 "False [] range \"%*.*s\"",
097eb12c 7529 w, w, rangebegin);
afd78fd5 7530 }
73b437c8 7531 if (!SIZE_ONLY)
936ed897 7532 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7533 } else
ffc61ed2
JH
7534 range = 1; /* yeah, it's a range! */
7535 continue; /* but do it the next time */
a0d0e21e 7536 }
a687059c 7537 }
ffc61ed2 7538
93733859 7539 /* now is the next time */
07be1b83 7540 /*stored += (value - prevvalue + 1);*/
ae5c130c 7541 if (!SIZE_ONLY) {
3568d838 7542 if (prevvalue < 256) {
1df70142 7543 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7544 IV i;
3568d838 7545#ifdef EBCDIC
1b2d223b
JH
7546 /* In EBCDIC [\x89-\x91] should include
7547 * the \x8e but [i-j] should not. */
7548 if (literal_endpoint == 2 &&
7549 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7550 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7551 {
3568d838
JH
7552 if (isLOWER(prevvalue)) {
7553 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7554 if (isLOWER(i))
7555 ANYOF_BITMAP_SET(ret, i);
7556 } else {
3568d838 7557 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7558 if (isUPPER(i))
7559 ANYOF_BITMAP_SET(ret, i);
7560 }
8ada0baa 7561 }
ffc61ed2 7562 else
8ada0baa 7563#endif
07be1b83
YO
7564 for (i = prevvalue; i <= ceilvalue; i++) {
7565 if (!ANYOF_BITMAP_TEST(ret,i)) {
7566 stored++;
7567 ANYOF_BITMAP_SET(ret, i);
7568 }
7569 }
3568d838 7570 }
a5961de5 7571 if (value > 255 || UTF) {
1df70142
AL
7572 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7573 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7574 stored+=2; /* can't optimize this class */
ffc61ed2 7575 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7576 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7577 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7578 prevnatvalue, natvalue);
7579 }
7580 else if (prevnatvalue == natvalue) {
7581 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7582 if (FOLD) {
89ebb4a3 7583 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7584 STRLEN foldlen;
1df70142 7585 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7586
e294cc5d
JH
7587#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7588 if (RExC_precomp[0] == ':' &&
7589 RExC_precomp[1] == '[' &&
7590 (f == 0xDF || f == 0x92)) {
7591 f = NATIVE_TO_UNI(f);
7592 }
7593#endif
c840d2a2
JH
7594 /* If folding and foldable and a single
7595 * character, insert also the folded version
7596 * to the charclass. */
9e55ce06 7597 if (f != value) {
e294cc5d
JH
7598#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7599 if ((RExC_precomp[0] == ':' &&
7600 RExC_precomp[1] == '[' &&
7601 (f == 0xA2 &&
7602 (value == 0xFB05 || value == 0xFB06))) ?
7603 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7604 foldlen == (STRLEN)UNISKIP(f) )
7605#else
eb160463 7606 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7607#endif
9e55ce06
JH
7608 Perl_sv_catpvf(aTHX_ listsv,
7609 "%04"UVxf"\n", f);
7610 else {
7611 /* Any multicharacter foldings
7612 * require the following transform:
7613 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7614 * where E folds into "pq" and F folds
7615 * into "rst", all other characters
7616 * fold to single characters. We save
7617 * away these multicharacter foldings,
7618 * to be later saved as part of the
7619 * additional "s" data. */
7620 SV *sv;
7621
7622 if (!unicode_alternate)
7623 unicode_alternate = newAV();
7624 sv = newSVpvn((char*)foldbuf, foldlen);
7625 SvUTF8_on(sv);
7626 av_push(unicode_alternate, sv);
7627 }
7628 }
254ba52a 7629
60a8b682
JH
7630 /* If folding and the value is one of the Greek
7631 * sigmas insert a few more sigmas to make the
7632 * folding rules of the sigmas to work right.
7633 * Note that not all the possible combinations
7634 * are handled here: some of them are handled
9e55ce06
JH
7635 * by the standard folding rules, and some of
7636 * them (literal or EXACTF cases) are handled
7637 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7638 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7639 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7640 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7641 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7642 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7643 }
7644 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7645 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7646 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7647 }
7648 }
ffc61ed2 7649 }
1b2d223b
JH
7650#ifdef EBCDIC
7651 literal_endpoint = 0;
7652#endif
8ada0baa 7653 }
ffc61ed2
JH
7654
7655 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7656 }
ffc61ed2 7657
936ed897 7658 if (need_class) {
4f66b38d 7659 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7660 if (SIZE_ONLY)
830247a4 7661 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7662 else
830247a4 7663 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7664 }
ffc61ed2 7665
7f6f358c
YO
7666
7667 if (SIZE_ONLY)
7668 return ret;
7669 /****** !SIZE_ONLY AFTER HERE *********/
7670
7671 if( stored == 1 && value < 256
7672 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7673 ) {
7674 /* optimize single char class to an EXACT node
7675 but *only* when its not a UTF/high char */
07be1b83
YO
7676 const char * cur_parse= RExC_parse;
7677 RExC_emit = (regnode *)orig_emit;
7678 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7679 ret = reg_node(pRExC_state,
7680 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7681 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7682 *STRING(ret)= (char)value;
7683 STR_LEN(ret)= 1;
7684 RExC_emit += STR_SZ(1);
7685 return ret;
7686 }
ae5c130c 7687 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7688 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7689 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7690 ) {
a0ed51b3 7691 for (value = 0; value < 256; ++value) {
936ed897 7692 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7693 UV fold = PL_fold[value];
ffc61ed2
JH
7694
7695 if (fold != value)
7696 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7697 }
7698 }
936ed897 7699 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7700 }
ffc61ed2 7701
ae5c130c 7702 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7703 if (optimize_invert &&
ffc61ed2
JH
7704 /* If the only flag is inversion. */
7705 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7706 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7707 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7708 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7709 }
7f6f358c 7710 {
097eb12c 7711 AV * const av = newAV();
ffc61ed2 7712 SV *rv;
9e55ce06 7713 /* The 0th element stores the character class description
6a0407ee 7714 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7715 * to initialize the appropriate swash (which gets stored in
7716 * the 1st element), and also useful for dumping the regnode.
7717 * The 2nd element stores the multicharacter foldings,
6a0407ee 7718 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7719 av_store(av, 0, listsv);
7720 av_store(av, 1, NULL);
9e55ce06 7721 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7722 rv = newRV_noinc((SV*)av);
19860706 7723 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7724 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7725 ARG_SET(ret, n);
a0ed51b3 7726 }
a0ed51b3
LW
7727 return ret;
7728}
7729
76e3520e 7730STATIC char*
830247a4 7731S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7732{
097eb12c 7733 char* const retval = RExC_parse++;
a0d0e21e 7734
4633a7c4 7735 for (;;) {
830247a4
IZ
7736 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7737 RExC_parse[2] == '#') {
e994fd66
AE
7738 while (*RExC_parse != ')') {
7739 if (RExC_parse == RExC_end)
7740 FAIL("Sequence (?#... not terminated");
830247a4 7741 RExC_parse++;
e994fd66 7742 }
830247a4 7743 RExC_parse++;
4633a7c4
LW
7744 continue;
7745 }
bbe252da 7746 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7747 if (isSPACE(*RExC_parse)) {
7748 RExC_parse++;
748a9306
LW
7749 continue;
7750 }
830247a4 7751 else if (*RExC_parse == '#') {
e994fd66
AE
7752 while (RExC_parse < RExC_end)
7753 if (*RExC_parse++ == '\n') break;
748a9306
LW
7754 continue;
7755 }
748a9306 7756 }
4633a7c4 7757 return retval;
a0d0e21e 7758 }
a687059c
LW
7759}
7760
7761/*
c277df42 7762- reg_node - emit a node
a0d0e21e 7763*/
76e3520e 7764STATIC regnode * /* Location. */
830247a4 7765S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7766{
97aff369 7767 dVAR;
c277df42 7768 register regnode *ptr;
504618e9 7769 regnode * const ret = RExC_emit;
07be1b83 7770 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7771
c277df42 7772 if (SIZE_ONLY) {
830247a4
IZ
7773 SIZE_ALIGN(RExC_size);
7774 RExC_size += 1;
a0d0e21e
LW
7775 return(ret);
7776 }
e2e6a0f1
YO
7777#ifdef DEBUGGING
7778 if (OP(RExC_emit) == 255)
7779 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7780 reg_name[op], OP(RExC_emit));
7781#endif
c277df42 7782 NODE_ALIGN_FILL(ret);
a0d0e21e 7783 ptr = ret;
c277df42 7784 FILL_ADVANCE_NODE(ptr, op);
fac92740 7785 if (RExC_offsets) { /* MJD */
07be1b83 7786 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7787 "reg_node", __LINE__,
7788 reg_name[op],
07be1b83
YO
7789 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7790 ? "Overwriting end of array!\n" : "OK",
7791 (UV)(RExC_emit - RExC_emit_start),
7792 (UV)(RExC_parse - RExC_start),
7793 (UV)RExC_offsets[0]));
ccb2c380 7794 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7795 }
07be1b83 7796
830247a4 7797 RExC_emit = ptr;
a0d0e21e 7798 return(ret);
a687059c
LW
7799}
7800
7801/*
a0d0e21e
LW
7802- reganode - emit a node with an argument
7803*/
76e3520e 7804STATIC regnode * /* Location. */
830247a4 7805S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7806{
97aff369 7807 dVAR;
c277df42 7808 register regnode *ptr;
504618e9 7809 regnode * const ret = RExC_emit;
07be1b83 7810 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7811
c277df42 7812 if (SIZE_ONLY) {
830247a4
IZ
7813 SIZE_ALIGN(RExC_size);
7814 RExC_size += 2;
6bda09f9
YO
7815 /*
7816 We can't do this:
7817
7818 assert(2==regarglen[op]+1);
7819
7820 Anything larger than this has to allocate the extra amount.
7821 If we changed this to be:
7822
7823 RExC_size += (1 + regarglen[op]);
7824
7825 then it wouldn't matter. Its not clear what side effect
7826 might come from that so its not done so far.
7827 -- dmq
7828 */
a0d0e21e
LW
7829 return(ret);
7830 }
e2e6a0f1
YO
7831#ifdef DEBUGGING
7832 if (OP(RExC_emit) == 255)
7833 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7834#endif
c277df42 7835 NODE_ALIGN_FILL(ret);
a0d0e21e 7836 ptr = ret;
c277df42 7837 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7838 if (RExC_offsets) { /* MJD */
07be1b83 7839 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7840 "reganode",
ccb2c380
MP
7841 __LINE__,
7842 reg_name[op],
07be1b83 7843 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7844 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7845 (UV)(RExC_emit - RExC_emit_start),
7846 (UV)(RExC_parse - RExC_start),
7847 (UV)RExC_offsets[0]));
ccb2c380 7848 Set_Cur_Node_Offset;
fac92740
MJD
7849 }
7850
830247a4 7851 RExC_emit = ptr;
a0d0e21e 7852 return(ret);
fe14fcc3
LW
7853}
7854
7855/*
cd439c50 7856- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7857*/
71207a34
AL
7858STATIC STRLEN
7859S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7860{
97aff369 7861 dVAR;
71207a34 7862 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7863}
7864
7865/*
a0d0e21e
LW
7866- reginsert - insert an operator in front of already-emitted operand
7867*
7868* Means relocating the operand.
7869*/
76e3520e 7870STATIC void
6bda09f9 7871S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7872{
97aff369 7873 dVAR;
c277df42
IZ
7874 register regnode *src;
7875 register regnode *dst;
7876 register regnode *place;
504618e9 7877 const int offset = regarglen[(U8)op];
6bda09f9 7878 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7879 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7880/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7881 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7882 if (SIZE_ONLY) {
6bda09f9 7883 RExC_size += size;
a0d0e21e
LW
7884 return;
7885 }
a687059c 7886
830247a4 7887 src = RExC_emit;
6bda09f9 7888 RExC_emit += size;
830247a4 7889 dst = RExC_emit;
40d049e4 7890 if (RExC_open_parens) {
6bda09f9 7891 int paren;
6d99fb9b 7892 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7893 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7894 if ( RExC_open_parens[paren] >= opnd ) {
7895 DEBUG_PARSE_FMT("open"," - %d",size);
7896 RExC_open_parens[paren] += size;
7897 } else {
7898 DEBUG_PARSE_FMT("open"," - %s","ok");
7899 }
7900 if ( RExC_close_parens[paren] >= opnd ) {
7901 DEBUG_PARSE_FMT("close"," - %d",size);
7902 RExC_close_parens[paren] += size;
7903 } else {
7904 DEBUG_PARSE_FMT("close"," - %s","ok");
7905 }
7906 }
6bda09f9 7907 }
40d049e4 7908
fac92740 7909 while (src > opnd) {
c277df42 7910 StructCopy(--src, --dst, regnode);
fac92740 7911 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7912 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7913 "reg_insert",
ccb2c380
MP
7914 __LINE__,
7915 reg_name[op],
07be1b83
YO
7916 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7917 ? "Overwriting end of array!\n" : "OK",
7918 (UV)(src - RExC_emit_start),
7919 (UV)(dst - RExC_emit_start),
7920 (UV)RExC_offsets[0]));
ccb2c380
MP
7921 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7922 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7923 }
7924 }
7925
a0d0e21e
LW
7926
7927 place = opnd; /* Op node, where operand used to be. */
fac92740 7928 if (RExC_offsets) { /* MJD */
07be1b83 7929 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7930 "reginsert",
ccb2c380
MP
7931 __LINE__,
7932 reg_name[op],
07be1b83 7933 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7934 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7935 (UV)(place - RExC_emit_start),
7936 (UV)(RExC_parse - RExC_start),
786e8c11 7937 (UV)RExC_offsets[0]));
ccb2c380 7938 Set_Node_Offset(place, RExC_parse);
45948336 7939 Set_Node_Length(place, 1);
fac92740 7940 }
c277df42
IZ
7941 src = NEXTOPER(place);
7942 FILL_ADVANCE_NODE(place, op);
7943 Zero(src, offset, regnode);
a687059c
LW
7944}
7945
7946/*
c277df42 7947- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7948- SEE ALSO: regtail_study
a0d0e21e 7949*/
097eb12c 7950/* TODO: All three parms should be const */
76e3520e 7951STATIC void
3dab1dad 7952S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7953{
97aff369 7954 dVAR;
c277df42 7955 register regnode *scan;
72f13be8 7956 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7957#ifndef DEBUGGING
7958 PERL_UNUSED_ARG(depth);
7959#endif
a0d0e21e 7960
c277df42 7961 if (SIZE_ONLY)
a0d0e21e
LW
7962 return;
7963
7964 /* Find last node. */
7965 scan = p;
7966 for (;;) {
504618e9 7967 regnode * const temp = regnext(scan);
3dab1dad
YO
7968 DEBUG_PARSE_r({
7969 SV * const mysv=sv_newmortal();
7970 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7971 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7972 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7973 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7974 (temp == NULL ? "->" : ""),
7975 (temp == NULL ? reg_name[OP(val)] : "")
7976 );
3dab1dad
YO
7977 });
7978 if (temp == NULL)
7979 break;
7980 scan = temp;
7981 }
7982
7983 if (reg_off_by_arg[OP(scan)]) {
7984 ARG_SET(scan, val - scan);
7985 }
7986 else {
7987 NEXT_OFF(scan) = val - scan;
7988 }
7989}
7990
07be1b83 7991#ifdef DEBUGGING
3dab1dad
YO
7992/*
7993- regtail_study - set the next-pointer at the end of a node chain of p to val.
7994- Look for optimizable sequences at the same time.
7995- currently only looks for EXACT chains.
07be1b83
YO
7996
7997This is expermental code. The idea is to use this routine to perform
7998in place optimizations on branches and groups as they are constructed,
7999with the long term intention of removing optimization from study_chunk so
8000that it is purely analytical.
8001
8002Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8003to control which is which.
8004
3dab1dad
YO
8005*/
8006/* TODO: All four parms should be const */
07be1b83 8007
3dab1dad
YO
8008STATIC U8
8009S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8010{
8011 dVAR;
8012 register regnode *scan;
07be1b83
YO
8013 U8 exact = PSEUDO;
8014#ifdef EXPERIMENTAL_INPLACESCAN
8015 I32 min = 0;
8016#endif
8017
3dab1dad
YO
8018 GET_RE_DEBUG_FLAGS_DECL;
8019
07be1b83 8020
3dab1dad
YO
8021 if (SIZE_ONLY)
8022 return exact;
8023
8024 /* Find last node. */
8025
8026 scan = p;
8027 for (;;) {
8028 regnode * const temp = regnext(scan);
07be1b83
YO
8029#ifdef EXPERIMENTAL_INPLACESCAN
8030 if (PL_regkind[OP(scan)] == EXACT)
8031 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8032 return EXACT;
8033#endif
3dab1dad
YO
8034 if ( exact ) {
8035 switch (OP(scan)) {
8036 case EXACT:
8037 case EXACTF:
8038 case EXACTFL:
8039 if( exact == PSEUDO )
8040 exact= OP(scan);
07be1b83
YO
8041 else if ( exact != OP(scan) )
8042 exact= 0;
3dab1dad
YO
8043 case NOTHING:
8044 break;
8045 default:
8046 exact= 0;
8047 }
8048 }
8049 DEBUG_PARSE_r({
8050 SV * const mysv=sv_newmortal();
8051 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8052 regprop(RExC_rx, mysv, scan);
eaf3ca90 8053 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8054 SvPV_nolen_const(mysv),
eaf3ca90
YO
8055 REG_NODE_NUM(scan),
8056 reg_name[exact]);
3dab1dad 8057 });
a0d0e21e
LW
8058 if (temp == NULL)
8059 break;
8060 scan = temp;
8061 }
07be1b83
YO
8062 DEBUG_PARSE_r({
8063 SV * const mysv_val=sv_newmortal();
8064 DEBUG_PARSE_MSG("");
8065 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8066 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8067 SvPV_nolen_const(mysv_val),
8068 (IV)REG_NODE_NUM(val),
8069 (IV)(val - scan)
07be1b83
YO
8070 );
8071 });
c277df42
IZ
8072 if (reg_off_by_arg[OP(scan)]) {
8073 ARG_SET(scan, val - scan);
a0ed51b3
LW
8074 }
8075 else {
c277df42
IZ
8076 NEXT_OFF(scan) = val - scan;
8077 }
3dab1dad
YO
8078
8079 return exact;
a687059c 8080}
07be1b83 8081#endif
a687059c
LW
8082
8083/*
a687059c
LW
8084 - regcurly - a little FSA that accepts {\d+,?\d*}
8085 */
79072805 8086STATIC I32
5f66b61c 8087S_regcurly(register const char *s)
a687059c
LW
8088{
8089 if (*s++ != '{')
8090 return FALSE;
f0fcb552 8091 if (!isDIGIT(*s))
a687059c 8092 return FALSE;
f0fcb552 8093 while (isDIGIT(*s))
a687059c
LW
8094 s++;
8095 if (*s == ',')
8096 s++;
f0fcb552 8097 while (isDIGIT(*s))
a687059c
LW
8098 s++;
8099 if (*s != '}')
8100 return FALSE;
8101 return TRUE;
8102}
8103
a687059c
LW
8104
8105/*
fd181c75 8106 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8107 */
8108void
097eb12c 8109Perl_regdump(pTHX_ const regexp *r)
a687059c 8110{
35ff7856 8111#ifdef DEBUGGING
97aff369 8112 dVAR;
c445ea15 8113 SV * const sv = sv_newmortal();
ab3bbdeb 8114 SV *dsv= sv_newmortal();
f8fc2ecf 8115 RXi_GET_DECL(r,ri);
a687059c 8116
f8fc2ecf 8117 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8118
8119 /* Header fields of interest. */
ab3bbdeb
YO
8120 if (r->anchored_substr) {
8121 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8122 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8123 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8124 "anchored %s%s at %"IVdf" ",
8125 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8126 (IV)r->anchored_offset);
ab3bbdeb
YO
8127 } else if (r->anchored_utf8) {
8128 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8129 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8130 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8131 "anchored utf8 %s%s at %"IVdf" ",
8132 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8133 (IV)r->anchored_offset);
ab3bbdeb
YO
8134 }
8135 if (r->float_substr) {
8136 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8137 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8138 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8139 "floating %s%s at %"IVdf"..%"UVuf" ",
8140 s, RE_SV_TAIL(r->float_substr),
7b0972df 8141 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8142 } else if (r->float_utf8) {
8143 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8144 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8145 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8146 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8147 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8148 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8149 }
33b8afdf 8150 if (r->check_substr || r->check_utf8)
b81d288d 8151 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8152 (const char *)
8153 (r->check_substr == r->float_substr
8154 && r->check_utf8 == r->float_utf8
8155 ? "(checking floating" : "(checking anchored"));
bbe252da 8156 if (r->extflags & RXf_NOSCAN)
c277df42 8157 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8158 if (r->extflags & RXf_CHECK_ALL)
c277df42 8159 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8160 if (r->check_substr || r->check_utf8)
c277df42
IZ
8161 PerlIO_printf(Perl_debug_log, ") ");
8162
f8fc2ecf
YO
8163 if (ri->regstclass) {
8164 regprop(r, sv, ri->regstclass);
1de06328 8165 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8166 }
bbe252da 8167 if (r->extflags & RXf_ANCH) {
774d564b 8168 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8169 if (r->extflags & RXf_ANCH_BOL)
774d564b 8170 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8171 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8172 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8173 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8174 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8175 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8176 PerlIO_printf(Perl_debug_log, "(GPOS)");
8177 PerlIO_putc(Perl_debug_log, ' ');
8178 }
bbe252da 8179 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8180 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8181 if (r->intflags & PREGf_SKIP)
760ac839 8182 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8183 if (r->intflags & PREGf_IMPLICIT)
760ac839 8184 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8185 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8186 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8187 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8188 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8189#else
96a5add6 8190 PERL_UNUSED_CONTEXT;
65e66c80 8191 PERL_UNUSED_ARG(r);
17c3b450 8192#endif /* DEBUGGING */
a687059c
LW
8193}
8194
8195/*
a0d0e21e
LW
8196- regprop - printable representation of opcode
8197*/
46fc3d4c 8198void
32fc9b6a 8199Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8200{
35ff7856 8201#ifdef DEBUGGING
97aff369 8202 dVAR;
9b155405 8203 register int k;
f8fc2ecf 8204 RXi_GET_DECL(prog,progi);
1de06328 8205 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8206
a0d0e21e 8207
54dc92de 8208 sv_setpvn(sv, "", 0);
8aa23a47 8209
03363afd 8210 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8211 /* It would be nice to FAIL() here, but this may be called from
8212 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8213 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8214 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8215
3dab1dad 8216 k = PL_regkind[OP(o)];
9b155405 8217
2a782b5b 8218 if (k == EXACT) {
396482e1 8219 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8220 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8221 * is a crude hack but it may be the best for now since
8222 * we have no flag "this EXACTish node was UTF-8"
8223 * --jhi */
8224 const char * const s =
ddc5bc0f 8225 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8226 PL_colors[0], PL_colors[1],
8227 PERL_PV_ESCAPE_UNI_DETECT |
8228 PERL_PV_PRETTY_ELIPSES |
8229 PERL_PV_PRETTY_LTGT
8230 );
8231 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8232 } else if (k == TRIE) {
3dab1dad 8233 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8234 * progi->data isn't available here */
1de06328
YO
8235 const char op = OP(o);
8236 const I32 n = ARG(o);
8237 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8238 (reg_ac_data *)progi->data->data[n] :
1de06328 8239 NULL;
3251b653
NC
8240 const reg_trie_data * const trie
8241 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8242
8243 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8244 DEBUG_TRIE_COMPILE_r(
8245 Perl_sv_catpvf(aTHX_ sv,
8246 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8247 (UV)trie->startstate,
1e2e3d02 8248 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8249 (UV)trie->wordcount,
8250 (UV)trie->minlen,
8251 (UV)trie->maxlen,
8252 (UV)TRIE_CHARCOUNT(trie),
8253 (UV)trie->uniquecharcount
8254 )
8255 );
8256 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8257 int i;
8258 int rangestart = -1;
f46cb337 8259 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8260 Perl_sv_catpvf(aTHX_ sv, "[");
8261 for (i = 0; i <= 256; i++) {
8262 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8263 if (rangestart == -1)
8264 rangestart = i;
8265 } else if (rangestart != -1) {
8266 if (i <= rangestart + 3)
8267 for (; rangestart < i; rangestart++)
8268 put_byte(sv, rangestart);
8269 else {
8270 put_byte(sv, rangestart);
8271 sv_catpvs(sv, "-");
8272 put_byte(sv, i - 1);
8273 }
8274 rangestart = -1;
8275 }
8276 }
8277 Perl_sv_catpvf(aTHX_ sv, "]");
8278 }
8279
a3621e74 8280 } else if (k == CURLY) {
cb434fcc 8281 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8282 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8283 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8284 }
2c2d71f5
JH
8285 else if (k == WHILEM && o->flags) /* Ordinal/of */
8286 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8287 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8288 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8289 else if (k == GOSUB)
6bda09f9 8290 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8291 else if (k == VERB) {
8292 if (!o->flags)
8293 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8294 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8295 } else if (k == LOGICAL)
04ebc1ab 8296 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8297 else if (k == ANYOF) {
8298 int i, rangestart = -1;
2d03de9c 8299 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8300
8301 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8302 static const char * const anyofs[] = {
653099ff
GS
8303 "\\w",
8304 "\\W",
8305 "\\s",
8306 "\\S",
8307 "\\d",
8308 "\\D",
8309 "[:alnum:]",
8310 "[:^alnum:]",
8311 "[:alpha:]",
8312 "[:^alpha:]",
8313 "[:ascii:]",
8314 "[:^ascii:]",
8315 "[:ctrl:]",
8316 "[:^ctrl:]",
8317 "[:graph:]",
8318 "[:^graph:]",
8319 "[:lower:]",
8320 "[:^lower:]",
8321 "[:print:]",
8322 "[:^print:]",
8323 "[:punct:]",
8324 "[:^punct:]",
8325 "[:upper:]",
aaa51d5e 8326 "[:^upper:]",
653099ff 8327 "[:xdigit:]",
aaa51d5e
JF
8328 "[:^xdigit:]",
8329 "[:space:]",
8330 "[:^space:]",
8331 "[:blank:]",
8332 "[:^blank:]"
653099ff
GS
8333 };
8334
19860706 8335 if (flags & ANYOF_LOCALE)
396482e1 8336 sv_catpvs(sv, "{loc}");
19860706 8337 if (flags & ANYOF_FOLD)
396482e1 8338 sv_catpvs(sv, "{i}");
653099ff 8339 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8340 if (flags & ANYOF_INVERT)
396482e1 8341 sv_catpvs(sv, "^");
ffc61ed2
JH
8342 for (i = 0; i <= 256; i++) {
8343 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8344 if (rangestart == -1)
8345 rangestart = i;
8346 } else if (rangestart != -1) {
8347 if (i <= rangestart + 3)
8348 for (; rangestart < i; rangestart++)
653099ff 8349 put_byte(sv, rangestart);
ffc61ed2
JH
8350 else {
8351 put_byte(sv, rangestart);
396482e1 8352 sv_catpvs(sv, "-");
ffc61ed2 8353 put_byte(sv, i - 1);
653099ff 8354 }
ffc61ed2 8355 rangestart = -1;
653099ff 8356 }
847a199f 8357 }
ffc61ed2
JH
8358
8359 if (o->flags & ANYOF_CLASS)
bb7a0f54 8360 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8361 if (ANYOF_CLASS_TEST(o,i))
8362 sv_catpv(sv, anyofs[i]);
8363
8364 if (flags & ANYOF_UNICODE)
396482e1 8365 sv_catpvs(sv, "{unicode}");
1aa99e6b 8366 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8367 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8368
8369 {
8370 SV *lv;
32fc9b6a 8371 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8372
ffc61ed2
JH
8373 if (lv) {
8374 if (sw) {
89ebb4a3 8375 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8376
ffc61ed2 8377 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8378 uvchr_to_utf8(s, i);
ffc61ed2 8379
3568d838 8380 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8381 if (rangestart == -1)
8382 rangestart = i;
8383 } else if (rangestart != -1) {
ffc61ed2
JH
8384 if (i <= rangestart + 3)
8385 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8386 const U8 * const e = uvchr_to_utf8(s,rangestart);
8387 U8 *p;
8388 for(p = s; p < e; p++)
ffc61ed2
JH
8389 put_byte(sv, *p);
8390 }
8391 else {
2d03de9c
AL
8392 const U8 *e = uvchr_to_utf8(s,rangestart);
8393 U8 *p;
8394 for (p = s; p < e; p++)
ffc61ed2 8395 put_byte(sv, *p);
396482e1 8396 sv_catpvs(sv, "-");
2d03de9c
AL
8397 e = uvchr_to_utf8(s, i-1);
8398 for (p = s; p < e; p++)
1df70142 8399 put_byte(sv, *p);
ffc61ed2
JH
8400 }
8401 rangestart = -1;
8402 }
19860706 8403 }
ffc61ed2 8404
396482e1 8405 sv_catpvs(sv, "..."); /* et cetera */
19860706 8406 }
fde631ed 8407
ffc61ed2 8408 {
2e0de35c 8409 char *s = savesvpv(lv);
c445ea15 8410 char * const origs = s;
b81d288d 8411
3dab1dad
YO
8412 while (*s && *s != '\n')
8413 s++;
b81d288d 8414
ffc61ed2 8415 if (*s == '\n') {
2d03de9c 8416 const char * const t = ++s;
ffc61ed2
JH
8417
8418 while (*s) {
8419 if (*s == '\n')
8420 *s = ' ';
8421 s++;
8422 }
8423 if (s[-1] == ' ')
8424 s[-1] = 0;
8425
8426 sv_catpv(sv, t);
fde631ed 8427 }
b81d288d 8428
ffc61ed2 8429 Safefree(origs);
fde631ed
JH
8430 }
8431 }
653099ff 8432 }
ffc61ed2 8433
653099ff
GS
8434 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8435 }
9b155405 8436 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8437 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8438#else
96a5add6 8439 PERL_UNUSED_CONTEXT;
65e66c80
SP
8440 PERL_UNUSED_ARG(sv);
8441 PERL_UNUSED_ARG(o);
f9049ba1 8442 PERL_UNUSED_ARG(prog);
17c3b450 8443#endif /* DEBUGGING */
35ff7856 8444}
a687059c 8445
cad2e5aa
JH
8446SV *
8447Perl_re_intuit_string(pTHX_ regexp *prog)
8448{ /* Assume that RE_INTUIT is set */
97aff369 8449 dVAR;
a3621e74 8450 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8451 PERL_UNUSED_CONTEXT;
8452
a3621e74 8453 DEBUG_COMPILE_r(
cfd0369c 8454 {
2d03de9c 8455 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8456 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8457
8458 if (!PL_colorset) reginitcolors();
8459 PerlIO_printf(Perl_debug_log,
a0288114 8460 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8461 PL_colors[4],
8462 prog->check_substr ? "" : "utf8 ",
8463 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8464 s,
8465 PL_colors[1],
8466 (strlen(s) > 60 ? "..." : ""));
8467 } );
8468
33b8afdf 8469 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8470}
8471
84da74a7
YO
8472/*
8473 pregfree - free a regexp
8474
8475 See regdupe below if you change anything here.
8476*/
8477
2b69d0c2 8478void
864dbfa3 8479Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8480{
27da23d5 8481 dVAR;
f8fc2ecf 8482 RXi_GET_DECL(r,ri);
fc32ee4a 8483 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8484
7821416a
IZ
8485 if (!r || (--r->refcnt > 0))
8486 return;
ab3bbdeb 8487 DEBUG_COMPILE_r({
0df25f3d
YO
8488 if (!PL_colorset)
8489 reginitcolors();
5c9f2f80
YO
8490 {
8491 SV *dsv= sv_newmortal();
bbe252da 8492 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
ab3bbdeb
YO
8493 dsv, r->precomp, r->prelen, 60);
8494 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8495 PL_colors[4],PL_colors[5],s);
8496 }
9e55ce06 8497 });
cad2e5aa 8498
43c5f42d
NC
8499 /* gcov results gave these as non-null 100% of the time, so there's no
8500 optimisation in checking them before calling Safefree */
8501 Safefree(r->precomp);
f8fc2ecf 8502 Safefree(ri->offsets); /* 20010421 MJD */
ed252734 8503 RX_MATCH_COPY_FREE(r);
f8c7b90f 8504#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8505 if (r->saved_copy)
8506 SvREFCNT_dec(r->saved_copy);
8507#endif
a193d654
GS
8508 if (r->substrs) {
8509 if (r->anchored_substr)
8510 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8511 if (r->anchored_utf8)
8512 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8513 if (r->float_substr)
8514 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8515 if (r->float_utf8)
8516 SvREFCNT_dec(r->float_utf8);
2779dcf1 8517 Safefree(r->substrs);
a193d654 8518 }
81714fb9
YO
8519 if (r->paren_names)
8520 SvREFCNT_dec(r->paren_names);
f8fc2ecf
YO
8521 if (ri->data) {
8522 int n = ri->data->count;
f3548bdc
DM
8523 PAD* new_comppad = NULL;
8524 PAD* old_comppad;
4026c95a 8525 PADOFFSET refcnt;
dfad63ad 8526
c277df42 8527 while (--n >= 0) {
261faec3 8528 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8529 switch (ri->data->what[n]) {
c277df42 8530 case 's':
81714fb9 8531 case 'S':
55eed653 8532 case 'u':
f8fc2ecf 8533 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8534 break;
653099ff 8535 case 'f':
f8fc2ecf 8536 Safefree(ri->data->data[n]);
653099ff 8537 break;
dfad63ad 8538 case 'p':
f8fc2ecf 8539 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8540 break;
c277df42 8541 case 'o':
dfad63ad 8542 if (new_comppad == NULL)
cea2e8a9 8543 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8544 PAD_SAVE_LOCAL(old_comppad,
8545 /* Watch out for global destruction's random ordering. */
c445ea15 8546 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8547 );
b34c0dd4 8548 OP_REFCNT_LOCK;
f8fc2ecf 8549 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8550 OP_REFCNT_UNLOCK;
8551 if (!refcnt)
f8fc2ecf 8552 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8553
f3548bdc 8554 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8555 SvREFCNT_dec((SV*)new_comppad);
8556 new_comppad = NULL;
c277df42
IZ
8557 break;
8558 case 'n':
9e55ce06 8559 break;
07be1b83 8560 case 'T':
be8e71aa
YO
8561 { /* Aho Corasick add-on structure for a trie node.
8562 Used in stclass optimization only */
07be1b83 8563 U32 refcount;
f8fc2ecf 8564 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8565 OP_REFCNT_LOCK;
8566 refcount = --aho->refcount;
8567 OP_REFCNT_UNLOCK;
8568 if ( !refcount ) {
446bd890
NC
8569 PerlMemShared_free(aho->states);
8570 PerlMemShared_free(aho->fail);
446bd890
NC
8571 /* do this last!!!! */
8572 PerlMemShared_free(ri->data->data[n]);
8573 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8574 }
8575 }
8576 break;
a3621e74 8577 case 't':
07be1b83 8578 {
be8e71aa 8579 /* trie structure. */
07be1b83 8580 U32 refcount;
f8fc2ecf 8581 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8582 OP_REFCNT_LOCK;
8583 refcount = --trie->refcount;
8584 OP_REFCNT_UNLOCK;
8585 if ( !refcount ) {
446bd890 8586 PerlMemShared_free(trie->charmap);
446bd890
NC
8587 PerlMemShared_free(trie->states);
8588 PerlMemShared_free(trie->trans);
07be1b83 8589 if (trie->bitmap)
446bd890 8590 PerlMemShared_free(trie->bitmap);
07be1b83 8591 if (trie->wordlen)
446bd890 8592 PerlMemShared_free(trie->wordlen);
786e8c11 8593 if (trie->jump)
446bd890 8594 PerlMemShared_free(trie->jump);
786e8c11 8595 if (trie->nextword)
446bd890 8596 PerlMemShared_free(trie->nextword);
446bd890
NC
8597 /* do this last!!!! */
8598 PerlMemShared_free(ri->data->data[n]);
a3621e74 8599 }
07be1b83
YO
8600 }
8601 break;
c277df42 8602 default:
f8fc2ecf 8603 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8604 }
8605 }
f8fc2ecf
YO
8606 Safefree(ri->data->what);
8607 Safefree(ri->data);
a0d0e21e
LW
8608 }
8609 Safefree(r->startp);
8610 Safefree(r->endp);
f8fc2ecf
YO
8611 if (ri->swap) {
8612 Safefree(ri->swap->startp);
8613 Safefree(ri->swap->endp);
8614 Safefree(ri->swap);
c74340f9 8615 }
f8fc2ecf 8616 Safefree(ri);
a0d0e21e 8617 Safefree(r);
a687059c 8618}
c277df42 8619
84da74a7
YO
8620#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8621#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8622#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8623#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8624
8625/*
8626 regdupe - duplicate a regexp.
8627
8628 This routine is called by sv.c's re_dup and is expected to clone a
8629 given regexp structure. It is a no-op when not under USE_ITHREADS.
8630 (Originally this *was* re_dup() for change history see sv.c)
8631
8632 See pregfree() above if you change anything here.
8633*/
a3c0e9ca 8634#if defined(USE_ITHREADS)
84da74a7
YO
8635regexp *
8636Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8637{
84da74a7 8638 dVAR;
f8fc2ecf
YO
8639 regexp *ret;
8640 regexp_internal *reti;
84da74a7
YO
8641 int i, len, npar;
8642 struct reg_substr_datum *s;
f8fc2ecf
YO
8643 RXi_GET_DECL(r,ri);
8644
84da74a7
YO
8645 if (!r)
8646 return (REGEXP *)NULL;
8647
8648 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8649 return ret;
8650
f8fc2ecf 8651 len = ri->offsets[0];
84da74a7
YO
8652 npar = r->nparens+1;
8653
f8fc2ecf
YO
8654 Newxz(ret, 1, regexp);
8655 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8656 RXi_SET(ret,reti);
8657 Copy(ri->program, reti->program, len+1, regnode);
84da74a7
YO
8658
8659 Newx(ret->startp, npar, I32);
8660 Copy(r->startp, ret->startp, npar, I32);
8661 Newx(ret->endp, npar, I32);
8662 Copy(r->startp, ret->startp, npar, I32);
f8fc2ecf
YO
8663 if(ri->swap) {
8664 Newx(reti->swap, 1, regexp_paren_ofs);
c74340f9 8665 /* no need to copy these */
f8fc2ecf
YO
8666 Newx(reti->swap->startp, npar, I32);
8667 Newx(reti->swap->endp, npar, I32);
c74340f9 8668 } else {
f8fc2ecf 8669 reti->swap = NULL;
c74340f9 8670 }
84da74a7
YO
8671
8672 Newx(ret->substrs, 1, struct reg_substr_data);
8673 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8674 s->min_offset = r->substrs->data[i].min_offset;
8675 s->max_offset = r->substrs->data[i].max_offset;
8676 s->end_shift = r->substrs->data[i].end_shift;
8677 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8678 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8679 }
8680
f8fc2ecf
YO
8681 reti->regstclass = NULL;
8682 if (ri->data) {
84da74a7 8683 struct reg_data *d;
f8fc2ecf 8684 const int count = ri->data->count;
84da74a7
YO
8685 int i;
8686
8687 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8688 char, struct reg_data);
8689 Newx(d->what, count, U8);
8690
8691 d->count = count;
8692 for (i = 0; i < count; i++) {
f8fc2ecf 8693 d->what[i] = ri->data->what[i];
84da74a7 8694 switch (d->what[i]) {
55eed653 8695 /* legal options are one of: sSfpontTu
84da74a7
YO
8696 see also regcomp.h and pregfree() */
8697 case 's':
81714fb9 8698 case 'S':
0536c0a7 8699 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8700 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8701 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8702 break;
84da74a7
YO
8703 case 'f':
8704 /* This is cheating. */
8705 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8706 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8707 struct regnode_charclass_class);
f8fc2ecf 8708 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8709 break;
8710 case 'o':
bbe252da
YO
8711 /* Compiled op trees are readonly and in shared memory,
8712 and can thus be shared without duplication. */
84da74a7 8713 OP_REFCNT_LOCK;
f8fc2ecf 8714 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8715 OP_REFCNT_UNLOCK;
8716 break;
23eab42c
NC
8717 case 'T':
8718 /* Trie stclasses are readonly and can thus be shared
8719 * without duplication. We free the stclass in pregfree
8720 * when the corresponding reg_ac_data struct is freed.
8721 */
8722 reti->regstclass= ri->regstclass;
8723 /* Fall through */
84da74a7 8724 case 't':
84da74a7 8725 OP_REFCNT_LOCK;
0536c0a7 8726 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8727 OP_REFCNT_UNLOCK;
0536c0a7
NC
8728 /* Fall through */
8729 case 'n':
8730 d->data[i] = ri->data->data[i];
84da74a7 8731 break;
84da74a7 8732 default:
f8fc2ecf 8733 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8734 }
8735 }
8736
f8fc2ecf 8737 reti->data = d;
84da74a7
YO
8738 }
8739 else
f8fc2ecf 8740 reti->data = NULL;
84da74a7 8741
f8fc2ecf
YO
8742 Newx(reti->offsets, 2*len+1, U32);
8743 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
84da74a7
YO
8744
8745 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8746 ret->refcnt = r->refcnt;
8747 ret->minlen = r->minlen;
de8c5301 8748 ret->minlenret = r->minlenret;
84da74a7
YO
8749 ret->prelen = r->prelen;
8750 ret->nparens = r->nparens;
8751 ret->lastparen = r->lastparen;
8752 ret->lastcloseparen = r->lastcloseparen;
bbe252da
YO
8753 ret->intflags = r->intflags;
8754 ret->extflags = r->extflags;
84da74a7
YO
8755
8756 ret->sublen = r->sublen;
8757
f9f4320a 8758 ret->engine = r->engine;
81714fb9
YO
8759
8760 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8761
84da74a7
YO
8762 if (RX_MATCH_COPIED(ret))
8763 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8764 else
8765 ret->subbeg = NULL;
8766#ifdef PERL_OLD_COPY_ON_WRITE
8767 ret->saved_copy = NULL;
8768#endif
8769
8770 ptr_table_store(PL_ptr_table, r, ret);
8771 return ret;
84da74a7 8772}
a3c0e9ca 8773#endif
84da74a7 8774
de8c5301
YO
8775/*
8776 reg_stringify()
8777
8778 converts a regexp embedded in a MAGIC struct to its stringified form,
8779 caching the converted form in the struct and returns the cached
8780 string.
8781
8782 If lp is nonnull then it is used to return the length of the
8783 resulting string
8784
8785 If flags is nonnull and the returned string contains UTF8 then
8786 (flags & 1) will be true.
8787
8788 If haseval is nonnull then it is used to return whether the pattern
8789 contains evals.
8790
8791 Normally called via macro:
8792
8793 CALLREG_STRINGIFY(mg,0,0);
8794
8795 And internally with
8796
8797 CALLREG_AS_STR(mg,lp,flags,haseval)
8798
8799 See sv_2pv_flags() in sv.c for an example of internal usage.
8800
8801 */
8802
8803char *
8804Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8805 dVAR;
8806 const regexp * const re = (regexp *)mg->mg_obj;
f8fc2ecf
YO
8807 RXi_GET_DECL(re,ri);
8808
de8c5301
YO
8809 if (!mg->mg_ptr) {
8810 const char *fptr = "msix";
8811 char reflags[6];
8812 char ch;
8813 int left = 0;
8814 int right = 4;
8815 bool need_newline = 0;
bbe252da 8816 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8817
8818 while((ch = *fptr++)) {
8819 if(reganch & 1) {
8820 reflags[left++] = ch;
8821 }
8822 else {
8823 reflags[right--] = ch;
8824 }
8825 reganch >>= 1;
8826 }
8827 if(left != 4) {
8828 reflags[left] = '-';
8829 left = 5;
8830 }
8831
8832 mg->mg_len = re->prelen + 4 + left;
8833 /*
8834 * If /x was used, we have to worry about a regex ending with a
8835 * comment later being embedded within another regex. If so, we don't
8836 * want this regex's "commentization" to leak out to the right part of
8837 * the enclosing regex, we must cap it with a newline.
8838 *
8839 * So, if /x was used, we scan backwards from the end of the regex. If
8840 * we find a '#' before we find a newline, we need to add a newline
8841 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8842 * we don't need to add anything. -jfriedl
8843 */
bbe252da 8844 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8845 const char *endptr = re->precomp + re->prelen;
8846 while (endptr >= re->precomp) {
8847 const char c = *(endptr--);
8848 if (c == '\n')
8849 break; /* don't need another */
8850 if (c == '#') {
8851 /* we end while in a comment, so we need a newline */
8852 mg->mg_len++; /* save space for it */
8853 need_newline = 1; /* note to add it */
8854 break;
8855 }
8856 }
8857 }
8858
8859 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8860 mg->mg_ptr[0] = '(';
8861 mg->mg_ptr[1] = '?';
8862 Copy(reflags, mg->mg_ptr+2, left, char);
8863 *(mg->mg_ptr+left+2) = ':';
8864 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8865 if (need_newline)
8866 mg->mg_ptr[mg->mg_len - 2] = '\n';
8867 mg->mg_ptr[mg->mg_len - 1] = ')';
8868 mg->mg_ptr[mg->mg_len] = 0;
8869 }
8870 if (haseval)
f8fc2ecf 8871 *haseval = ri->program[0].next_off;
de8c5301 8872 if (flags)
bbe252da 8873 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8874
8875 if (lp)
8876 *lp = mg->mg_len;
8877 return mg->mg_ptr;
8878}
8879
8880
76234dfb 8881#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8882/*
8883 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8884 */
8885regnode *
864dbfa3 8886Perl_regnext(pTHX_ register regnode *p)
c277df42 8887{
97aff369 8888 dVAR;
c277df42
IZ
8889 register I32 offset;
8890
f8fc2ecf 8891 if (!p)
c277df42
IZ
8892 return(NULL);
8893
8894 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8895 if (offset == 0)
8896 return(NULL);
8897
c277df42 8898 return(p+offset);
c277df42 8899}
76234dfb 8900#endif
c277df42 8901
01f988be 8902STATIC void
cea2e8a9 8903S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8904{
8905 va_list args;
8906 STRLEN l1 = strlen(pat1);
8907 STRLEN l2 = strlen(pat2);
8908 char buf[512];
06bf62c7 8909 SV *msv;
73d840c0 8910 const char *message;
c277df42
IZ
8911
8912 if (l1 > 510)
8913 l1 = 510;
8914 if (l1 + l2 > 510)
8915 l2 = 510 - l1;
8916 Copy(pat1, buf, l1 , char);
8917 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8918 buf[l1 + l2] = '\n';
8919 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8920#ifdef I_STDARG
8921 /* ANSI variant takes additional second argument */
c277df42 8922 va_start(args, pat2);
8736538c
AS
8923#else
8924 va_start(args);
8925#endif
5a844595 8926 msv = vmess(buf, &args);
c277df42 8927 va_end(args);
cfd0369c 8928 message = SvPV_const(msv,l1);
c277df42
IZ
8929 if (l1 > 512)
8930 l1 = 512;
8931 Copy(message, buf, l1 , char);
197cf9b9 8932 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8933 Perl_croak(aTHX_ "%s", buf);
c277df42 8934}
a0ed51b3
LW
8935
8936/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8937
76234dfb 8938#ifndef PERL_IN_XSUB_RE
a0ed51b3 8939void
864dbfa3 8940Perl_save_re_context(pTHX)
b81d288d 8941{
97aff369 8942 dVAR;
1ade1aa1
NC
8943
8944 struct re_save_state *state;
8945
8946 SAVEVPTR(PL_curcop);
8947 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8948
8949 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8950 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8951 SSPUSHINT(SAVEt_RE_STATE);
8952
46ab3289 8953 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8954
a0ed51b3 8955 PL_reg_start_tmp = 0;
a0ed51b3 8956 PL_reg_start_tmpl = 0;
c445ea15 8957 PL_reg_oldsaved = NULL;
a5db57d6 8958 PL_reg_oldsavedlen = 0;
a5db57d6 8959 PL_reg_maxiter = 0;
a5db57d6 8960 PL_reg_leftiter = 0;
c445ea15 8961 PL_reg_poscache = NULL;
a5db57d6 8962 PL_reg_poscache_size = 0;
1ade1aa1
NC
8963#ifdef PERL_OLD_COPY_ON_WRITE
8964 PL_nrs = NULL;
8965#endif
ada6e8a9 8966
c445ea15
AL
8967 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8968 if (PL_curpm) {
8969 const REGEXP * const rx = PM_GETRE(PL_curpm);
8970 if (rx) {
1df70142 8971 U32 i;
ada6e8a9 8972 for (i = 1; i <= rx->nparens; i++) {
1df70142 8973 char digits[TYPE_CHARS(long)];
d9fad198 8974 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8975 GV *const *const gvp
8976 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8977
b37c2d43
AL
8978 if (gvp) {
8979 GV * const gv = *gvp;
8980 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8981 save_scalar(gv);
49f27e4b 8982 }
ada6e8a9
AMS
8983 }
8984 }
8985 }
a0ed51b3 8986}
76234dfb 8987#endif
51371543 8988
51371543 8989static void
acfe0abc 8990clear_re(pTHX_ void *r)
51371543 8991{
97aff369 8992 dVAR;
51371543
GS
8993 ReREFCNT_dec((regexp *)r);
8994}
ffbc6a93 8995
a28509cc
AL
8996#ifdef DEBUGGING
8997
8998STATIC void
8999S_put_byte(pTHX_ SV *sv, int c)
9000{
9001 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9002 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9003 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9004 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9005 else
9006 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9007}
9008
786e8c11 9009
3dab1dad
YO
9010#define CLEAR_OPTSTART \
9011 if (optstart) STMT_START { \
70685ca0 9012 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9013 optstart=NULL; \
9014 } STMT_END
9015
786e8c11 9016#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9017
b5a2f8d8
NC
9018STATIC const regnode *
9019S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9020 const regnode *last, const regnode *plast,
9021 SV* sv, I32 indent, U32 depth)
a28509cc 9022{
97aff369 9023 dVAR;
786e8c11 9024 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9025 register const regnode *next;
3dab1dad 9026 const regnode *optstart= NULL;
f8fc2ecf 9027 RXi_GET_DECL(r,ri);
3dab1dad 9028 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9029
786e8c11
YO
9030#ifdef DEBUG_DUMPUNTIL
9031 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9032 last ? last-start : 0,plast ? plast-start : 0);
9033#endif
9034
9035 if (plast && plast < last)
9036 last= plast;
9037
9038 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9039 /* While that wasn't END last time... */
9040
9041 NODE_ALIGN(node);
9042 op = OP(node);
de734bd5 9043 if (op == CLOSE || op == WHILEM)
786e8c11 9044 indent--;
b5a2f8d8 9045 next = regnext((regnode *)node);
07be1b83 9046
a28509cc 9047 /* Where, what. */
8e11feef 9048 if (OP(node) == OPTIMIZED) {
e68ec53f 9049 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9050 optstart = node;
3dab1dad 9051 else
8e11feef 9052 goto after_print;
3dab1dad
YO
9053 } else
9054 CLEAR_OPTSTART;
07be1b83 9055
32fc9b6a 9056 regprop(r, sv, node);
a28509cc 9057 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9058 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9059
9060 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9061 if (next == NULL) /* Next ptr. */
9062 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9063 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9064 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9065 else
9066 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9067
1de06328 9068 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9069 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9070 }
9071
a28509cc
AL
9072 after_print:
9073 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9074 assert(next);
9075 {
9076 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9077 ? regnext((regnode *)next)
9078 : next);
be8e71aa
YO
9079 if (last && nnode > last)
9080 nnode = last;
786e8c11 9081 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9082 }
a28509cc
AL
9083 }
9084 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9085 assert(next);
786e8c11 9086 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9087 }
9088 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9089 const regnode *this_trie = node;
1de06328 9090 const char op = OP(node);
a28509cc 9091 const I32 n = ARG(node);
1de06328 9092 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9093 (reg_ac_data *)ri->data->data[n] :
1de06328 9094 NULL;
3251b653
NC
9095 const reg_trie_data * const trie =
9096 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9097#ifdef DEBUGGING
9098 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9099#endif
786e8c11 9100 const regnode *nextbranch= NULL;
a28509cc 9101 I32 word_idx;
1de06328 9102 sv_setpvn(sv, "", 0);
786e8c11 9103 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9104 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9105
9106 PerlIO_printf(Perl_debug_log, "%*s%s ",
9107 (int)(2*(indent+3)), "",
9108 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9109 PL_colors[0], PL_colors[1],
9110 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9111 PERL_PV_PRETTY_ELIPSES |
7f69552c 9112 PERL_PV_PRETTY_LTGT
786e8c11
YO
9113 )
9114 : "???"
9115 );
9116 if (trie->jump) {
40d049e4 9117 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9118 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9119 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9120 if (dist) {
9121 if (!nextbranch)
24b23f37 9122 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9123 DUMPUNTIL(this_trie + dist, nextbranch);
9124 }
786e8c11
YO
9125 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9126 nextbranch= regnext((regnode *)nextbranch);
9127 } else {
9128 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9129 }
786e8c11
YO
9130 }
9131 if (last && next > last)
9132 node= last;
9133 else
9134 node= next;
a28509cc 9135 }
786e8c11
YO
9136 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9137 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9138 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9139 }
9140 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9141 assert(next);
786e8c11 9142 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9143 }
9144 else if ( op == PLUS || op == STAR) {
786e8c11 9145 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9146 }
9147 else if (op == ANYOF) {
9148 /* arglen 1 + class block */
9149 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9150 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9151 node = NEXTOPER(node);
9152 }
9153 else if (PL_regkind[(U8)op] == EXACT) {
9154 /* Literal string, where present. */
9155 node += NODE_SZ_STR(node) - 1;
9156 node = NEXTOPER(node);
9157 }
9158 else {
9159 node = NEXTOPER(node);
9160 node += regarglen[(U8)op];
9161 }
9162 if (op == CURLYX || op == OPEN)
786e8c11 9163 indent++;
a28509cc 9164 }
3dab1dad 9165 CLEAR_OPTSTART;
786e8c11 9166#ifdef DEBUG_DUMPUNTIL
70685ca0 9167 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9168#endif
1de06328 9169 return node;
a28509cc
AL
9170}
9171
9172#endif /* DEBUGGING */
9173
241d1a3b
NC
9174/*
9175 * Local variables:
9176 * c-indentation-style: bsd
9177 * c-basic-offset: 4
9178 * indent-tabs-mode: t
9179 * End:
9180 *
37442d52
RGS
9181 * ex: set ts=8 sts=4 sw=4 noet:
9182 */