This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The PERLIO environment variable, when unset, doesn't default to :stdio.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
e2e6a0f1 159#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 164#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
165#define RExC_open_parens (pRExC_state->open_parens)
166#define RExC_close_parens (pRExC_state->close_parens)
167#define RExC_opend (pRExC_state->opend)
81714fb9 168#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
169#define RExC_recurse (pRExC_state->recurse)
170#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 171
a687059c
LW
172#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
173#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
174 ((*s) == '{' && regcurly(s)))
a687059c 175
35c8bce7
LW
176#ifdef SPSTART
177#undef SPSTART /* dratted cpp namespace... */
178#endif
a687059c
LW
179/*
180 * Flags to be passed up and down.
181 */
a687059c 182#define WORST 0 /* Worst case. */
821b33a5 183#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
184#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
185#define SPSTART 0x4 /* Starts with * or +. */
186#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 187
3dab1dad
YO
188#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189
07be1b83
YO
190/* whether trie related optimizations are enabled */
191#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
192#define TRIE_STUDY_OPT
786e8c11 193#define FULL_TRIE_STUDY
07be1b83
YO
194#define TRIE_STCLASS
195#endif
1de06328
YO
196
197
40d049e4
YO
198
199#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
200#define PBITVAL(paren) (1 << ((paren) & 7))
201#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
202#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
203#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
204
205
1de06328
YO
206/* About scan_data_t.
207
208 During optimisation we recurse through the regexp program performing
209 various inplace (keyhole style) optimisations. In addition study_chunk
210 and scan_commit populate this data structure with information about
211 what strings MUST appear in the pattern. We look for the longest
212 string that must appear for at a fixed location, and we look for the
213 longest string that may appear at a floating location. So for instance
214 in the pattern:
215
216 /FOO[xX]A.*B[xX]BAR/
217
218 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
219 strings (because they follow a .* construct). study_chunk will identify
220 both FOO and BAR as being the longest fixed and floating strings respectively.
221
222 The strings can be composites, for instance
223
224 /(f)(o)(o)/
225
226 will result in a composite fixed substring 'foo'.
227
228 For each string some basic information is maintained:
229
230 - offset or min_offset
231 This is the position the string must appear at, or not before.
232 It also implicitly (when combined with minlenp) tells us how many
233 character must match before the string we are searching.
234 Likewise when combined with minlenp and the length of the string
235 tells us how many characters must appear after the string we have
236 found.
237
238 - max_offset
239 Only used for floating strings. This is the rightmost point that
240 the string can appear at. Ifset to I32 max it indicates that the
241 string can occur infinitely far to the right.
242
243 - minlenp
244 A pointer to the minimum length of the pattern that the string
245 was found inside. This is important as in the case of positive
246 lookahead or positive lookbehind we can have multiple patterns
247 involved. Consider
248
249 /(?=FOO).*F/
250
251 The minimum length of the pattern overall is 3, the minimum length
252 of the lookahead part is 3, but the minimum length of the part that
253 will actually match is 1. So 'FOO's minimum length is 3, but the
254 minimum length for the F is 1. This is important as the minimum length
255 is used to determine offsets in front of and behind the string being
256 looked for. Since strings can be composites this is the length of the
257 pattern at the time it was commited with a scan_commit. Note that
258 the length is calculated by study_chunk, so that the minimum lengths
259 are not known until the full pattern has been compiled, thus the
260 pointer to the value.
261
262 - lookbehind
263
264 In the case of lookbehind the string being searched for can be
265 offset past the start point of the final matching string.
266 If this value was just blithely removed from the min_offset it would
267 invalidate some of the calculations for how many chars must match
268 before or after (as they are derived from min_offset and minlen and
269 the length of the string being searched for).
270 When the final pattern is compiled and the data is moved from the
271 scan_data_t structure into the regexp structure the information
272 about lookbehind is factored in, with the information that would
273 have been lost precalculated in the end_shift field for the
274 associated string.
275
276 The fields pos_min and pos_delta are used to store the minimum offset
277 and the delta to the maximum offset at the current point in the pattern.
278
279*/
2c2d71f5
JH
280
281typedef struct scan_data_t {
1de06328
YO
282 /*I32 len_min; unused */
283 /*I32 len_delta; unused */
2c2d71f5
JH
284 I32 pos_min;
285 I32 pos_delta;
286 SV *last_found;
1de06328 287 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
288 I32 last_start_min;
289 I32 last_start_max;
1de06328
YO
290 SV **longest; /* Either &l_fixed, or &l_float. */
291 SV *longest_fixed; /* longest fixed string found in pattern */
292 I32 offset_fixed; /* offset where it starts */
293 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
295 SV *longest_float; /* longest floating string found in pattern */
296 I32 offset_float_min; /* earliest point in string it can appear */
297 I32 offset_float_max; /* latest point in string it can appear */
298 I32 *minlen_float; /* pointer to the minlen relevent to the string */
299 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
300 I32 flags;
301 I32 whilem_c;
cb434fcc 302 I32 *last_closep;
653099ff 303 struct regnode_charclass_class *start_class;
2c2d71f5
JH
304} scan_data_t;
305
a687059c 306/*
e50aee73 307 * Forward declarations for pregcomp()'s friends.
a687059c 308 */
a0d0e21e 309
27da23d5 310static const scan_data_t zero_scan_data =
1de06328 311 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
312
313#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
314#define SF_BEFORE_SEOL 0x0001
315#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
316#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
317#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
318
09b7f37c
CB
319#ifdef NO_UNARY_PLUS
320# define SF_FIX_SHIFT_EOL (0+2)
321# define SF_FL_SHIFT_EOL (0+4)
322#else
323# define SF_FIX_SHIFT_EOL (+2)
324# define SF_FL_SHIFT_EOL (+4)
325#endif
c277df42
IZ
326
327#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
328#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329
330#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
331#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
332#define SF_IS_INF 0x0040
333#define SF_HAS_PAR 0x0080
334#define SF_IN_PAR 0x0100
335#define SF_HAS_EVAL 0x0200
336#define SCF_DO_SUBSTR 0x0400
653099ff
GS
337#define SCF_DO_STCLASS_AND 0x0800
338#define SCF_DO_STCLASS_OR 0x1000
339#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 340#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 341
786e8c11 342#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 343#define SCF_SEEN_ACCEPT 0x8000
07be1b83 344
eb160463 345#define UTF (RExC_utf8 != 0)
bbe252da
YO
346#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
347#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 348
ffc61ed2 349#define OOB_UNICODE 12345678
93733859 350#define OOB_NAMEDCLASS -1
b8c5462f 351
a0ed51b3
LW
352#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
353#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
354
8615cb43 355
b45f050a
JF
356/* length of regex to show in messages that don't mark a position within */
357#define RegexLengthToShowInErrorMessages 127
358
359/*
360 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
361 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
362 * op/pragma/warn/regcomp.
363 */
7253e4e3
RK
364#define MARKER1 "<-- HERE" /* marker as it appears in the description */
365#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 366
7253e4e3 367#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
368
369/*
370 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
371 * arg. Show regex, up to a maximum length. If it's too long, chop and add
372 * "...".
373 */
58e23c8d 374#define _FAIL(code) STMT_START { \
bfed75c6 375 const char *ellipses = ""; \
ccb2c380
MP
376 IV len = RExC_end - RExC_precomp; \
377 \
378 if (!SIZE_ONLY) \
379 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
380 if (len > RegexLengthToShowInErrorMessages) { \
381 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
382 len = RegexLengthToShowInErrorMessages - 10; \
383 ellipses = "..."; \
384 } \
58e23c8d 385 code; \
ccb2c380 386} STMT_END
8615cb43 387
58e23c8d
YO
388#define FAIL(msg) _FAIL( \
389 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
390 msg, (int)len, RExC_precomp, ellipses))
391
392#define FAIL2(msg,arg) _FAIL( \
393 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
394 arg, (int)len, RExC_precomp, ellipses))
395
b45f050a 396/*
b45f050a
JF
397 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
398 */
ccb2c380 399#define Simple_vFAIL(m) STMT_START { \
a28509cc 400 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
401 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
402 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
403} STMT_END
b45f050a
JF
404
405/*
406 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
407 */
ccb2c380
MP
408#define vFAIL(m) STMT_START { \
409 if (!SIZE_ONLY) \
410 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
411 Simple_vFAIL(m); \
412} STMT_END
b45f050a
JF
413
414/*
415 * Like Simple_vFAIL(), but accepts two arguments.
416 */
ccb2c380 417#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
420 (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
425 */
ccb2c380
MP
426#define vFAIL2(m,a1) STMT_START { \
427 if (!SIZE_ONLY) \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
429 Simple_vFAIL2(m, a1); \
430} STMT_END
b45f050a
JF
431
432
433/*
434 * Like Simple_vFAIL(), but accepts three arguments.
435 */
ccb2c380 436#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 437 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
438 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
439 (int)offset, RExC_precomp, RExC_precomp + offset); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
444 */
ccb2c380
MP
445#define vFAIL3(m,a1,a2) STMT_START { \
446 if (!SIZE_ONLY) \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
448 Simple_vFAIL3(m, a1, a2); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Like Simple_vFAIL(), but accepts four arguments.
453 */
ccb2c380 454#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a 459
ccb2c380 460#define vWARN(loc,m) STMT_START { \
a28509cc 461 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
462 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
463 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
465
466#define vWARNdep(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
469 "%s" REPORT_LOCATION, \
470 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473
474#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
9d1d55b5 497
8615cb43 498
cd439c50 499/* Allow for side effects in s */
ccb2c380
MP
500#define REGC(c,s) STMT_START { \
501 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
502} STMT_END
cd439c50 503
fac92740
MJD
504/* Macros for recording node offsets. 20001227 mjd@plover.com
505 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
506 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
507 * Element 0 holds the number n.
07be1b83 508 * Position is 1 indexed.
fac92740
MJD
509 */
510
ccb2c380
MP
511#define Set_Node_Offset_To_R(node,byte) STMT_START { \
512 if (! SIZE_ONLY) { \
513 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 514 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 515 if((node) < 0) { \
551405c4 516 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
517 } else { \
518 RExC_offsets[2*(node)-1] = (byte); \
519 } \
520 } \
521} STMT_END
522
523#define Set_Node_Offset(node,byte) \
524 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
525#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
526
527#define Set_Node_Length_To_R(node,len) STMT_START { \
528 if (! SIZE_ONLY) { \
529 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 530 __LINE__, (int)(node), (int)(len))); \
ccb2c380 531 if((node) < 0) { \
551405c4 532 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
533 } else { \
534 RExC_offsets[2*(node)] = (len); \
535 } \
536 } \
537} STMT_END
538
539#define Set_Node_Length(node,len) \
540 Set_Node_Length_To_R((node)-RExC_emit_start, len)
541#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
542#define Set_Node_Cur_Length(node) \
543 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
544
545/* Get offsets and lengths */
546#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
547#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
548
07be1b83
YO
549#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
550 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
551 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
552} STMT_END
553
554
555#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
556#define EXPERIMENTAL_INPLACESCAN
557#endif
558
304ee84b
YO
559#define DEBUG_STUDYDATA(str,data,depth) \
560DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 561 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
562 "%*s" str "Pos:%"IVdf"/%"IVdf \
563 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
564 (int)(depth)*2, "", \
565 (IV)((data)->pos_min), \
566 (IV)((data)->pos_delta), \
304ee84b 567 (UV)((data)->flags), \
1de06328 568 (IV)((data)->whilem_c), \
304ee84b
YO
569 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
570 is_inf ? "INF " : "" \
1de06328
YO
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
304ee84b 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3 617 }
304ee84b 618 else { /* *data->longest == data->longest_float */
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
304ee84b 623 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
304ee84b 645 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
2b8b4781
NC
801 dump_trie(trie,widecharmap,revcharmap)
802 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
3dab1dad
YO
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
818*/
819
820STATIC void
2b8b4781
NC
821S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
822 AV *revcharmap, U32 depth)
3dab1dad
YO
823{
824 U32 state;
ab3bbdeb 825 SV *sv=sv_newmortal();
55eed653 826 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 835 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
3dab1dad
YO
897 Dumps a fully constructed but uncompressed trie in list form.
898 List tries normally only are used for construction when the number of
899 possible chars (trie->uniquecharcount) is very high.
900 Used for debugging make_trie().
901*/
902STATIC void
55eed653 903S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
904 HV *widecharmap, AV *revcharmap, U32 next_alloc,
905 U32 depth)
3dab1dad
YO
906{
907 U32 state;
ab3bbdeb 908 SV *sv=sv_newmortal();
55eed653 909 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
910 GET_RE_DEBUG_FLAGS_DECL;
911 /* print out the table precompression. */
ab3bbdeb
YO
912 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
913 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
914 "------:-----+-----------------\n" );
3dab1dad
YO
915
916 for( state=1 ; state < next_alloc ; state ++ ) {
917 U16 charid;
918
ab3bbdeb 919 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
920 (int)depth * 2 + 2,"", (UV)state );
921 if ( ! trie->states[ state ].wordnum ) {
922 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 } else {
924 PerlIO_printf( Perl_debug_log, "W%4x| ",
925 trie->states[ state ].wordnum
926 );
927 }
928 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 929 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
930 if ( tmp ) {
931 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 colwidth,
ddc5bc0f 933 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
934 PL_colors[0], PL_colors[1],
935 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
936 PERL_PV_ESCAPE_FIRSTCHAR
937 ) ,
1e2e3d02
YO
938 TRIE_LIST_ITEM(state,charid).forid,
939 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 );
941 if (!(charid % 10))
664e119d
RGS
942 PerlIO_printf(Perl_debug_log, "\n%*s| ",
943 (int)((depth * 2) + 14), "");
1e2e3d02 944 }
ab3bbdeb
YO
945 }
946 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
947 }
948}
949
950/*
3dab1dad
YO
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
55eed653 957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
958 HV *widecharmap, AV *revcharmap, U32 next_alloc,
959 U32 depth)
3dab1dad
YO
960{
961 U32 state;
962 U16 charid;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
965 GET_RE_DEBUG_FLAGS_DECL;
966
967 /*
968 print out the table precompression so that we can do a visual check
969 that they are identical.
970 */
971
972 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
973
974 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 975 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 976 if ( tmp ) {
ab3bbdeb
YO
977 PerlIO_printf( Perl_debug_log, "%*s",
978 colwidth,
ddc5bc0f 979 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
980 PL_colors[0], PL_colors[1],
981 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
982 PERL_PV_ESCAPE_FIRSTCHAR
983 )
984 );
3dab1dad
YO
985 }
986 }
987
988 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
989
990 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 991 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
992 }
993
994 PerlIO_printf( Perl_debug_log, "\n" );
995
996 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
997
998 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
999 (int)depth * 2 + 2,"",
1000 (UV)TRIE_NODENUM( state ) );
1001
1002 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1003 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1004 if (v)
1005 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1006 else
1007 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1008 }
1009 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1011 } else {
1012 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1013 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1014 }
1015 }
07be1b83 1016}
3dab1dad
YO
1017
1018#endif
1019
786e8c11
YO
1020/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1021 startbranch: the first branch in the whole branch sequence
1022 first : start branch of sequence of branch-exact nodes.
1023 May be the same as startbranch
1024 last : Thing following the last branch.
1025 May be the same as tail.
1026 tail : item following the branch sequence
1027 count : words in the sequence
1028 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1029 depth : indent depth
3dab1dad 1030
786e8c11 1031Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1032
786e8c11
YO
1033A trie is an N'ary tree where the branches are determined by digital
1034decomposition of the key. IE, at the root node you look up the 1st character and
1035follow that branch repeat until you find the end of the branches. Nodes can be
1036marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1037
786e8c11 1038 /he|she|his|hers/
72f13be8 1039
786e8c11
YO
1040would convert into the following structure. Numbers represent states, letters
1041following numbers represent valid transitions on the letter from that state, if
1042the number is in square brackets it represents an accepting state, otherwise it
1043will be in parenthesis.
07be1b83 1044
786e8c11
YO
1045 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1046 | |
1047 | (2)
1048 | |
1049 (1) +-i->(6)-+-s->[7]
1050 |
1051 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1052
786e8c11
YO
1053 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1054
1055This shows that when matching against the string 'hers' we will begin at state 1
1056read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1057then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1058is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1059single traverse. We store a mapping from accepting to state to which word was
1060matched, and then when we have multiple possibilities we try to complete the
1061rest of the regex in the order in which they occured in the alternation.
1062
1063The only prior NFA like behaviour that would be changed by the TRIE support is
1064the silent ignoring of duplicate alternations which are of the form:
1065
1066 / (DUPE|DUPE) X? (?{ ... }) Y /x
1067
1068Thus EVAL blocks follwing a trie may be called a different number of times with
1069and without the optimisation. With the optimisations dupes will be silently
1070ignored. This inconsistant behaviour of EVAL type nodes is well established as
1071the following demonstrates:
1072
1073 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1074
1075which prints out 'word' three times, but
1076
1077 'words'=~/(word|word|word)(?{ print $1 })S/
1078
1079which doesnt print it out at all. This is due to other optimisations kicking in.
1080
1081Example of what happens on a structural level:
1082
1083The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1084
1085 1: CURLYM[1] {1,32767}(18)
1086 5: BRANCH(8)
1087 6: EXACT <ac>(16)
1088 8: BRANCH(11)
1089 9: EXACT <ad>(16)
1090 11: BRANCH(14)
1091 12: EXACT <ab>(16)
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096This would be optimizable with startbranch=5, first=5, last=16, tail=16
1097and should turn into:
1098
1099 1: CURLYM[1] {1,32767}(18)
1100 5: TRIE(16)
1101 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1102 <ac>
1103 <ad>
1104 <ab>
1105 16: SUCCEED(0)
1106 17: NOTHING(18)
1107 18: END(0)
1108
1109Cases where tail != last would be like /(?foo|bar)baz/:
1110
1111 1: BRANCH(4)
1112 2: EXACT <foo>(8)
1113 4: BRANCH(7)
1114 5: EXACT <bar>(8)
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119which would be optimizable with startbranch=1, first=1, last=7, tail=8
1120and would end up looking like:
1121
1122 1: TRIE(8)
1123 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1124 <foo>
1125 <bar>
1126 7: TAIL(8)
1127 8: EXACT <baz>(10)
1128 10: END(0)
1129
1130 d = uvuni_to_utf8_flags(d, uv, 0);
1131
1132is the recommended Unicode-aware way of saying
1133
1134 *(d++) = uv;
1135*/
1136
1e2e3d02 1137#define TRIE_STORE_REVCHAR \
786e8c11 1138 STMT_START { \
1e2e3d02 1139 SV *tmp = newSVpvs(""); \
786e8c11 1140 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1141 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1142 av_push( revcharmap, tmp ); \
786e8c11
YO
1143 } STMT_END
1144
1145#define TRIE_READ_CHAR STMT_START { \
1146 wordlen++; \
1147 if ( UTF ) { \
1148 if ( folder ) { \
1149 if ( foldlen > 0 ) { \
1150 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1151 foldlen -= len; \
1152 scan += len; \
1153 len = 0; \
1154 } else { \
1155 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1156 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1157 foldlen -= UNISKIP( uvc ); \
1158 scan = foldbuf + UNISKIP( uvc ); \
1159 } \
1160 } else { \
1161 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1162 } \
1163 } else { \
1164 uvc = (U32)*uc; \
1165 len = 1; \
1166 } \
1167} STMT_END
1168
1169
1170
1171#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1172 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1173 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1174 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1175 } \
1176 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1177 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1178 TRIE_LIST_CUR( state )++; \
1179} STMT_END
07be1b83 1180
786e8c11
YO
1181#define TRIE_LIST_NEW(state) STMT_START { \
1182 Newxz( trie->states[ state ].trans.list, \
1183 4, reg_trie_trans_le ); \
1184 TRIE_LIST_CUR( state ) = 1; \
1185 TRIE_LIST_LEN( state ) = 4; \
1186} STMT_END
07be1b83 1187
786e8c11
YO
1188#define TRIE_HANDLE_WORD(state) STMT_START { \
1189 U16 dupe= trie->states[ state ].wordnum; \
1190 regnode * const noper_next = regnext( noper ); \
1191 \
1192 if (trie->wordlen) \
1193 trie->wordlen[ curword ] = wordlen; \
1194 DEBUG_r({ \
1195 /* store the word for dumping */ \
1196 SV* tmp; \
1197 if (OP(noper) != NOTHING) \
1198 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1199 else \
1200 tmp = newSVpvn( "", 0 ); \
1201 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1202 av_push( trie_words, tmp ); \
786e8c11
YO
1203 }); \
1204 \
1205 curword++; \
1206 \
1207 if ( noper_next < tail ) { \
1208 if (!trie->jump) \
c944940b 1209 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1210 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1211 if (!jumper) \
1212 jumper = noper_next; \
1213 if (!nextbranch) \
1214 nextbranch= regnext(cur); \
1215 } \
1216 \
1217 if ( dupe ) { \
1218 /* So it's a dupe. This means we need to maintain a */\
1219 /* linked-list from the first to the next. */\
1220 /* we only allocate the nextword buffer when there */\
1221 /* a dupe, so first time we have to do the allocation */\
1222 if (!trie->nextword) \
c944940b 1223 trie->nextword = (U16 *) \
446bd890 1224 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1225 while ( trie->nextword[dupe] ) \
1226 dupe= trie->nextword[dupe]; \
1227 trie->nextword[dupe]= curword; \
1228 } else { \
1229 /* we haven't inserted this word yet. */ \
1230 trie->states[ state ].wordnum = curword; \
1231 } \
1232} STMT_END
07be1b83 1233
3dab1dad 1234
786e8c11
YO
1235#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1236 ( ( base + charid >= ucharcount \
1237 && base + charid < ubound \
1238 && state == trie->trans[ base - ucharcount + charid ].check \
1239 && trie->trans[ base - ucharcount + charid ].next ) \
1240 ? trie->trans[ base - ucharcount + charid ].next \
1241 : ( state==1 ? special : 0 ) \
1242 )
3dab1dad 1243
786e8c11
YO
1244#define MADE_TRIE 1
1245#define MADE_JUMP_TRIE 2
1246#define MADE_EXACT_TRIE 4
3dab1dad 1247
a3621e74 1248STATIC I32
786e8c11 1249S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1250{
27da23d5 1251 dVAR;
a3621e74
YO
1252 /* first pass, loop through and scan words */
1253 reg_trie_data *trie;
55eed653 1254 HV *widecharmap = NULL;
2b8b4781 1255 AV *revcharmap = newAV();
a3621e74 1256 regnode *cur;
9f7f3913 1257 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1258 STRLEN len = 0;
1259 UV uvc = 0;
1260 U16 curword = 0;
1261 U32 next_alloc = 0;
786e8c11
YO
1262 regnode *jumper = NULL;
1263 regnode *nextbranch = NULL;
7f69552c 1264 regnode *convert = NULL;
a3621e74 1265 /* we just use folder as a flag in utf8 */
e1ec3a88 1266 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1267 ? PL_fold
1268 : ( flags == EXACTFL
1269 ? PL_fold_locale
1270 : NULL
1271 )
1272 );
1273
2b8b4781
NC
1274#ifdef DEBUGGING
1275 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1276 AV *trie_words = NULL;
1277 /* along with revcharmap, this only used during construction but both are
1278 * useful during debugging so we store them in the struct when debugging.
8e11feef 1279 */
2b8b4781
NC
1280#else
1281 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1282 STRLEN trie_charcount=0;
3dab1dad 1283#endif
2b8b4781 1284 SV *re_trie_maxbuff;
a3621e74 1285 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1286#ifndef DEBUGGING
1287 PERL_UNUSED_ARG(depth);
1288#endif
a3621e74 1289
c944940b 1290 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1291 trie->refcount = 1;
3dab1dad 1292 trie->startstate = 1;
786e8c11 1293 trie->wordcount = word_count;
f8fc2ecf 1294 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1295 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1296 if (!(UTF && folder))
c944940b 1297 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1298 DEBUG_r({
2b8b4781 1299 trie_words = newAV();
a3621e74 1300 });
a3621e74 1301
0111c4fd 1302 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1303 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1304 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1305 }
3dab1dad
YO
1306 DEBUG_OPTIMISE_r({
1307 PerlIO_printf( Perl_debug_log,
786e8c11 1308 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1309 (int)depth * 2 + 2, "",
1310 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1311 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1312 (int)depth);
3dab1dad 1313 });
7f69552c
YO
1314
1315 /* Find the node we are going to overwrite */
1316 if ( first == startbranch && OP( last ) != BRANCH ) {
1317 /* whole branch chain */
1318 convert = first;
1319 } else {
1320 /* branch sub-chain */
1321 convert = NEXTOPER( first );
1322 }
1323
a3621e74
YO
1324 /* -- First loop and Setup --
1325
1326 We first traverse the branches and scan each word to determine if it
1327 contains widechars, and how many unique chars there are, this is
1328 important as we have to build a table with at least as many columns as we
1329 have unique chars.
1330
1331 We use an array of integers to represent the character codes 0..255
1332 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1333 native representation of the character value as the key and IV's for the
1334 coded index.
1335
1336 *TODO* If we keep track of how many times each character is used we can
1337 remap the columns so that the table compression later on is more
1338 efficient in terms of memory by ensuring most common value is in the
1339 middle and the least common are on the outside. IMO this would be better
1340 than a most to least common mapping as theres a decent chance the most
1341 common letter will share a node with the least common, meaning the node
1342 will not be compressable. With a middle is most common approach the worst
1343 case is when we have the least common nodes twice.
1344
1345 */
1346
a3621e74 1347 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1348 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1349 const U8 *uc = (U8*)STRING( noper );
a28509cc 1350 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1351 STRLEN foldlen = 0;
1352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1353 const U8 *scan = (U8*)NULL;
07be1b83 1354 U32 wordlen = 0; /* required init */
3dab1dad 1355 STRLEN chars=0;
a3621e74 1356
3dab1dad
YO
1357 if (OP(noper) == NOTHING) {
1358 trie->minlen= 0;
1359 continue;
1360 }
1361 if (trie->bitmap) {
1362 TRIE_BITMAP_SET(trie,*uc);
1363 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1364 }
a3621e74 1365 for ( ; uc < e ; uc += len ) {
3dab1dad 1366 TRIE_CHARCOUNT(trie)++;
a3621e74 1367 TRIE_READ_CHAR;
3dab1dad 1368 chars++;
a3621e74
YO
1369 if ( uvc < 256 ) {
1370 if ( !trie->charmap[ uvc ] ) {
1371 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1372 if ( folder )
1373 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1374 TRIE_STORE_REVCHAR;
a3621e74
YO
1375 }
1376 } else {
1377 SV** svpp;
55eed653
NC
1378 if ( !widecharmap )
1379 widecharmap = newHV();
a3621e74 1380
55eed653 1381 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1382
1383 if ( !svpp )
e4584336 1384 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1385
1386 if ( !SvTRUE( *svpp ) ) {
1387 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1388 TRIE_STORE_REVCHAR;
a3621e74
YO
1389 }
1390 }
1391 }
3dab1dad
YO
1392 if( cur == first ) {
1393 trie->minlen=chars;
1394 trie->maxlen=chars;
1395 } else if (chars < trie->minlen) {
1396 trie->minlen=chars;
1397 } else if (chars > trie->maxlen) {
1398 trie->maxlen=chars;
1399 }
1400
a3621e74
YO
1401 } /* end first pass */
1402 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1403 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1404 (int)depth * 2 + 2,"",
55eed653 1405 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1406 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1407 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1408 );
c944940b 1409 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1410
1411 /*
1412 We now know what we are dealing with in terms of unique chars and
1413 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1414 representation using a flat table will take. If it's over a reasonable
1415 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1416 conservative but potentially much slower representation using an array
1417 of lists.
1418
1419 At the end we convert both representations into the same compressed
1420 form that will be used in regexec.c for matching with. The latter
1421 is a form that cannot be used to construct with but has memory
1422 properties similar to the list form and access properties similar
1423 to the table form making it both suitable for fast searches and
1424 small enough that its feasable to store for the duration of a program.
1425
1426 See the comment in the code where the compressed table is produced
1427 inplace from the flat tabe representation for an explanation of how
1428 the compression works.
1429
1430 */
1431
1432
3dab1dad 1433 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1434 /*
1435 Second Pass -- Array Of Lists Representation
1436
1437 Each state will be represented by a list of charid:state records
1438 (reg_trie_trans_le) the first such element holds the CUR and LEN
1439 points of the allocated array. (See defines above).
1440
1441 We build the initial structure using the lists, and then convert
1442 it into the compressed table form which allows faster lookups
1443 (but cant be modified once converted).
a3621e74
YO
1444 */
1445
a3621e74
YO
1446 STRLEN transcount = 1;
1447
1e2e3d02
YO
1448 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1449 "%*sCompiling trie using list compiler\n",
1450 (int)depth * 2 + 2, ""));
446bd890 1451
c944940b
JH
1452 trie->states = (reg_trie_state *)
1453 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1454 sizeof(reg_trie_state) );
a3621e74
YO
1455 TRIE_LIST_NEW(1);
1456 next_alloc = 2;
1457
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1459
c445ea15
AL
1460 regnode * const noper = NEXTOPER( cur );
1461 U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
1463 U32 state = 1; /* required init */
1464 U16 charid = 0; /* sanity init */
1465 U8 *scan = (U8*)NULL; /* sanity init */
1466 STRLEN foldlen = 0; /* required init */
07be1b83 1467 U32 wordlen = 0; /* required init */
c445ea15
AL
1468 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1469
3dab1dad 1470 if (OP(noper) != NOTHING) {
786e8c11 1471 for ( ; uc < e ; uc += len ) {
c445ea15 1472
786e8c11 1473 TRIE_READ_CHAR;
c445ea15 1474
786e8c11
YO
1475 if ( uvc < 256 ) {
1476 charid = trie->charmap[ uvc ];
c445ea15 1477 } else {
55eed653 1478 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1479 if ( !svpp ) {
1480 charid = 0;
1481 } else {
1482 charid=(U16)SvIV( *svpp );
1483 }
c445ea15 1484 }
786e8c11
YO
1485 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1486 if ( charid ) {
a3621e74 1487
786e8c11
YO
1488 U16 check;
1489 U32 newstate = 0;
a3621e74 1490
786e8c11
YO
1491 charid--;
1492 if ( !trie->states[ state ].trans.list ) {
1493 TRIE_LIST_NEW( state );
c445ea15 1494 }
786e8c11
YO
1495 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1496 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1497 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1498 break;
1499 }
1500 }
1501 if ( ! newstate ) {
1502 newstate = next_alloc++;
1503 TRIE_LIST_PUSH( state, charid, newstate );
1504 transcount++;
1505 }
1506 state = newstate;
1507 } else {
1508 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1509 }
a28509cc 1510 }
c445ea15 1511 }
3dab1dad 1512 TRIE_HANDLE_WORD(state);
a3621e74
YO
1513
1514 } /* end second pass */
1515
1e2e3d02
YO
1516 /* next alloc is the NEXT state to be allocated */
1517 trie->statecount = next_alloc;
c944940b
JH
1518 trie->states = (reg_trie_state *)
1519 PerlMemShared_realloc( trie->states,
1520 next_alloc
1521 * sizeof(reg_trie_state) );
a3621e74 1522
3dab1dad 1523 /* and now dump it out before we compress it */
2b8b4781
NC
1524 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1525 revcharmap, next_alloc,
1526 depth+1)
1e2e3d02 1527 );
a3621e74 1528
c944940b
JH
1529 trie->trans = (reg_trie_trans *)
1530 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1531 {
1532 U32 state;
a3621e74
YO
1533 U32 tp = 0;
1534 U32 zp = 0;
1535
1536
1537 for( state=1 ; state < next_alloc ; state ++ ) {
1538 U32 base=0;
1539
1540 /*
1541 DEBUG_TRIE_COMPILE_MORE_r(
1542 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1543 );
1544 */
1545
1546 if (trie->states[state].trans.list) {
1547 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1548 U16 maxid=minid;
a28509cc 1549 U16 idx;
a3621e74
YO
1550
1551 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1552 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1553 if ( forid < minid ) {
1554 minid=forid;
1555 } else if ( forid > maxid ) {
1556 maxid=forid;
1557 }
a3621e74
YO
1558 }
1559 if ( transcount < tp + maxid - minid + 1) {
1560 transcount *= 2;
c944940b
JH
1561 trie->trans = (reg_trie_trans *)
1562 PerlMemShared_realloc( trie->trans,
446bd890
NC
1563 transcount
1564 * sizeof(reg_trie_trans) );
a3621e74
YO
1565 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1566 }
1567 base = trie->uniquecharcount + tp - minid;
1568 if ( maxid == minid ) {
1569 U32 set = 0;
1570 for ( ; zp < tp ; zp++ ) {
1571 if ( ! trie->trans[ zp ].next ) {
1572 base = trie->uniquecharcount + zp - minid;
1573 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1574 trie->trans[ zp ].check = state;
1575 set = 1;
1576 break;
1577 }
1578 }
1579 if ( !set ) {
1580 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1581 trie->trans[ tp ].check = state;
1582 tp++;
1583 zp = tp;
1584 }
1585 } else {
1586 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1587 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1588 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1589 trie->trans[ tid ].check = state;
1590 }
1591 tp += ( maxid - minid + 1 );
1592 }
1593 Safefree(trie->states[ state ].trans.list);
1594 }
1595 /*
1596 DEBUG_TRIE_COMPILE_MORE_r(
1597 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1598 );
1599 */
1600 trie->states[ state ].trans.base=base;
1601 }
cc601c31 1602 trie->lasttrans = tp + 1;
a3621e74
YO
1603 }
1604 } else {
1605 /*
1606 Second Pass -- Flat Table Representation.
1607
1608 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1609 We know that we will need Charcount+1 trans at most to store the data
1610 (one row per char at worst case) So we preallocate both structures
1611 assuming worst case.
1612
1613 We then construct the trie using only the .next slots of the entry
1614 structs.
1615
1616 We use the .check field of the first entry of the node temporarily to
1617 make compression both faster and easier by keeping track of how many non
1618 zero fields are in the node.
1619
1620 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1621 transition.
1622
1623 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1624 number representing the first entry of the node, and state as a
1625 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1626 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1627 are 2 entrys per node. eg:
1628
1629 A B A B
1630 1. 2 4 1. 3 7
1631 2. 0 3 3. 0 5
1632 3. 0 0 5. 0 0
1633 4. 0 0 7. 0 0
1634
1635 The table is internally in the right hand, idx form. However as we also
1636 have to deal with the states array which is indexed by nodenum we have to
1637 use TRIE_NODENUM() to convert.
1638
1639 */
1e2e3d02
YO
1640 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1641 "%*sCompiling trie using table compiler\n",
1642 (int)depth * 2 + 2, ""));
3dab1dad 1643
c944940b
JH
1644 trie->trans = (reg_trie_trans *)
1645 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1646 * trie->uniquecharcount + 1,
1647 sizeof(reg_trie_trans) );
1648 trie->states = (reg_trie_state *)
1649 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1650 sizeof(reg_trie_state) );
a3621e74
YO
1651 next_alloc = trie->uniquecharcount + 1;
1652
3dab1dad 1653
a3621e74
YO
1654 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1655
c445ea15 1656 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1657 const U8 *uc = (U8*)STRING( noper );
1658 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1659
1660 U32 state = 1; /* required init */
1661
1662 U16 charid = 0; /* sanity init */
1663 U32 accept_state = 0; /* sanity init */
1664 U8 *scan = (U8*)NULL; /* sanity init */
1665
1666 STRLEN foldlen = 0; /* required init */
07be1b83 1667 U32 wordlen = 0; /* required init */
a3621e74
YO
1668 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1669
3dab1dad 1670 if ( OP(noper) != NOTHING ) {
786e8c11 1671 for ( ; uc < e ; uc += len ) {
a3621e74 1672
786e8c11 1673 TRIE_READ_CHAR;
a3621e74 1674
786e8c11
YO
1675 if ( uvc < 256 ) {
1676 charid = trie->charmap[ uvc ];
1677 } else {
55eed653 1678 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1679 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1680 }
786e8c11
YO
1681 if ( charid ) {
1682 charid--;
1683 if ( !trie->trans[ state + charid ].next ) {
1684 trie->trans[ state + charid ].next = next_alloc;
1685 trie->trans[ state ].check++;
1686 next_alloc += trie->uniquecharcount;
1687 }
1688 state = trie->trans[ state + charid ].next;
1689 } else {
1690 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1691 }
1692 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1693 }
a3621e74 1694 }
3dab1dad
YO
1695 accept_state = TRIE_NODENUM( state );
1696 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1697
1698 } /* end second pass */
1699
3dab1dad 1700 /* and now dump it out before we compress it */
2b8b4781
NC
1701 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1702 revcharmap,
1703 next_alloc, depth+1));
a3621e74 1704
a3621e74
YO
1705 {
1706 /*
1707 * Inplace compress the table.*
1708
1709 For sparse data sets the table constructed by the trie algorithm will
1710 be mostly 0/FAIL transitions or to put it another way mostly empty.
1711 (Note that leaf nodes will not contain any transitions.)
1712
1713 This algorithm compresses the tables by eliminating most such
1714 transitions, at the cost of a modest bit of extra work during lookup:
1715
1716 - Each states[] entry contains a .base field which indicates the
1717 index in the state[] array wheres its transition data is stored.
1718
1719 - If .base is 0 there are no valid transitions from that node.
1720
1721 - If .base is nonzero then charid is added to it to find an entry in
1722 the trans array.
1723
1724 -If trans[states[state].base+charid].check!=state then the
1725 transition is taken to be a 0/Fail transition. Thus if there are fail
1726 transitions at the front of the node then the .base offset will point
1727 somewhere inside the previous nodes data (or maybe even into a node
1728 even earlier), but the .check field determines if the transition is
1729 valid.
1730
786e8c11 1731 XXX - wrong maybe?
a3621e74
YO
1732 The following process inplace converts the table to the compressed
1733 table: We first do not compress the root node 1,and mark its all its
1734 .check pointers as 1 and set its .base pointer as 1 as well. This
1735 allows to do a DFA construction from the compressed table later, and
1736 ensures that any .base pointers we calculate later are greater than
1737 0.
1738
1739 - We set 'pos' to indicate the first entry of the second node.
1740
1741 - We then iterate over the columns of the node, finding the first and
1742 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1743 and set the .check pointers accordingly, and advance pos
1744 appropriately and repreat for the next node. Note that when we copy
1745 the next pointers we have to convert them from the original
1746 NODEIDX form to NODENUM form as the former is not valid post
1747 compression.
1748
1749 - If a node has no transitions used we mark its base as 0 and do not
1750 advance the pos pointer.
1751
1752 - If a node only has one transition we use a second pointer into the
1753 structure to fill in allocated fail transitions from other states.
1754 This pointer is independent of the main pointer and scans forward
1755 looking for null transitions that are allocated to a state. When it
1756 finds one it writes the single transition into the "hole". If the
786e8c11 1757 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1758
1759 - Once compressed we can Renew/realloc the structures to release the
1760 excess space.
1761
1762 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1763 specifically Fig 3.47 and the associated pseudocode.
1764
1765 demq
1766 */
a3b680e6 1767 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1768 U32 state, charid;
a3621e74 1769 U32 pos = 0, zp=0;
1e2e3d02 1770 trie->statecount = laststate;
a3621e74
YO
1771
1772 for ( state = 1 ; state < laststate ; state++ ) {
1773 U8 flag = 0;
a28509cc
AL
1774 const U32 stateidx = TRIE_NODEIDX( state );
1775 const U32 o_used = trie->trans[ stateidx ].check;
1776 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1777 trie->trans[ stateidx ].check = 0;
1778
1779 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1780 if ( flag || trie->trans[ stateidx + charid ].next ) {
1781 if ( trie->trans[ stateidx + charid ].next ) {
1782 if (o_used == 1) {
1783 for ( ; zp < pos ; zp++ ) {
1784 if ( ! trie->trans[ zp ].next ) {
1785 break;
1786 }
1787 }
1788 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1789 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ zp ].check = state;
1791 if ( ++zp > pos ) pos = zp;
1792 break;
1793 }
1794 used--;
1795 }
1796 if ( !flag ) {
1797 flag = 1;
1798 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1799 }
1800 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1801 trie->trans[ pos ].check = state;
1802 pos++;
1803 }
1804 }
1805 }
cc601c31 1806 trie->lasttrans = pos + 1;
c944940b
JH
1807 trie->states = (reg_trie_state *)
1808 PerlMemShared_realloc( trie->states, laststate
1809 * sizeof(reg_trie_state) );
a3621e74 1810 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1811 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1812 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1813 (int)depth * 2 + 2,"",
1814 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1815 (IV)next_alloc,
1816 (IV)pos,
a3621e74
YO
1817 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1818 );
1819
1820 } /* end table compress */
1821 }
1e2e3d02
YO
1822 DEBUG_TRIE_COMPILE_MORE_r(
1823 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1824 (int)depth * 2 + 2, "",
1825 (UV)trie->statecount,
1826 (UV)trie->lasttrans)
1827 );
cc601c31 1828 /* resize the trans array to remove unused space */
c944940b
JH
1829 trie->trans = (reg_trie_trans *)
1830 PerlMemShared_realloc( trie->trans, trie->lasttrans
1831 * sizeof(reg_trie_trans) );
a3621e74 1832
3dab1dad 1833 /* and now dump out the compressed format */
2b8b4781 1834 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1835
3dab1dad 1836 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1837 U8 nodetype =(U8)(flags & 0xFF);
1838 char *str=NULL;
786e8c11 1839
07be1b83 1840#ifdef DEBUGGING
e62cc96a 1841 regnode *optimize = NULL;
b57a0404
JH
1842 U32 mjd_offset = 0;
1843 U32 mjd_nodelen = 0;
07be1b83 1844#endif
a3621e74 1845 /*
3dab1dad
YO
1846 This means we convert either the first branch or the first Exact,
1847 depending on whether the thing following (in 'last') is a branch
1848 or not and whther first is the startbranch (ie is it a sub part of
1849 the alternation or is it the whole thing.)
1850 Assuming its a sub part we conver the EXACT otherwise we convert
1851 the whole branch sequence, including the first.
a3621e74 1852 */
3dab1dad 1853 /* Find the node we are going to overwrite */
7f69552c 1854 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1855 /* branch sub-chain */
3dab1dad 1856 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1857 DEBUG_r({
1858 mjd_offset= Node_Offset((convert));
1859 mjd_nodelen= Node_Length((convert));
1860 });
7f69552c
YO
1861 /* whole branch chain */
1862 } else {
1863 DEBUG_r({
1864 const regnode *nop = NEXTOPER( convert );
1865 mjd_offset= Node_Offset((nop));
1866 mjd_nodelen= Node_Length((nop));
1867 });
07be1b83 1868 }
7f69552c 1869
07be1b83
YO
1870 DEBUG_OPTIMISE_r(
1871 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1872 (int)depth * 2 + 2, "",
786e8c11 1873 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1874 );
a3621e74 1875
3dab1dad
YO
1876 /* But first we check to see if there is a common prefix we can
1877 split out as an EXACT and put in front of the TRIE node. */
1878 trie->startstate= 1;
55eed653 1879 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1880 U32 state;
1e2e3d02 1881 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1882 U32 ofs = 0;
8e11feef
RGS
1883 I32 idx = -1;
1884 U32 count = 0;
1885 const U32 base = trie->states[ state ].trans.base;
a3621e74 1886
3dab1dad 1887 if ( trie->states[state].wordnum )
8e11feef 1888 count = 1;
a3621e74 1889
8e11feef 1890 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1891 if ( ( base + ofs >= trie->uniquecharcount ) &&
1892 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1893 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1894 {
3dab1dad 1895 if ( ++count > 1 ) {
2b8b4781 1896 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1897 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898 if ( state == 1 ) break;
3dab1dad
YO
1899 if ( count == 2 ) {
1900 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1901 DEBUG_OPTIMISE_r(
8e11feef
RGS
1902 PerlIO_printf(Perl_debug_log,
1903 "%*sNew Start State=%"UVuf" Class: [",
1904 (int)depth * 2 + 2, "",
786e8c11 1905 (UV)state));
be8e71aa 1906 if (idx >= 0) {
2b8b4781 1907 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1908 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1909
3dab1dad 1910 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1911 if ( folder )
1912 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1913 DEBUG_OPTIMISE_r(
07be1b83 1914 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1915 );
8e11feef
RGS
1916 }
1917 }
1918 TRIE_BITMAP_SET(trie,*ch);
1919 if ( folder )
1920 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1921 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1922 }
1923 idx = ofs;
1924 }
3dab1dad
YO
1925 }
1926 if ( count == 1 ) {
2b8b4781 1927 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1928 char *ch = SvPV_nolen( *tmp );
1929 DEBUG_OPTIMISE_r({
1930 SV *sv=sv_newmortal();
8e11feef
RGS
1931 PerlIO_printf( Perl_debug_log,
1932 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1933 (int)depth * 2 + 2, "",
de734bd5
A
1934 (UV)state, (UV)idx,
1935 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1936 PL_colors[0], PL_colors[1],
1937 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1938 PERL_PV_ESCAPE_FIRSTCHAR
1939 )
1940 );
1941 });
3dab1dad
YO
1942 if ( state==1 ) {
1943 OP( convert ) = nodetype;
1944 str=STRING(convert);
1945 STR_LEN(convert)=0;
1946 }
de734bd5
A
1947 while (*ch) {
1948 *str++ = *ch++;
1949 STR_LEN(convert)++;
1950 }
1951
8e11feef 1952 } else {
f9049ba1 1953#ifdef DEBUGGING
8e11feef
RGS
1954 if (state>1)
1955 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1956#endif
8e11feef
RGS
1957 break;
1958 }
1959 }
3dab1dad 1960 if (str) {
8e11feef 1961 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1962 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1963 trie->startstate = state;
07be1b83
YO
1964 trie->minlen -= (state - 1);
1965 trie->maxlen -= (state - 1);
1966 DEBUG_r({
1967 regnode *fix = convert;
de734bd5 1968 U32 word = trie->wordcount;
07be1b83
YO
1969 mjd_nodelen++;
1970 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1971 while( ++fix < n ) {
1972 Set_Node_Offset_Length(fix, 0, 0);
1973 }
de734bd5 1974 while (word--) {
2b8b4781 1975 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1976 if (tmp) {
1977 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1978 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1979 else
1980 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1981 }
1982 }
07be1b83 1983 });
8e11feef
RGS
1984 if (trie->maxlen) {
1985 convert = n;
1986 } else {
3dab1dad 1987 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1988 DEBUG_r(optimize= n);
3dab1dad
YO
1989 }
1990 }
1991 }
a5ca303d
YO
1992 if (!jumper)
1993 jumper = last;
3dab1dad 1994 if ( trie->maxlen ) {
8e11feef
RGS
1995 NEXT_OFF( convert ) = (U16)(tail - convert);
1996 ARG_SET( convert, data_slot );
786e8c11
YO
1997 /* Store the offset to the first unabsorbed branch in
1998 jump[0], which is otherwise unused by the jump logic.
1999 We use this when dumping a trie and during optimisation. */
2000 if (trie->jump)
7f69552c 2001 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2002
786e8c11
YO
2003 /* XXXX */
2004 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2005 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2006 {
2007 OP( convert ) = TRIEC;
2008 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2009 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2010 trie->bitmap= NULL;
2011 } else
2012 OP( convert ) = TRIE;
a3621e74 2013
3dab1dad
YO
2014 /* store the type in the flags */
2015 convert->flags = nodetype;
a5ca303d
YO
2016 DEBUG_r({
2017 optimize = convert
2018 + NODE_STEP_REGNODE
2019 + regarglen[ OP( convert ) ];
2020 });
2021 /* XXX We really should free up the resource in trie now,
2022 as we won't use them - (which resources?) dmq */
3dab1dad 2023 }
a3621e74 2024 /* needed for dumping*/
e62cc96a 2025 DEBUG_r(if (optimize) {
07be1b83 2026 regnode *opt = convert;
e62cc96a 2027 while ( ++opt < optimize) {
07be1b83
YO
2028 Set_Node_Offset_Length(opt,0,0);
2029 }
786e8c11
YO
2030 /*
2031 Try to clean up some of the debris left after the
2032 optimisation.
a3621e74 2033 */
786e8c11 2034 while( optimize < jumper ) {
07be1b83 2035 mjd_nodelen += Node_Length((optimize));
a3621e74 2036 OP( optimize ) = OPTIMIZED;
07be1b83 2037 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2038 optimize++;
2039 }
07be1b83 2040 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2041 });
2042 } /* end node insert */
55eed653 2043 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2044#ifdef DEBUGGING
2045 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2046 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2047#else
2048 SvREFCNT_dec(revcharmap);
07be1b83 2049#endif
786e8c11
YO
2050 return trie->jump
2051 ? MADE_JUMP_TRIE
2052 : trie->startstate>1
2053 ? MADE_EXACT_TRIE
2054 : MADE_TRIE;
2055}
2056
2057STATIC void
2058S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2059{
2060/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2061
2062 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2063 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2064 ISBN 0-201-10088-6
2065
2066 We find the fail state for each state in the trie, this state is the longest proper
2067 suffix of the current states 'word' that is also a proper prefix of another word in our
2068 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2069 the DFA not to have to restart after its tried and failed a word at a given point, it
2070 simply continues as though it had been matching the other word in the first place.
2071 Consider
2072 'abcdgu'=~/abcdefg|cdgu/
2073 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2074 fail, which would bring use to the state representing 'd' in the second word where we would
2075 try 'g' and succeed, prodceding to match 'cdgu'.
2076 */
2077 /* add a fail transition */
3251b653
NC
2078 const U32 trie_offset = ARG(source);
2079 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2080 U32 *q;
2081 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2082 const U32 numstates = trie->statecount;
786e8c11
YO
2083 const U32 ubound = trie->lasttrans + ucharcount;
2084 U32 q_read = 0;
2085 U32 q_write = 0;
2086 U32 charid;
2087 U32 base = trie->states[ 1 ].trans.base;
2088 U32 *fail;
2089 reg_ac_data *aho;
2090 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2091 GET_RE_DEBUG_FLAGS_DECL;
2092#ifndef DEBUGGING
2093 PERL_UNUSED_ARG(depth);
2094#endif
2095
2096
2097 ARG_SET( stclass, data_slot );
c944940b 2098 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2099 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2100 aho->trie=trie_offset;
446bd890
NC
2101 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2102 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2103 Newxz( q, numstates, U32);
c944940b 2104 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2105 aho->refcount = 1;
2106 fail = aho->fail;
2107 /* initialize fail[0..1] to be 1 so that we always have
2108 a valid final fail state */
2109 fail[ 0 ] = fail[ 1 ] = 1;
2110
2111 for ( charid = 0; charid < ucharcount ; charid++ ) {
2112 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2113 if ( newstate ) {
2114 q[ q_write ] = newstate;
2115 /* set to point at the root */
2116 fail[ q[ q_write++ ] ]=1;
2117 }
2118 }
2119 while ( q_read < q_write) {
2120 const U32 cur = q[ q_read++ % numstates ];
2121 base = trie->states[ cur ].trans.base;
2122
2123 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2124 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2125 if (ch_state) {
2126 U32 fail_state = cur;
2127 U32 fail_base;
2128 do {
2129 fail_state = fail[ fail_state ];
2130 fail_base = aho->states[ fail_state ].trans.base;
2131 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2132
2133 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2134 fail[ ch_state ] = fail_state;
2135 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2136 {
2137 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2138 }
2139 q[ q_write++ % numstates] = ch_state;
2140 }
2141 }
2142 }
2143 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2144 when we fail in state 1, this allows us to use the
2145 charclass scan to find a valid start char. This is based on the principle
2146 that theres a good chance the string being searched contains lots of stuff
2147 that cant be a start char.
2148 */
2149 fail[ 0 ] = fail[ 1 ] = 0;
2150 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2151 PerlIO_printf(Perl_debug_log,
2152 "%*sStclass Failtable (%"UVuf" states): 0",
2153 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2154 );
786e8c11
YO
2155 for( q_read=1; q_read<numstates; q_read++ ) {
2156 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2157 }
2158 PerlIO_printf(Perl_debug_log, "\n");
2159 });
2160 Safefree(q);
2161 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2162}
2163
786e8c11 2164
a3621e74 2165/*
5d1c421c
JH
2166 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2167 * These need to be revisited when a newer toolchain becomes available.
2168 */
2169#if defined(__sparc64__) && defined(__GNUC__)
2170# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2171# undef SPARC64_GCC_WORKAROUND
2172# define SPARC64_GCC_WORKAROUND 1
2173# endif
2174#endif
2175
07be1b83 2176#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2177 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2178 SV * const mysv=sv_newmortal(); \
2179 regnode *Next = regnext(scan); \
2180 regprop(RExC_rx, mysv, scan); \
7f69552c 2181 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2182 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2183 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2184 }});
07be1b83 2185
1de06328
YO
2186
2187
2188
2189
07be1b83
YO
2190#define JOIN_EXACT(scan,min,flags) \
2191 if (PL_regkind[OP(scan)] == EXACT) \
2192 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2193
be8e71aa 2194STATIC U32
07be1b83
YO
2195S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2196 /* Merge several consecutive EXACTish nodes into one. */
2197 regnode *n = regnext(scan);
2198 U32 stringok = 1;
2199 regnode *next = scan + NODE_SZ_STR(scan);
2200 U32 merged = 0;
2201 U32 stopnow = 0;
2202#ifdef DEBUGGING
2203 regnode *stop = scan;
72f13be8 2204 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2205#else
d47053eb
RGS
2206 PERL_UNUSED_ARG(depth);
2207#endif
2208#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2209 PERL_UNUSED_ARG(flags);
2210 PERL_UNUSED_ARG(val);
07be1b83 2211#endif
07be1b83
YO
2212 DEBUG_PEEP("join",scan,depth);
2213
2214 /* Skip NOTHING, merge EXACT*. */
2215 while (n &&
2216 ( PL_regkind[OP(n)] == NOTHING ||
2217 (stringok && (OP(n) == OP(scan))))
2218 && NEXT_OFF(n)
2219 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2220
2221 if (OP(n) == TAIL || n > next)
2222 stringok = 0;
2223 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2224 DEBUG_PEEP("skip:",n,depth);
2225 NEXT_OFF(scan) += NEXT_OFF(n);
2226 next = n + NODE_STEP_REGNODE;
2227#ifdef DEBUGGING
2228 if (stringok)
2229 stop = n;
2230#endif
2231 n = regnext(n);
2232 }
2233 else if (stringok) {
786e8c11 2234 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2235 regnode * const nnext = regnext(n);
2236
2237 DEBUG_PEEP("merg",n,depth);
2238
2239 merged++;
2240 if (oldl + STR_LEN(n) > U8_MAX)
2241 break;
2242 NEXT_OFF(scan) += NEXT_OFF(n);
2243 STR_LEN(scan) += STR_LEN(n);
2244 next = n + NODE_SZ_STR(n);
2245 /* Now we can overwrite *n : */
2246 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2247#ifdef DEBUGGING
2248 stop = next - 1;
2249#endif
2250 n = nnext;
2251 if (stopnow) break;
2252 }
2253
d47053eb
RGS
2254#ifdef EXPERIMENTAL_INPLACESCAN
2255 if (flags && !NEXT_OFF(n)) {
2256 DEBUG_PEEP("atch", val, depth);
2257 if (reg_off_by_arg[OP(n)]) {
2258 ARG_SET(n, val - n);
2259 }
2260 else {
2261 NEXT_OFF(n) = val - n;
2262 }
2263 stopnow = 1;
2264 }
07be1b83
YO
2265#endif
2266 }
2267
2268 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2269 /*
2270 Two problematic code points in Unicode casefolding of EXACT nodes:
2271
2272 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2273 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2274
2275 which casefold to
2276
2277 Unicode UTF-8
2278
2279 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2280 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2281
2282 This means that in case-insensitive matching (or "loose matching",
2283 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2284 length of the above casefolded versions) can match a target string
2285 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2286 This would rather mess up the minimum length computation.
2287
2288 What we'll do is to look for the tail four bytes, and then peek
2289 at the preceding two bytes to see whether we need to decrease
2290 the minimum length by four (six minus two).
2291
2292 Thanks to the design of UTF-8, there cannot be false matches:
2293 A sequence of valid UTF-8 bytes cannot be a subsequence of
2294 another valid sequence of UTF-8 bytes.
2295
2296 */
2297 char * const s0 = STRING(scan), *s, *t;
2298 char * const s1 = s0 + STR_LEN(scan) - 1;
2299 char * const s2 = s1 - 4;
e294cc5d
JH
2300#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2301 const char t0[] = "\xaf\x49\xaf\x42";
2302#else
07be1b83 2303 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2304#endif
07be1b83
YO
2305 const char * const t1 = t0 + 3;
2306
2307 for (s = s0 + 2;
2308 s < s2 && (t = ninstr(s, s1, t0, t1));
2309 s = t + 4) {
e294cc5d
JH
2310#ifdef EBCDIC
2311 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2312 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2313#else
07be1b83
YO
2314 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2315 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2316#endif
07be1b83
YO
2317 *min -= 4;
2318 }
2319 }
2320
2321#ifdef DEBUGGING
2322 /* Allow dumping */
2323 n = scan + NODE_SZ_STR(scan);
2324 while (n <= stop) {
2325 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2326 OP(n) = OPTIMIZED;
2327 NEXT_OFF(n) = 0;
2328 }
2329 n++;
2330 }
2331#endif
2332 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2333 return stopnow;
2334}
2335
653099ff
GS
2336/* REx optimizer. Converts nodes into quickier variants "in place".
2337 Finds fixed substrings. */
2338
a0288114 2339/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2340 to the position after last scanned or to NULL. */
2341
40d049e4
YO
2342#define INIT_AND_WITHP \
2343 assert(!and_withp); \
2344 Newx(and_withp,1,struct regnode_charclass_class); \
2345 SAVEFREEPV(and_withp)
07be1b83 2346
b515a41d
YO
2347/* this is a chain of data about sub patterns we are processing that
2348 need to be handled seperately/specially in study_chunk. Its so
2349 we can simulate recursion without losing state. */
2350struct scan_frame;
2351typedef struct scan_frame {
2352 regnode *last; /* last node to process in this frame */
2353 regnode *next; /* next node to process when last is reached */
2354 struct scan_frame *prev; /*previous frame*/
2355 I32 stop; /* what stopparen do we use */
2356} scan_frame;
2357
304ee84b
YO
2358
2359#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2360
76e3520e 2361STATIC I32
40d049e4 2362S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2363 I32 *minlenp, I32 *deltap,
40d049e4
YO
2364 regnode *last,
2365 scan_data_t *data,
2366 I32 stopparen,
2367 U8* recursed,
2368 struct regnode_charclass_class *and_withp,
2369 U32 flags, U32 depth)
c277df42
IZ
2370 /* scanp: Start here (read-write). */
2371 /* deltap: Write maxlen-minlen here. */
2372 /* last: Stop before this one. */
40d049e4
YO
2373 /* data: string data about the pattern */
2374 /* stopparen: treat close N as END */
2375 /* recursed: which subroutines have we recursed into */
2376 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2377{
97aff369 2378 dVAR;
c277df42
IZ
2379 I32 min = 0, pars = 0, code;
2380 regnode *scan = *scanp, *next;
2381 I32 delta = 0;
2382 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2383 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2384 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2385 scan_data_t data_fake;
a3621e74 2386 SV *re_trie_maxbuff = NULL;
786e8c11 2387 regnode *first_non_open = scan;
e2e6a0f1 2388 I32 stopmin = I32_MAX;
8aa23a47
YO
2389 scan_frame *frame = NULL;
2390
a3621e74 2391 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2392
13a24bad 2393#ifdef DEBUGGING
40d049e4 2394 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2395#endif
40d049e4 2396
786e8c11 2397 if ( depth == 0 ) {
40d049e4 2398 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2399 first_non_open=regnext(first_non_open);
2400 }
2401
b81d288d 2402
8aa23a47
YO
2403 fake_study_recurse:
2404 while ( scan && OP(scan) != END && scan < last ){
2405 /* Peephole optimizer: */
304ee84b 2406 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2407 DEBUG_PEEP("Peep",scan,depth);
2408 JOIN_EXACT(scan,&min,0);
2409
2410 /* Follow the next-chain of the current node and optimize
2411 away all the NOTHINGs from it. */
2412 if (OP(scan) != CURLYX) {
2413 const int max = (reg_off_by_arg[OP(scan)]
2414 ? I32_MAX
2415 /* I32 may be smaller than U16 on CRAYs! */
2416 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2417 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2418 int noff;
2419 regnode *n = scan;
2420
2421 /* Skip NOTHING and LONGJMP. */
2422 while ((n = regnext(n))
2423 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2424 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2425 && off + noff < max)
2426 off += noff;
2427 if (reg_off_by_arg[OP(scan)])
2428 ARG(scan) = off;
2429 else
2430 NEXT_OFF(scan) = off;
2431 }
a3621e74 2432
c277df42 2433
8aa23a47
YO
2434
2435 /* The principal pseudo-switch. Cannot be a switch, since we
2436 look into several different things. */
2437 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2438 || OP(scan) == IFTHEN) {
2439 next = regnext(scan);
2440 code = OP(scan);
2441 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2442
2443 if (OP(next) == code || code == IFTHEN) {
2444 /* NOTE - There is similar code to this block below for handling
2445 TRIE nodes on a re-study. If you change stuff here check there
2446 too. */
2447 I32 max1 = 0, min1 = I32_MAX, num = 0;
2448 struct regnode_charclass_class accum;
2449 regnode * const startbranch=scan;
2450
2451 if (flags & SCF_DO_SUBSTR)
304ee84b 2452 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2453 if (flags & SCF_DO_STCLASS)
2454 cl_init_zero(pRExC_state, &accum);
2455
2456 while (OP(scan) == code) {
2457 I32 deltanext, minnext, f = 0, fake;
2458 struct regnode_charclass_class this_class;
2459
2460 num++;
2461 data_fake.flags = 0;
2462 if (data) {
2463 data_fake.whilem_c = data->whilem_c;
2464 data_fake.last_closep = data->last_closep;
2465 }
2466 else
2467 data_fake.last_closep = &fake;
58e23c8d
YO
2468
2469 data_fake.pos_delta = delta;
8aa23a47
YO
2470 next = regnext(scan);
2471 scan = NEXTOPER(scan);
2472 if (code != BRANCH)
c277df42 2473 scan = NEXTOPER(scan);
8aa23a47
YO
2474 if (flags & SCF_DO_STCLASS) {
2475 cl_init(pRExC_state, &this_class);
2476 data_fake.start_class = &this_class;
2477 f = SCF_DO_STCLASS_AND;
58e23c8d 2478 }
8aa23a47
YO
2479 if (flags & SCF_WHILEM_VISITED_POS)
2480 f |= SCF_WHILEM_VISITED_POS;
2481
2482 /* we suppose the run is continuous, last=next...*/
2483 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2484 next, &data_fake,
2485 stopparen, recursed, NULL, f,depth+1);
2486 if (min1 > minnext)
2487 min1 = minnext;
2488 if (max1 < minnext + deltanext)
2489 max1 = minnext + deltanext;
2490 if (deltanext == I32_MAX)
2491 is_inf = is_inf_internal = 1;
2492 scan = next;
2493 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2494 pars++;
2495 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2496 if ( stopmin > minnext)
2497 stopmin = min + min1;
2498 flags &= ~SCF_DO_SUBSTR;
2499 if (data)
2500 data->flags |= SCF_SEEN_ACCEPT;
2501 }
2502 if (data) {
2503 if (data_fake.flags & SF_HAS_EVAL)
2504 data->flags |= SF_HAS_EVAL;
2505 data->whilem_c = data_fake.whilem_c;
3dab1dad 2506 }
8aa23a47
YO
2507 if (flags & SCF_DO_STCLASS)
2508 cl_or(pRExC_state, &accum, &this_class);
2509 }
2510 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2511 min1 = 0;
2512 if (flags & SCF_DO_SUBSTR) {
2513 data->pos_min += min1;
2514 data->pos_delta += max1 - min1;
2515 if (max1 != min1 || is_inf)
2516 data->longest = &(data->longest_float);
2517 }
2518 min += min1;
2519 delta += max1 - min1;
2520 if (flags & SCF_DO_STCLASS_OR) {
2521 cl_or(pRExC_state, data->start_class, &accum);
2522 if (min1) {
2523 cl_and(data->start_class, and_withp);
2524 flags &= ~SCF_DO_STCLASS;
653099ff 2525 }
8aa23a47
YO
2526 }
2527 else if (flags & SCF_DO_STCLASS_AND) {
2528 if (min1) {
2529 cl_and(data->start_class, &accum);
2530 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2531 }
8aa23a47
YO
2532 else {
2533 /* Switch to OR mode: cache the old value of
2534 * data->start_class */
2535 INIT_AND_WITHP;
2536 StructCopy(data->start_class, and_withp,
2537 struct regnode_charclass_class);
2538 flags &= ~SCF_DO_STCLASS_AND;
2539 StructCopy(&accum, data->start_class,
2540 struct regnode_charclass_class);
2541 flags |= SCF_DO_STCLASS_OR;
2542 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2543 }
8aa23a47 2544 }
a3621e74 2545
8aa23a47
YO
2546 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2547 /* demq.
a3621e74 2548
8aa23a47
YO
2549 Assuming this was/is a branch we are dealing with: 'scan' now
2550 points at the item that follows the branch sequence, whatever
2551 it is. We now start at the beginning of the sequence and look
2552 for subsequences of
a3621e74 2553
8aa23a47
YO
2554 BRANCH->EXACT=>x1
2555 BRANCH->EXACT=>x2
2556 tail
a3621e74 2557
8aa23a47 2558 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2559
8aa23a47
YO
2560 If we can find such a subseqence we need to turn the first
2561 element into a trie and then add the subsequent branch exact
2562 strings to the trie.
a3621e74 2563
8aa23a47 2564 We have two cases
a3621e74 2565
8aa23a47 2566 1. patterns where the whole set of branch can be converted.
a3621e74 2567
8aa23a47 2568 2. patterns where only a subset can be converted.
a3621e74 2569
8aa23a47
YO
2570 In case 1 we can replace the whole set with a single regop
2571 for the trie. In case 2 we need to keep the start and end
2572 branchs so
a3621e74 2573
8aa23a47
YO
2574 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2575 becomes BRANCH TRIE; BRANCH X;
786e8c11 2576
8aa23a47
YO
2577 There is an additional case, that being where there is a
2578 common prefix, which gets split out into an EXACT like node
2579 preceding the TRIE node.
a3621e74 2580
8aa23a47
YO
2581 If x(1..n)==tail then we can do a simple trie, if not we make
2582 a "jump" trie, such that when we match the appropriate word
2583 we "jump" to the appopriate tail node. Essentailly we turn
2584 a nested if into a case structure of sorts.
b515a41d 2585
8aa23a47
YO
2586 */
2587
2588 int made=0;
2589 if (!re_trie_maxbuff) {
2590 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2591 if (!SvIOK(re_trie_maxbuff))
2592 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2593 }
2594 if ( SvIV(re_trie_maxbuff)>=0 ) {
2595 regnode *cur;
2596 regnode *first = (regnode *)NULL;
2597 regnode *last = (regnode *)NULL;
2598 regnode *tail = scan;
2599 U8 optype = 0;
2600 U32 count=0;
a3621e74
YO
2601
2602#ifdef DEBUGGING
8aa23a47 2603 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2604#endif
8aa23a47
YO
2605 /* var tail is used because there may be a TAIL
2606 regop in the way. Ie, the exacts will point to the
2607 thing following the TAIL, but the last branch will
2608 point at the TAIL. So we advance tail. If we
2609 have nested (?:) we may have to move through several
2610 tails.
2611 */
2612
2613 while ( OP( tail ) == TAIL ) {
2614 /* this is the TAIL generated by (?:) */
2615 tail = regnext( tail );
2616 }
a3621e74 2617
8aa23a47
YO
2618
2619 DEBUG_OPTIMISE_r({
2620 regprop(RExC_rx, mysv, tail );
2621 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2622 (int)depth * 2 + 2, "",
2623 "Looking for TRIE'able sequences. Tail node is: ",
2624 SvPV_nolen_const( mysv )
2625 );
2626 });
2627
2628 /*
2629
2630 step through the branches, cur represents each
2631 branch, noper is the first thing to be matched
2632 as part of that branch and noper_next is the
2633 regnext() of that node. if noper is an EXACT
2634 and noper_next is the same as scan (our current
2635 position in the regex) then the EXACT branch is
2636 a possible optimization target. Once we have
2637 two or more consequetive such branches we can
2638 create a trie of the EXACT's contents and stich
2639 it in place. If the sequence represents all of
2640 the branches we eliminate the whole thing and
2641 replace it with a single TRIE. If it is a
2642 subsequence then we need to stitch it in. This
2643 means the first branch has to remain, and needs
2644 to be repointed at the item on the branch chain
2645 following the last branch optimized. This could
2646 be either a BRANCH, in which case the
2647 subsequence is internal, or it could be the
2648 item following the branch sequence in which
2649 case the subsequence is at the end.
2650
2651 */
2652
2653 /* dont use tail as the end marker for this traverse */
2654 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2655 regnode * const noper = NEXTOPER( cur );
b515a41d 2656#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2657 regnode * const noper_next = regnext( noper );
b515a41d
YO
2658#endif
2659
8aa23a47
YO
2660 DEBUG_OPTIMISE_r({
2661 regprop(RExC_rx, mysv, cur);
2662 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2663 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2664
2665 regprop(RExC_rx, mysv, noper);
2666 PerlIO_printf( Perl_debug_log, " -> %s",
2667 SvPV_nolen_const(mysv));
2668
2669 if ( noper_next ) {
2670 regprop(RExC_rx, mysv, noper_next );
2671 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2672 SvPV_nolen_const(mysv));
2673 }
2674 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2675 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2676 });
2677 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2678 : PL_regkind[ OP( noper ) ] == EXACT )
2679 || OP(noper) == NOTHING )
786e8c11 2680#ifdef NOJUMPTRIE
8aa23a47 2681 && noper_next == tail
786e8c11 2682#endif
8aa23a47
YO
2683 && count < U16_MAX)
2684 {
2685 count++;
2686 if ( !first || optype == NOTHING ) {
2687 if (!first) first = cur;
2688 optype = OP( noper );
2689 } else {
2690 last = cur;
2691 }
2692 } else {
2693 if ( last ) {
2694 make_trie( pRExC_state,
2695 startbranch, first, cur, tail, count,
2696 optype, depth+1 );
2697 }
2698 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2699#ifdef NOJUMPTRIE
8aa23a47 2700 && noper_next == tail
786e8c11 2701#endif
8aa23a47
YO
2702 ){
2703 count = 1;
2704 first = cur;
2705 optype = OP( noper );
2706 } else {
2707 count = 0;
2708 first = NULL;
2709 optype = 0;
2710 }
2711 last = NULL;
2712 }
2713 }
2714 DEBUG_OPTIMISE_r({
2715 regprop(RExC_rx, mysv, cur);
2716 PerlIO_printf( Perl_debug_log,
2717 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2718 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2719
2720 });
2721 if ( last ) {
2722 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2723#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2724 if ( ((made == MADE_EXACT_TRIE &&
2725 startbranch == first)
2726 || ( first_non_open == first )) &&
2727 depth==0 ) {
2728 flags |= SCF_TRIE_RESTUDY;
2729 if ( startbranch == first
2730 && scan == tail )
2731 {
2732 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2733 }
2734 }
3dab1dad 2735#endif
8aa23a47
YO
2736 }
2737 }
2738
2739 } /* do trie */
2740
653099ff 2741 }
8aa23a47
YO
2742 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2743 scan = NEXTOPER(NEXTOPER(scan));
2744 } else /* single branch is optimized. */
2745 scan = NEXTOPER(scan);
2746 continue;
2747 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2748 scan_frame *newframe = NULL;
2749 I32 paren;
2750 regnode *start;
2751 regnode *end;
2752
2753 if (OP(scan) != SUSPEND) {
2754 /* set the pointer */
2755 if (OP(scan) == GOSUB) {
2756 paren = ARG(scan);
2757 RExC_recurse[ARG2L(scan)] = scan;
2758 start = RExC_open_parens[paren-1];
2759 end = RExC_close_parens[paren-1];
2760 } else {
2761 paren = 0;
f8fc2ecf 2762 start = RExC_rxi->program + 1;
8aa23a47
YO
2763 end = RExC_opend;
2764 }
2765 if (!recursed) {
2766 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2767 SAVEFREEPV(recursed);
2768 }
2769 if (!PAREN_TEST(recursed,paren+1)) {
2770 PAREN_SET(recursed,paren+1);
2771 Newx(newframe,1,scan_frame);
2772 } else {
2773 if (flags & SCF_DO_SUBSTR) {
304ee84b 2774 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2775 data->longest = &(data->longest_float);
2776 }
2777 is_inf = is_inf_internal = 1;
2778 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2779 cl_anything(pRExC_state, data->start_class);
2780 flags &= ~SCF_DO_STCLASS;
2781 }
2782 } else {
2783 Newx(newframe,1,scan_frame);
2784 paren = stopparen;
2785 start = scan+2;
2786 end = regnext(scan);
2787 }
2788 if (newframe) {
2789 assert(start);
2790 assert(end);
2791 SAVEFREEPV(newframe);
2792 newframe->next = regnext(scan);
2793 newframe->last = last;
2794 newframe->stop = stopparen;
2795 newframe->prev = frame;
2796
2797 frame = newframe;
2798 scan = start;
2799 stopparen = paren;
2800 last = end;
2801
2802 continue;
2803 }
2804 }
2805 else if (OP(scan) == EXACT) {
2806 I32 l = STR_LEN(scan);
2807 UV uc;
2808 if (UTF) {
2809 const U8 * const s = (U8*)STRING(scan);
2810 l = utf8_length(s, s + l);
2811 uc = utf8_to_uvchr(s, NULL);
2812 } else {
2813 uc = *((U8*)STRING(scan));
2814 }
2815 min += l;
2816 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2817 /* The code below prefers earlier match for fixed
2818 offset, later match for variable offset. */
2819 if (data->last_end == -1) { /* Update the start info. */
2820 data->last_start_min = data->pos_min;
2821 data->last_start_max = is_inf
2822 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2823 }
8aa23a47
YO
2824 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2825 if (UTF)
2826 SvUTF8_on(data->last_found);
2827 {
2828 SV * const sv = data->last_found;
2829 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2830 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2831 if (mg && mg->mg_len >= 0)
2832 mg->mg_len += utf8_length((U8*)STRING(scan),
2833 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2834 }
8aa23a47
YO
2835 data->last_end = data->pos_min + l;
2836 data->pos_min += l; /* As in the first entry. */
2837 data->flags &= ~SF_BEFORE_EOL;
2838 }
2839 if (flags & SCF_DO_STCLASS_AND) {
2840 /* Check whether it is compatible with what we know already! */
2841 int compat = 1;
2842
2843 if (uc >= 0x100 ||
2844 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2845 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2846 && (!(data->start_class->flags & ANYOF_FOLD)
2847 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2848 )
2849 compat = 0;
2850 ANYOF_CLASS_ZERO(data->start_class);
2851 ANYOF_BITMAP_ZERO(data->start_class);
2852 if (compat)
2853 ANYOF_BITMAP_SET(data->start_class, uc);
2854 data->start_class->flags &= ~ANYOF_EOS;
2855 if (uc < 0x100)
2856 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2857 }
2858 else if (flags & SCF_DO_STCLASS_OR) {
2859 /* false positive possible if the class is case-folded */
2860 if (uc < 0x100)
2861 ANYOF_BITMAP_SET(data->start_class, uc);
2862 else
2863 data->start_class->flags |= ANYOF_UNICODE_ALL;
2864 data->start_class->flags &= ~ANYOF_EOS;
2865 cl_and(data->start_class, and_withp);
2866 }
2867 flags &= ~SCF_DO_STCLASS;
2868 }
2869 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2870 I32 l = STR_LEN(scan);
2871 UV uc = *((U8*)STRING(scan));
2872
2873 /* Search for fixed substrings supports EXACT only. */
2874 if (flags & SCF_DO_SUBSTR) {
2875 assert(data);
304ee84b 2876 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2877 }
2878 if (UTF) {
2879 const U8 * const s = (U8 *)STRING(scan);
2880 l = utf8_length(s, s + l);
2881 uc = utf8_to_uvchr(s, NULL);
2882 }
2883 min += l;
2884 if (flags & SCF_DO_SUBSTR)
2885 data->pos_min += l;
2886 if (flags & SCF_DO_STCLASS_AND) {
2887 /* Check whether it is compatible with what we know already! */
2888 int compat = 1;
2889
2890 if (uc >= 0x100 ||
2891 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2892 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2893 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2894 compat = 0;
2895 ANYOF_CLASS_ZERO(data->start_class);
2896 ANYOF_BITMAP_ZERO(data->start_class);
2897 if (compat) {
2898 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2899 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2900 data->start_class->flags |= ANYOF_FOLD;
2901 if (OP(scan) == EXACTFL)
2902 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2903 }
8aa23a47
YO
2904 }
2905 else if (flags & SCF_DO_STCLASS_OR) {
2906 if (data->start_class->flags & ANYOF_FOLD) {
2907 /* false positive possible if the class is case-folded.
2908 Assume that the locale settings are the same... */
1aa99e6b
IH
2909 if (uc < 0x100)
2910 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2911 data->start_class->flags &= ~ANYOF_EOS;
2912 }
8aa23a47 2913 cl_and(data->start_class, and_withp);
653099ff 2914 }
8aa23a47
YO
2915 flags &= ~SCF_DO_STCLASS;
2916 }
2917 else if (strchr((const char*)PL_varies,OP(scan))) {
2918 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2919 I32 f = flags, pos_before = 0;
2920 regnode * const oscan = scan;
2921 struct regnode_charclass_class this_class;
2922 struct regnode_charclass_class *oclass = NULL;
2923 I32 next_is_eval = 0;
2924
2925 switch (PL_regkind[OP(scan)]) {
2926 case WHILEM: /* End of (?:...)* . */
2927 scan = NEXTOPER(scan);
2928 goto finish;
2929 case PLUS:
2930 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2931 next = NEXTOPER(scan);
2932 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2933 mincount = 1;
2934 maxcount = REG_INFTY;
2935 next = regnext(scan);
2936 scan = NEXTOPER(scan);
2937 goto do_curly;
2938 }
2939 }
2940 if (flags & SCF_DO_SUBSTR)
2941 data->pos_min++;
2942 min++;
2943 /* Fall through. */
2944 case STAR:
2945 if (flags & SCF_DO_STCLASS) {
2946 mincount = 0;
2947 maxcount = REG_INFTY;
2948 next = regnext(scan);
2949 scan = NEXTOPER(scan);
2950 goto do_curly;
2951 }
2952 is_inf = is_inf_internal = 1;
2953 scan = regnext(scan);
c277df42 2954 if (flags & SCF_DO_SUBSTR) {
304ee84b 2955 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2956 data->longest = &(data->longest_float);
c277df42 2957 }
8aa23a47
YO
2958 goto optimize_curly_tail;
2959 case CURLY:
2960 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2961 && (scan->flags == stopparen))
2962 {
2963 mincount = 1;
2964 maxcount = 1;
2965 } else {
2966 mincount = ARG1(scan);
2967 maxcount = ARG2(scan);
653099ff 2968 }
8aa23a47
YO
2969 next = regnext(scan);
2970 if (OP(scan) == CURLYX) {
2971 I32 lp = (data ? *(data->last_closep) : 0);
2972 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2973 }
8aa23a47
YO
2974 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2975 next_is_eval = (OP(scan) == EVAL);
2976 do_curly:
2977 if (flags & SCF_DO_SUBSTR) {
304ee84b 2978 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2979 pos_before = data->pos_min;
b45f050a 2980 }
8aa23a47
YO
2981 if (data) {
2982 fl = data->flags;
2983 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2984 if (is_inf)
2985 data->flags |= SF_IS_INF;
2986 }
2987 if (flags & SCF_DO_STCLASS) {
2988 cl_init(pRExC_state, &this_class);
2989 oclass = data->start_class;
2990 data->start_class = &this_class;
2991 f |= SCF_DO_STCLASS_AND;
2992 f &= ~SCF_DO_STCLASS_OR;
2993 }
2994 /* These are the cases when once a subexpression
2995 fails at a particular position, it cannot succeed
2996 even after backtracking at the enclosing scope.
2997
2998 XXXX what if minimal match and we are at the
2999 initial run of {n,m}? */
3000 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3001 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3002
8aa23a47
YO
3003 /* This will finish on WHILEM, setting scan, or on NULL: */
3004 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3005 last, data, stopparen, recursed, NULL,
3006 (mincount == 0
3007 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3008
8aa23a47
YO
3009 if (flags & SCF_DO_STCLASS)
3010 data->start_class = oclass;
3011 if (mincount == 0 || minnext == 0) {
3012 if (flags & SCF_DO_STCLASS_OR) {
3013 cl_or(pRExC_state, data->start_class, &this_class);
3014 }
3015 else if (flags & SCF_DO_STCLASS_AND) {
3016 /* Switch to OR mode: cache the old value of
3017 * data->start_class */
3018 INIT_AND_WITHP;
3019 StructCopy(data->start_class, and_withp,
3020 struct regnode_charclass_class);
3021 flags &= ~SCF_DO_STCLASS_AND;
3022 StructCopy(&this_class, data->start_class,
3023 struct regnode_charclass_class);
3024 flags |= SCF_DO_STCLASS_OR;
3025 data->start_class->flags |= ANYOF_EOS;
3026 }
3027 } else { /* Non-zero len */
3028 if (flags & SCF_DO_STCLASS_OR) {
3029 cl_or(pRExC_state, data->start_class, &this_class);
3030 cl_and(data->start_class, and_withp);
3031 }
3032 else if (flags & SCF_DO_STCLASS_AND)
3033 cl_and(data->start_class, &this_class);
3034 flags &= ~SCF_DO_STCLASS;
3035 }
3036 if (!scan) /* It was not CURLYX, but CURLY. */
3037 scan = next;
3038 if ( /* ? quantifier ok, except for (?{ ... }) */
3039 (next_is_eval || !(mincount == 0 && maxcount == 1))
3040 && (minnext == 0) && (deltanext == 0)
3041 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3042 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3043 && ckWARN(WARN_REGEXP))
3044 {
3045 vWARN(RExC_parse,
3046 "Quantifier unexpected on zero-length expression");
3047 }
3048
3049 min += minnext * mincount;
3050 is_inf_internal |= ((maxcount == REG_INFTY
3051 && (minnext + deltanext) > 0)
3052 || deltanext == I32_MAX);
3053 is_inf |= is_inf_internal;
3054 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3055
3056 /* Try powerful optimization CURLYX => CURLYN. */
3057 if ( OP(oscan) == CURLYX && data
3058 && data->flags & SF_IN_PAR
3059 && !(data->flags & SF_HAS_EVAL)
3060 && !deltanext && minnext == 1 ) {
3061 /* Try to optimize to CURLYN. */
3062 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3063 regnode * const nxt1 = nxt;
497b47a8 3064#ifdef DEBUGGING
8aa23a47 3065 regnode *nxt2;
497b47a8 3066#endif
c277df42 3067
8aa23a47
YO
3068 /* Skip open. */
3069 nxt = regnext(nxt);
3070 if (!strchr((const char*)PL_simple,OP(nxt))
3071 && !(PL_regkind[OP(nxt)] == EXACT
3072 && STR_LEN(nxt) == 1))
3073 goto nogo;
497b47a8 3074#ifdef DEBUGGING
8aa23a47 3075 nxt2 = nxt;
497b47a8 3076#endif
8aa23a47
YO
3077 nxt = regnext(nxt);
3078 if (OP(nxt) != CLOSE)
3079 goto nogo;
3080 if (RExC_open_parens) {
3081 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3082 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3083 }
3084 /* Now we know that nxt2 is the only contents: */
3085 oscan->flags = (U8)ARG(nxt);
3086 OP(oscan) = CURLYN;
3087 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3088
c277df42 3089#ifdef DEBUGGING
8aa23a47
YO
3090 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3091 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3092 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3093 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3094 OP(nxt + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3096#endif
8aa23a47
YO
3097 }
3098 nogo:
3099
3100 /* Try optimization CURLYX => CURLYM. */
3101 if ( OP(oscan) == CURLYX && data
3102 && !(data->flags & SF_HAS_PAR)
3103 && !(data->flags & SF_HAS_EVAL)
3104 && !deltanext /* atom is fixed width */
3105 && minnext != 0 /* CURLYM can't handle zero width */
3106 ) {
3107 /* XXXX How to optimize if data == 0? */
3108 /* Optimize to a simpler form. */
3109 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3110 regnode *nxt2;
3111
3112 OP(oscan) = CURLYM;
3113 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3114 && (OP(nxt2) != WHILEM))
3115 nxt = nxt2;
3116 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3117 /* Need to optimize away parenths. */
3118 if (data->flags & SF_IN_PAR) {
3119 /* Set the parenth number. */
3120 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3121
3122 if (OP(nxt) != CLOSE)
3123 FAIL("Panic opt close");
3124 oscan->flags = (U8)ARG(nxt);
3125 if (RExC_open_parens) {
3126 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3127 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3128 }
8aa23a47
YO
3129 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3130 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3131
c277df42 3132#ifdef DEBUGGING
8aa23a47
YO
3133 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3134 OP(nxt + 1) = OPTIMIZED; /* was count. */
3135 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3136 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3137#endif
c277df42 3138#if 0
8aa23a47
YO
3139 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3140 regnode *nnxt = regnext(nxt1);
3141
3142 if (nnxt == nxt) {
3143 if (reg_off_by_arg[OP(nxt1)])
3144 ARG_SET(nxt1, nxt2 - nxt1);
3145 else if (nxt2 - nxt1 < U16_MAX)
3146 NEXT_OFF(nxt1) = nxt2 - nxt1;
3147 else
3148 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3149 }
8aa23a47 3150 nxt1 = nnxt;
c277df42 3151 }
5d1c421c 3152#endif
8aa23a47
YO
3153 /* Optimize again: */
3154 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3155 NULL, stopparen, recursed, NULL, 0,depth+1);
3156 }
3157 else
3158 oscan->flags = 0;
3159 }
3160 else if ((OP(oscan) == CURLYX)
3161 && (flags & SCF_WHILEM_VISITED_POS)
3162 /* See the comment on a similar expression above.
3163 However, this time it not a subexpression
3164 we care about, but the expression itself. */
3165 && (maxcount == REG_INFTY)
3166 && data && ++data->whilem_c < 16) {
3167 /* This stays as CURLYX, we can put the count/of pair. */
3168 /* Find WHILEM (as in regexec.c) */
3169 regnode *nxt = oscan + NEXT_OFF(oscan);
3170
3171 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3172 nxt += ARG(nxt);
3173 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3174 | (RExC_whilem_seen << 4)); /* On WHILEM */
3175 }
3176 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3177 pars++;
3178 if (flags & SCF_DO_SUBSTR) {
3179 SV *last_str = NULL;
3180 int counted = mincount != 0;
a0ed51b3 3181
8aa23a47
YO
3182 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3183#if defined(SPARC64_GCC_WORKAROUND)
3184 I32 b = 0;
3185 STRLEN l = 0;
3186 const char *s = NULL;
3187 I32 old = 0;
b515a41d 3188
8aa23a47
YO
3189 if (pos_before >= data->last_start_min)
3190 b = pos_before;
3191 else
3192 b = data->last_start_min;
b515a41d 3193
8aa23a47
YO
3194 l = 0;
3195 s = SvPV_const(data->last_found, l);
3196 old = b - data->last_start_min;
3197
3198#else
3199 I32 b = pos_before >= data->last_start_min
3200 ? pos_before : data->last_start_min;
3201 STRLEN l;
3202 const char * const s = SvPV_const(data->last_found, l);
3203 I32 old = b - data->last_start_min;
3204#endif
3205
3206 if (UTF)
3207 old = utf8_hop((U8*)s, old) - (U8*)s;
3208
3209 l -= old;
3210 /* Get the added string: */
3211 last_str = newSVpvn(s + old, l);
3212 if (UTF)
3213 SvUTF8_on(last_str);
3214 if (deltanext == 0 && pos_before == b) {
3215 /* What was added is a constant string */
3216 if (mincount > 1) {
3217 SvGROW(last_str, (mincount * l) + 1);
3218 repeatcpy(SvPVX(last_str) + l,
3219 SvPVX_const(last_str), l, mincount - 1);
3220 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3221 /* Add additional parts. */
3222 SvCUR_set(data->last_found,
3223 SvCUR(data->last_found) - l);
3224 sv_catsv(data->last_found, last_str);
3225 {
3226 SV * sv = data->last_found;
3227 MAGIC *mg =
3228 SvUTF8(sv) && SvMAGICAL(sv) ?
3229 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3230 if (mg && mg->mg_len >= 0)
3231 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3232 }
8aa23a47 3233 data->last_end += l * (mincount - 1);
b515a41d 3234 }
8aa23a47
YO
3235 } else {
3236 /* start offset must point into the last copy */
3237 data->last_start_min += minnext * (mincount - 1);
3238 data->last_start_max += is_inf ? I32_MAX
3239 : (maxcount - 1) * (minnext + data->pos_delta);
3240 }
c277df42 3241 }
8aa23a47
YO
3242 /* It is counted once already... */
3243 data->pos_min += minnext * (mincount - counted);
3244 data->pos_delta += - counted * deltanext +
3245 (minnext + deltanext) * maxcount - minnext * mincount;
3246 if (mincount != maxcount) {
3247 /* Cannot extend fixed substrings found inside
3248 the group. */
304ee84b 3249 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3250 if (mincount && last_str) {
3251 SV * const sv = data->last_found;
3252 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3253 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3254
3255 if (mg)
3256 mg->mg_len = -1;
3257 sv_setsv(sv, last_str);
3258 data->last_end = data->pos_min;
3259 data->last_start_min =
3260 data->pos_min - CHR_SVLEN(last_str);
3261 data->last_start_max = is_inf
3262 ? I32_MAX
3263 : data->pos_min + data->pos_delta
3264 - CHR_SVLEN(last_str);
3265 }
3266 data->longest = &(data->longest_float);
3267 }
3268 SvREFCNT_dec(last_str);
c277df42 3269 }
8aa23a47
YO
3270 if (data && (fl & SF_HAS_EVAL))
3271 data->flags |= SF_HAS_EVAL;
3272 optimize_curly_tail:
3273 if (OP(oscan) != CURLYX) {
3274 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3275 && NEXT_OFF(next))
3276 NEXT_OFF(oscan) += NEXT_OFF(next);
3277 }
3278 continue;
3279 default: /* REF and CLUMP only? */
3280 if (flags & SCF_DO_SUBSTR) {
304ee84b 3281 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3282 data->longest = &(data->longest_float);
3283 }
3284 is_inf = is_inf_internal = 1;
3285 if (flags & SCF_DO_STCLASS_OR)
3286 cl_anything(pRExC_state, data->start_class);
3287 flags &= ~SCF_DO_STCLASS;
3288 break;
c277df42 3289 }
8aa23a47
YO
3290 }
3291 else if (strchr((const char*)PL_simple,OP(scan))) {
3292 int value = 0;
653099ff 3293
8aa23a47 3294 if (flags & SCF_DO_SUBSTR) {
304ee84b 3295 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3296 data->pos_min++;
3297 }
3298 min++;
3299 if (flags & SCF_DO_STCLASS) {
3300 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3301
8aa23a47
YO
3302 /* Some of the logic below assumes that switching
3303 locale on will only add false positives. */
3304 switch (PL_regkind[OP(scan)]) {
3305 case SANY:
3306 default:
3307 do_default:
3308 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3309 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3310 cl_anything(pRExC_state, data->start_class);
3311 break;
3312 case REG_ANY:
3313 if (OP(scan) == SANY)
3314 goto do_default;
3315 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3316 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3317 || (data->start_class->flags & ANYOF_CLASS));
3318 cl_anything(pRExC_state, data->start_class);
653099ff 3319 }
8aa23a47
YO
3320 if (flags & SCF_DO_STCLASS_AND || !value)
3321 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3322 break;
3323 case ANYOF:
3324 if (flags & SCF_DO_STCLASS_AND)
3325 cl_and(data->start_class,
3326 (struct regnode_charclass_class*)scan);
653099ff 3327 else
8aa23a47
YO
3328 cl_or(pRExC_state, data->start_class,
3329 (struct regnode_charclass_class*)scan);
3330 break;
3331 case ALNUM:
3332 if (flags & SCF_DO_STCLASS_AND) {
3333 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3335 for (value = 0; value < 256; value++)
3336 if (!isALNUM(value))
3337 ANYOF_BITMAP_CLEAR(data->start_class, value);
3338 }
653099ff 3339 }
8aa23a47
YO
3340 else {
3341 if (data->start_class->flags & ANYOF_LOCALE)
3342 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3343 else {
3344 for (value = 0; value < 256; value++)
3345 if (isALNUM(value))
3346 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3347 }
8aa23a47
YO
3348 }
3349 break;
3350 case ALNUML:
3351 if (flags & SCF_DO_STCLASS_AND) {
3352 if (data->start_class->flags & ANYOF_LOCALE)
3353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3354 }
3355 else {
3356 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3357 data->start_class->flags |= ANYOF_LOCALE;
3358 }
3359 break;
3360 case NALNUM:
3361 if (flags & SCF_DO_STCLASS_AND) {
3362 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3364 for (value = 0; value < 256; value++)
3365 if (isALNUM(value))
3366 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3367 }
3368 }
8aa23a47
YO
3369 else {
3370 if (data->start_class->flags & ANYOF_LOCALE)
3371 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3372 else {
3373 for (value = 0; value < 256; value++)
3374 if (!isALNUM(value))
3375 ANYOF_BITMAP_SET(data->start_class, value);
3376 }
653099ff 3377 }
8aa23a47
YO
3378 break;
3379 case NALNUML:
3380 if (flags & SCF_DO_STCLASS_AND) {
3381 if (data->start_class->flags & ANYOF_LOCALE)
3382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3383 }
8aa23a47
YO
3384 else {
3385 data->start_class->flags |= ANYOF_LOCALE;
3386 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3387 }
3388 break;
3389 case SPACE:
3390 if (flags & SCF_DO_STCLASS_AND) {
3391 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3393 for (value = 0; value < 256; value++)
3394 if (!isSPACE(value))
3395 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3396 }
3397 }
8aa23a47
YO
3398 else {
3399 if (data->start_class->flags & ANYOF_LOCALE)
3400 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3401 else {
3402 for (value = 0; value < 256; value++)
3403 if (isSPACE(value))
3404 ANYOF_BITMAP_SET(data->start_class, value);
3405 }
653099ff 3406 }
8aa23a47
YO
3407 break;
3408 case SPACEL:
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 if (data->start_class->flags & ANYOF_LOCALE)
3411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3412 }
3413 else {
3414 data->start_class->flags |= ANYOF_LOCALE;
3415 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3416 }
3417 break;
3418 case NSPACE:
3419 if (flags & SCF_DO_STCLASS_AND) {
3420 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3422 for (value = 0; value < 256; value++)
3423 if (isSPACE(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3425 }
8aa23a47
YO
3426 }
3427 else {
3428 if (data->start_class->flags & ANYOF_LOCALE)
3429 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3430 else {
3431 for (value = 0; value < 256; value++)
3432 if (!isSPACE(value))
3433 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3434 }
3435 }
8aa23a47
YO
3436 break;
3437 case NSPACEL:
3438 if (flags & SCF_DO_STCLASS_AND) {
3439 if (data->start_class->flags & ANYOF_LOCALE) {
3440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3441 for (value = 0; value < 256; value++)
3442 if (!isSPACE(value))
3443 ANYOF_BITMAP_CLEAR(data->start_class, value);
3444 }
653099ff 3445 }
8aa23a47
YO
3446 else {
3447 data->start_class->flags |= ANYOF_LOCALE;
3448 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3449 }
3450 break;
3451 case DIGIT:
3452 if (flags & SCF_DO_STCLASS_AND) {
3453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3454 for (value = 0; value < 256; value++)
3455 if (!isDIGIT(value))
3456 ANYOF_BITMAP_CLEAR(data->start_class, value);
3457 }
3458 else {
3459 if (data->start_class->flags & ANYOF_LOCALE)
3460 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3461 else {
3462 for (value = 0; value < 256; value++)
3463 if (isDIGIT(value))
3464 ANYOF_BITMAP_SET(data->start_class, value);
3465 }
3466 }
3467 break;
3468 case NDIGIT:
3469 if (flags & SCF_DO_STCLASS_AND) {
3470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3471 for (value = 0; value < 256; value++)
3472 if (isDIGIT(value))
3473 ANYOF_BITMAP_CLEAR(data->start_class, value);
3474 }
3475 else {
3476 if (data->start_class->flags & ANYOF_LOCALE)
3477 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3478 else {
3479 for (value = 0; value < 256; value++)
3480 if (!isDIGIT(value))
3481 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3482 }
3483 }
8aa23a47
YO
3484 break;
3485 }
3486 if (flags & SCF_DO_STCLASS_OR)
3487 cl_and(data->start_class, and_withp);
3488 flags &= ~SCF_DO_STCLASS;
3489 }
3490 }
3491 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3492 data->flags |= (OP(scan) == MEOL
3493 ? SF_BEFORE_MEOL
3494 : SF_BEFORE_SEOL);
3495 }
3496 else if ( PL_regkind[OP(scan)] == BRANCHJ
3497 /* Lookbehind, or need to calculate parens/evals/stclass: */
3498 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3499 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3500 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3501 || OP(scan) == UNLESSM )
3502 {
3503 /* Negative Lookahead/lookbehind
3504 In this case we can't do fixed string optimisation.
3505 */
1de06328 3506
8aa23a47
YO
3507 I32 deltanext, minnext, fake = 0;
3508 regnode *nscan;
3509 struct regnode_charclass_class intrnl;
3510 int f = 0;
1de06328 3511
8aa23a47
YO
3512 data_fake.flags = 0;
3513 if (data) {
3514 data_fake.whilem_c = data->whilem_c;
3515 data_fake.last_closep = data->last_closep;
c277df42 3516 }
8aa23a47
YO
3517 else
3518 data_fake.last_closep = &fake;
58e23c8d 3519 data_fake.pos_delta = delta;
8aa23a47
YO
3520 if ( flags & SCF_DO_STCLASS && !scan->flags
3521 && OP(scan) == IFMATCH ) { /* Lookahead */
3522 cl_init(pRExC_state, &intrnl);
3523 data_fake.start_class = &intrnl;
3524 f |= SCF_DO_STCLASS_AND;
3525 }
3526 if (flags & SCF_WHILEM_VISITED_POS)
3527 f |= SCF_WHILEM_VISITED_POS;
3528 next = regnext(scan);
3529 nscan = NEXTOPER(NEXTOPER(scan));
3530 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3531 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3532 if (scan->flags) {
3533 if (deltanext) {
58e23c8d 3534 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3535 }
3536 else if (minnext > (I32)U8_MAX) {
58e23c8d 3537 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3538 }
3539 scan->flags = (U8)minnext;
3540 }
3541 if (data) {
3542 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3543 pars++;
3544 if (data_fake.flags & SF_HAS_EVAL)
3545 data->flags |= SF_HAS_EVAL;
3546 data->whilem_c = data_fake.whilem_c;
3547 }
3548 if (f & SCF_DO_STCLASS_AND) {
3549 const int was = (data->start_class->flags & ANYOF_EOS);
3550
3551 cl_and(data->start_class, &intrnl);
3552 if (was)
3553 data->start_class->flags |= ANYOF_EOS;
3554 }
cb434fcc 3555 }
8aa23a47
YO
3556#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3557 else {
3558 /* Positive Lookahead/lookbehind
3559 In this case we can do fixed string optimisation,
3560 but we must be careful about it. Note in the case of
3561 lookbehind the positions will be offset by the minimum
3562 length of the pattern, something we won't know about
3563 until after the recurse.
3564 */
3565 I32 deltanext, fake = 0;
3566 regnode *nscan;
3567 struct regnode_charclass_class intrnl;
3568 int f = 0;
3569 /* We use SAVEFREEPV so that when the full compile
3570 is finished perl will clean up the allocated
3571 minlens when its all done. This was we don't
3572 have to worry about freeing them when we know
3573 they wont be used, which would be a pain.
3574 */
3575 I32 *minnextp;
3576 Newx( minnextp, 1, I32 );
3577 SAVEFREEPV(minnextp);
3578
3579 if (data) {
3580 StructCopy(data, &data_fake, scan_data_t);
3581 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3582 f |= SCF_DO_SUBSTR;
3583 if (scan->flags)
304ee84b 3584 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3585 data_fake.last_found=newSVsv(data->last_found);
3586 }
3587 }
3588 else
3589 data_fake.last_closep = &fake;
3590 data_fake.flags = 0;
58e23c8d 3591 data_fake.pos_delta = delta;
8aa23a47
YO
3592 if (is_inf)
3593 data_fake.flags |= SF_IS_INF;
3594 if ( flags & SCF_DO_STCLASS && !scan->flags
3595 && OP(scan) == IFMATCH ) { /* Lookahead */
3596 cl_init(pRExC_state, &intrnl);
3597 data_fake.start_class = &intrnl;
3598 f |= SCF_DO_STCLASS_AND;
3599 }
3600 if (flags & SCF_WHILEM_VISITED_POS)
3601 f |= SCF_WHILEM_VISITED_POS;
3602 next = regnext(scan);
3603 nscan = NEXTOPER(NEXTOPER(scan));
3604
3605 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3606 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3607 if (scan->flags) {
3608 if (deltanext) {
58e23c8d 3609 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3610 }
3611 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3612 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3613 }
3614 scan->flags = (U8)*minnextp;
3615 }
3616
3617 *minnextp += min;
3618
3619 if (f & SCF_DO_STCLASS_AND) {
3620 const int was = (data->start_class->flags & ANYOF_EOS);
3621
3622 cl_and(data->start_class, &intrnl);
3623 if (was)
3624 data->start_class->flags |= ANYOF_EOS;
3625 }
3626 if (data) {
3627 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3628 pars++;
3629 if (data_fake.flags & SF_HAS_EVAL)
3630 data->flags |= SF_HAS_EVAL;
3631 data->whilem_c = data_fake.whilem_c;
3632 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3633 if (RExC_rx->minlen<*minnextp)
3634 RExC_rx->minlen=*minnextp;
304ee84b 3635 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3636 SvREFCNT_dec(data_fake.last_found);
3637
3638 if ( data_fake.minlen_fixed != minlenp )
3639 {
3640 data->offset_fixed= data_fake.offset_fixed;
3641 data->minlen_fixed= data_fake.minlen_fixed;
3642 data->lookbehind_fixed+= scan->flags;
3643 }
3644 if ( data_fake.minlen_float != minlenp )
3645 {
3646 data->minlen_float= data_fake.minlen_float;
3647 data->offset_float_min=data_fake.offset_float_min;
3648 data->offset_float_max=data_fake.offset_float_max;
3649 data->lookbehind_float+= scan->flags;
3650 }
3651 }
3652 }
3653
3654
40d049e4 3655 }
8aa23a47
YO
3656#endif
3657 }
3658 else if (OP(scan) == OPEN) {
3659 if (stopparen != (I32)ARG(scan))
3660 pars++;
3661 }
3662 else if (OP(scan) == CLOSE) {
3663 if (stopparen == (I32)ARG(scan)) {
3664 break;
3665 }
3666 if ((I32)ARG(scan) == is_par) {
3667 next = regnext(scan);
b515a41d 3668
8aa23a47
YO
3669 if ( next && (OP(next) != WHILEM) && next < last)
3670 is_par = 0; /* Disable optimization */
40d049e4 3671 }
8aa23a47
YO
3672 if (data)
3673 *(data->last_closep) = ARG(scan);
3674 }
3675 else if (OP(scan) == EVAL) {
c277df42
IZ
3676 if (data)
3677 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3678 }
3679 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3680 if (flags & SCF_DO_SUBSTR) {
304ee84b 3681 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3682 flags &= ~SCF_DO_SUBSTR;
40d049e4 3683 }
8aa23a47
YO
3684 if (data && OP(scan)==ACCEPT) {
3685 data->flags |= SCF_SEEN_ACCEPT;
3686 if (stopmin > min)
3687 stopmin = min;
e2e6a0f1 3688 }
8aa23a47
YO
3689 }
3690 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3691 {
0f5d15d6 3692 if (flags & SCF_DO_SUBSTR) {
304ee84b 3693 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3694 data->longest = &(data->longest_float);
3695 }
3696 is_inf = is_inf_internal = 1;
653099ff 3697 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3698 cl_anything(pRExC_state, data->start_class);
96776eda 3699 flags &= ~SCF_DO_STCLASS;
8aa23a47 3700 }
58e23c8d 3701 else if (OP(scan) == GPOS) {
bbe252da 3702 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3703 !(delta || is_inf || (data && data->pos_delta)))
3704 {
bbe252da
YO
3705 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3706 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3707 if (RExC_rx->gofs < (U32)min)
3708 RExC_rx->gofs = min;
3709 } else {
bbe252da 3710 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3711 RExC_rx->gofs = 0;
3712 }
3713 }
786e8c11 3714#ifdef TRIE_STUDY_OPT
40d049e4 3715#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3716 else if (PL_regkind[OP(scan)] == TRIE) {
3717 /* NOTE - There is similar code to this block above for handling
3718 BRANCH nodes on the initial study. If you change stuff here
3719 check there too. */
3720 regnode *trie_node= scan;
3721 regnode *tail= regnext(scan);
f8fc2ecf 3722 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3723 I32 max1 = 0, min1 = I32_MAX;
3724 struct regnode_charclass_class accum;
3725
3726 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3727 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3728 if (flags & SCF_DO_STCLASS)
3729 cl_init_zero(pRExC_state, &accum);
3730
3731 if (!trie->jump) {
3732 min1= trie->minlen;
3733 max1= trie->maxlen;
3734 } else {
3735 const regnode *nextbranch= NULL;
3736 U32 word;
3737
3738 for ( word=1 ; word <= trie->wordcount ; word++)
3739 {
3740 I32 deltanext=0, minnext=0, f = 0, fake;
3741 struct regnode_charclass_class this_class;
3742
3743 data_fake.flags = 0;
3744 if (data) {
3745 data_fake.whilem_c = data->whilem_c;
3746 data_fake.last_closep = data->last_closep;
3747 }
3748 else
3749 data_fake.last_closep = &fake;
58e23c8d 3750 data_fake.pos_delta = delta;
8aa23a47
YO
3751 if (flags & SCF_DO_STCLASS) {
3752 cl_init(pRExC_state, &this_class);
3753 data_fake.start_class = &this_class;
3754 f = SCF_DO_STCLASS_AND;
3755 }
3756 if (flags & SCF_WHILEM_VISITED_POS)
3757 f |= SCF_WHILEM_VISITED_POS;
3758
3759 if (trie->jump[word]) {
3760 if (!nextbranch)
3761 nextbranch = trie_node + trie->jump[0];
3762 scan= trie_node + trie->jump[word];
3763 /* We go from the jump point to the branch that follows
3764 it. Note this means we need the vestigal unused branches
3765 even though they arent otherwise used.
3766 */
3767 minnext = study_chunk(pRExC_state, &scan, minlenp,
3768 &deltanext, (regnode *)nextbranch, &data_fake,
3769 stopparen, recursed, NULL, f,depth+1);
3770 }
3771 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3772 nextbranch= regnext((regnode*)nextbranch);
3773
3774 if (min1 > (I32)(minnext + trie->minlen))
3775 min1 = minnext + trie->minlen;
3776 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3777 max1 = minnext + deltanext + trie->maxlen;
3778 if (deltanext == I32_MAX)
3779 is_inf = is_inf_internal = 1;
3780
3781 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3782 pars++;
3783 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3784 if ( stopmin > min + min1)
3785 stopmin = min + min1;
3786 flags &= ~SCF_DO_SUBSTR;
3787 if (data)
3788 data->flags |= SCF_SEEN_ACCEPT;
3789 }
3790 if (data) {
3791 if (data_fake.flags & SF_HAS_EVAL)
3792 data->flags |= SF_HAS_EVAL;
3793 data->whilem_c = data_fake.whilem_c;
3794 }
3795 if (flags & SCF_DO_STCLASS)
3796 cl_or(pRExC_state, &accum, &this_class);
3797 }
3798 }
3799 if (flags & SCF_DO_SUBSTR) {
3800 data->pos_min += min1;
3801 data->pos_delta += max1 - min1;
3802 if (max1 != min1 || is_inf)
3803 data->longest = &(data->longest_float);
3804 }
3805 min += min1;
3806 delta += max1 - min1;
3807 if (flags & SCF_DO_STCLASS_OR) {
3808 cl_or(pRExC_state, data->start_class, &accum);
3809 if (min1) {
3810 cl_and(data->start_class, and_withp);
3811 flags &= ~SCF_DO_STCLASS;
3812 }
3813 }
3814 else if (flags & SCF_DO_STCLASS_AND) {
3815 if (min1) {
3816 cl_and(data->start_class, &accum);
3817 flags &= ~SCF_DO_STCLASS;
3818 }
3819 else {
3820 /* Switch to OR mode: cache the old value of
3821 * data->start_class */
3822 INIT_AND_WITHP;
3823 StructCopy(data->start_class, and_withp,
3824 struct regnode_charclass_class);
3825 flags &= ~SCF_DO_STCLASS_AND;
3826 StructCopy(&accum, data->start_class,
3827 struct regnode_charclass_class);
3828 flags |= SCF_DO_STCLASS_OR;
3829 data->start_class->flags |= ANYOF_EOS;
3830 }
3831 }
3832 scan= tail;
3833 continue;
3834 }
786e8c11 3835#else
8aa23a47 3836 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3837 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3838 U8*bang=NULL;
3839
3840 min += trie->minlen;
3841 delta += (trie->maxlen - trie->minlen);
3842 flags &= ~SCF_DO_STCLASS; /* xxx */
3843 if (flags & SCF_DO_SUBSTR) {
304ee84b 3844 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3845 data->pos_min += trie->minlen;
3846 data->pos_delta += (trie->maxlen - trie->minlen);
3847 if (trie->maxlen != trie->minlen)
3848 data->longest = &(data->longest_float);
3849 }
3850 if (trie->jump) /* no more substrings -- for now /grr*/
3851 flags &= ~SCF_DO_SUBSTR;
b515a41d 3852 }
8aa23a47
YO
3853#endif /* old or new */
3854#endif /* TRIE_STUDY_OPT */
3855 /* Else: zero-length, ignore. */
3856 scan = regnext(scan);
3857 }
3858 if (frame) {
3859 last = frame->last;
3860 scan = frame->next;
3861 stopparen = frame->stop;
3862 frame = frame->prev;
3863 goto fake_study_recurse;
c277df42
IZ
3864 }
3865
3866 finish:
8aa23a47 3867 assert(!frame);
304ee84b 3868 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3869
c277df42 3870 *scanp = scan;
aca2d497 3871 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3872 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3873 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3874 if (is_par > (I32)U8_MAX)
c277df42
IZ
3875 is_par = 0;
3876 if (is_par && pars==1 && data) {
3877 data->flags |= SF_IN_PAR;
3878 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3879 }
3880 else if (pars && data) {
c277df42
IZ
3881 data->flags |= SF_HAS_PAR;
3882 data->flags &= ~SF_IN_PAR;
3883 }
653099ff 3884 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3885 cl_and(data->start_class, and_withp);
786e8c11
YO
3886 if (flags & SCF_TRIE_RESTUDY)
3887 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3888
304ee84b 3889 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3890
e2e6a0f1 3891 return min < stopmin ? min : stopmin;
c277df42
IZ
3892}
3893
2eccd3b2
NC
3894STATIC U32
3895S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3896{
4a4e7719
NC
3897 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3898
3899 Renewc(RExC_rxi->data,
3900 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3901 char, struct reg_data);
3902 if(count)
f8fc2ecf 3903 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3904 else
f8fc2ecf 3905 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3906 RExC_rxi->data->count = count + n;
3907 Copy(s, RExC_rxi->data->what + count, n, U8);
3908 return count;
c277df42
IZ
3909}
3910
f8149455 3911/*XXX: todo make this not included in a non debugging perl */
76234dfb 3912#ifndef PERL_IN_XSUB_RE
d88dccdf 3913void
864dbfa3 3914Perl_reginitcolors(pTHX)
d88dccdf 3915{
97aff369 3916 dVAR;
1df70142 3917 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3918 if (s) {
1df70142
AL
3919 char *t = savepv(s);
3920 int i = 0;
3921 PL_colors[0] = t;
d88dccdf 3922 while (++i < 6) {
1df70142
AL
3923 t = strchr(t, '\t');
3924 if (t) {
3925 *t = '\0';
3926 PL_colors[i] = ++t;
d88dccdf
IZ
3927 }
3928 else
1df70142 3929 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3930 }
3931 } else {
1df70142 3932 int i = 0;
b81d288d 3933 while (i < 6)
06b5626a 3934 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3935 }
3936 PL_colorset = 1;
3937}
76234dfb 3938#endif
8615cb43 3939
07be1b83 3940
786e8c11
YO
3941#ifdef TRIE_STUDY_OPT
3942#define CHECK_RESTUDY_GOTO \
3943 if ( \
3944 (data.flags & SCF_TRIE_RESTUDY) \
3945 && ! restudied++ \
3946 ) goto reStudy
3947#else
3948#define CHECK_RESTUDY_GOTO
3949#endif
f9f4320a 3950
a687059c 3951/*
e50aee73 3952 - pregcomp - compile a regular expression into internal code
a687059c
LW
3953 *
3954 * We can't allocate space until we know how big the compiled form will be,
3955 * but we can't compile it (and thus know how big it is) until we've got a
3956 * place to put the code. So we cheat: we compile it twice, once with code
3957 * generation turned off and size counting turned on, and once "for real".
3958 * This also means that we don't allocate space until we are sure that the
3959 * thing really will compile successfully, and we never have to move the
3960 * code and thus invalidate pointers into it. (Note that it has to be in
3961 * one piece because free() must be able to free it all.) [NB: not true in perl]
3962 *
3963 * Beware that the optimization-preparation code in here knows about some
3964 * of the structure of the compiled regexp. [I'll say.]
3965 */
b9b4dddf
YO
3966
3967
3968
f9f4320a 3969#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3970#define RE_ENGINE_PTR &PL_core_reg_engine
3971#else
f9f4320a
YO
3972extern const struct regexp_engine my_reg_engine;
3973#define RE_ENGINE_PTR &my_reg_engine
3974#endif
6d5c990f
RGS
3975
3976#ifndef PERL_IN_XSUB_RE
a687059c 3977regexp *
864dbfa3 3978Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3979{
97aff369 3980 dVAR;
6d5c990f 3981 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
3982 /* Dispatch a request to compile a regexp to correct
3983 regexp engine. */
f9f4320a
YO
3984 if (table) {
3985 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 3986 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 3987 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3988 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3989 DEBUG_COMPILE_r({
8d8756e7 3990 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3991 SvIV(*ptr));
3992 });
f2f78491 3993 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3994 }
b9b4dddf 3995 }
2a5d9b1d
RGS
3996 return Perl_re_compile(aTHX_ exp, xend, pm);
3997}
6d5c990f 3998#endif
2a5d9b1d
RGS
3999
4000regexp *
4001Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4002{
4003 dVAR;
a0d0e21e 4004 register regexp *r;
f8fc2ecf 4005 register regexp_internal *ri;
c277df42 4006 regnode *scan;
c277df42 4007 regnode *first;
a0d0e21e 4008 I32 flags;
a0d0e21e
LW
4009 I32 minlen = 0;
4010 I32 sawplus = 0;
4011 I32 sawopen = 0;
2c2d71f5 4012 scan_data_t data;
830247a4 4013 RExC_state_t RExC_state;
be8e71aa 4014 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4015#ifdef TRIE_STUDY_OPT
4016 int restudied= 0;
4017 RExC_state_t copyRExC_state;
4018#endif
2a5d9b1d 4019 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4020 DEBUG_r(if (!PL_colorset) reginitcolors());
4021
a0d0e21e 4022 if (exp == NULL)
c277df42 4023 FAIL("NULL regexp argument");
a0d0e21e 4024
a5961de5 4025 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4026
5cfc7842 4027 RExC_precomp = exp;
a3621e74 4028 DEBUG_COMPILE_r({
ab3bbdeb
YO
4029 SV *dsv= sv_newmortal();
4030 RE_PV_QUOTED_DECL(s, RExC_utf8,
4031 dsv, RExC_precomp, (xend - exp), 60);
4032 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4033 PL_colors[4],PL_colors[5],s);
a5961de5 4034 });
e2509266 4035 RExC_flags = pm->op_pmflags;
830247a4 4036 RExC_sawback = 0;
bbce6d69 4037
830247a4
IZ
4038 RExC_seen = 0;
4039 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4040 RExC_seen_evals = 0;
4041 RExC_extralen = 0;
c277df42 4042
bbce6d69 4043 /* First pass: determine size, legality. */
830247a4 4044 RExC_parse = exp;
fac92740 4045 RExC_start = exp;
830247a4
IZ
4046 RExC_end = xend;
4047 RExC_naughty = 0;
4048 RExC_npar = 1;
e2e6a0f1 4049 RExC_nestroot = 0;
830247a4
IZ
4050 RExC_size = 0L;
4051 RExC_emit = &PL_regdummy;
4052 RExC_whilem_seen = 0;
fc8cd66c 4053 RExC_charnames = NULL;
40d049e4
YO
4054 RExC_open_parens = NULL;
4055 RExC_close_parens = NULL;
4056 RExC_opend = NULL;
81714fb9 4057 RExC_paren_names = NULL;
40d049e4
YO
4058 RExC_recurse = NULL;
4059 RExC_recurse_count = 0;
81714fb9 4060
85ddcde9
JH
4061#if 0 /* REGC() is (currently) a NOP at the first pass.
4062 * Clever compilers notice this and complain. --jhi */
830247a4 4063 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4064#endif
3dab1dad
YO
4065 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4066 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4067 RExC_precomp = NULL;
a0d0e21e
LW
4068 return(NULL);
4069 }
07be1b83 4070 DEBUG_PARSE_r({
81714fb9
YO
4071 PerlIO_printf(Perl_debug_log,
4072 "Required size %"IVdf" nodes\n"
4073 "Starting second pass (creation)\n",
4074 (IV)RExC_size);
07be1b83
YO
4075 RExC_lastnum=0;
4076 RExC_lastparse=NULL;
4077 });
c277df42
IZ
4078 /* Small enough for pointer-storage convention?
4079 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4080 if (RExC_size >= 0x10000L && RExC_extralen)
4081 RExC_size += RExC_extralen;
c277df42 4082 else
830247a4
IZ
4083 RExC_extralen = 0;
4084 if (RExC_whilem_seen > 15)
4085 RExC_whilem_seen = 15;
a0d0e21e 4086
e2e6a0f1
YO
4087#ifdef DEBUGGING
4088 /* Make room for a sentinel value at the end of the program */
4089 RExC_size++;
4090#endif
4091
f9f4320a
YO
4092 /* Allocate space and zero-initialize. Note, the two step process
4093 of zeroing when in debug mode, thus anything assigned has to
4094 happen after that */
f8fc2ecf
YO
4095 Newxz(r, 1, regexp);
4096 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4097 char, regexp_internal);
4098 if ( r == NULL || ri == NULL )
b45f050a 4099 FAIL("Regexp out of space");
0f79a09d
GS
4100#ifdef DEBUGGING
4101 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4102 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4103#else
f8fc2ecf
YO
4104 /* bulk initialize base fields with 0. */
4105 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4106#endif
58e23c8d
YO
4107
4108 /* non-zero initialization begins here */
f8fc2ecf 4109 RXi_SET( r, ri );
f9f4320a 4110 r->engine= RE_ENGINE_PTR;
c277df42 4111 r->refcnt = 1;
bbce6d69 4112 r->prelen = xend - exp;
5cfc7842 4113 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4114 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4115 r->intflags = 0;
830247a4 4116 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4117
6bda09f9 4118 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4119 Newxz(RExC_open_parens, RExC_npar,regnode *);
4120 SAVEFREEPV(RExC_open_parens);
4121 Newxz(RExC_close_parens,RExC_npar,regnode *);
4122 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4123 }
4124
4125 /* Useful during FAIL. */
f8fc2ecf
YO
4126 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4127 if (ri->offsets) {
4128 ri->offsets[0] = RExC_size;
fac92740 4129 }
a3621e74 4130 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4131 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4132 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4133 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4134
830247a4 4135 RExC_rx = r;
f8fc2ecf 4136 RExC_rxi = ri;
bbce6d69 4137
4138 /* Second pass: emit code. */
e2509266 4139 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4140 RExC_parse = exp;
4141 RExC_end = xend;
4142 RExC_naughty = 0;
4143 RExC_npar = 1;
f8fc2ecf
YO
4144 RExC_emit_start = ri->program;
4145 RExC_emit = ri->program;
e2e6a0f1
YO
4146#ifdef DEBUGGING
4147 /* put a sentinal on the end of the program so we can check for
4148 overwrites */
f8fc2ecf 4149 ri->program[RExC_size].type = 255;
e2e6a0f1 4150#endif
2cd61cdb 4151 /* Store the count of eval-groups for security checks: */
f8149455 4152 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4153 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4154 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4155 return(NULL);
6bda09f9 4156
07be1b83
YO
4157 /* XXXX To minimize changes to RE engine we always allocate
4158 3-units-long substrs field. */
4159 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4160 if (RExC_recurse_count) {
4161 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4162 SAVEFREEPV(RExC_recurse);
4163 }
a0d0e21e 4164
07be1b83 4165reStudy:
1de06328 4166 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4167 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4168
07be1b83
YO
4169#ifdef TRIE_STUDY_OPT
4170 if ( restudied ) {
5d458dd8 4171 U32 seen=RExC_seen;
07be1b83 4172 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4173
4174 RExC_state = copyRExC_state;
4175 if (seen & REG_TOP_LEVEL_BRANCHES)
4176 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4177 else
4178 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4179 if (data.last_found) {
07be1b83 4180 SvREFCNT_dec(data.longest_fixed);
07be1b83 4181 SvREFCNT_dec(data.longest_float);
07be1b83 4182 SvREFCNT_dec(data.last_found);
1de06328 4183 }
40d049e4 4184 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4185 } else {
40d049e4 4186 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4187 copyRExC_state = RExC_state;
07be1b83 4188 }
40d049e4
YO
4189#else
4190 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4191#endif
fc8cd66c 4192
a0d0e21e 4193 /* Dig out information for optimizations. */
bbe252da 4194 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4195 pm->op_pmflags = RExC_flags;
a0ed51b3 4196 if (UTF)
bbe252da 4197 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4198 ri->regstclass = NULL;
830247a4 4199 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4200 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4201 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4202
1de06328
YO
4203 /* testing for BRANCH here tells us whether there is "must appear"
4204 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4205 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4206 I32 fake;
c5254dd6 4207 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4208 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4209 int stclass_flag;
07be1b83 4210 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4211
4212 first = scan;
c277df42 4213 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4214 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4215 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4216 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4217 /* for now we can't handle lookbehind IFMATCH*/
4218 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4219 (OP(first) == PLUS) ||
4220 (OP(first) == MINMOD) ||
653099ff 4221 /* An {n,m} with n>0 */
07be1b83
YO
4222 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4223 {
786e8c11 4224
a0d0e21e
LW
4225 if (OP(first) == PLUS)
4226 sawplus = 1;
4227 else
3dab1dad 4228 first += regarglen[OP(first)];
07be1b83
YO
4229 if (OP(first) == IFMATCH) {
4230 first = NEXTOPER(first);
4231 first += EXTRA_STEP_2ARGS;
7c167cea 4232 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4233 first = NEXTOPER(first);
a687059c
LW
4234 }
4235
a0d0e21e
LW
4236 /* Starting-point info. */
4237 again:
786e8c11 4238 DEBUG_PEEP("first:",first,0);
07be1b83 4239 /* Ignore EXACT as we deal with it later. */
3dab1dad 4240 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4241 if (OP(first) == EXACT)
6f207bd3 4242 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4243 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4244 ri->regstclass = first;
b3c9acc1 4245 }
07be1b83 4246#ifdef TRIE_STCLASS
786e8c11 4247 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4248 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4249 {
786e8c11 4250 regnode *trie_op;
07be1b83 4251 /* this can happen only on restudy */
786e8c11 4252 if ( OP(first) == TRIE ) {
c944940b 4253 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4254 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4255 StructCopy(first,trieop,struct regnode_1);
4256 trie_op=(regnode *)trieop;
4257 } else {
c944940b 4258 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4259 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4260 StructCopy(first,trieop,struct regnode_charclass);
4261 trie_op=(regnode *)trieop;
4262 }
1de06328 4263 OP(trie_op)+=2;
786e8c11 4264 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4265 ri->regstclass = trie_op;
07be1b83
YO
4266 }
4267#endif
bfed75c6 4268 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4269 ri->regstclass = first;
3dab1dad
YO
4270 else if (PL_regkind[OP(first)] == BOUND ||
4271 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4272 ri->regstclass = first;
3dab1dad 4273 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4274 r->extflags |= (OP(first) == MBOL
4275 ? RXf_ANCH_MBOL
cad2e5aa 4276 : (OP(first) == SBOL
bbe252da
YO
4277 ? RXf_ANCH_SBOL
4278 : RXf_ANCH_BOL));
a0d0e21e 4279 first = NEXTOPER(first);
774d564b 4280 goto again;
4281 }
4282 else if (OP(first) == GPOS) {
bbe252da 4283 r->extflags |= RXf_ANCH_GPOS;
774d564b 4284 first = NEXTOPER(first);
4285 goto again;
a0d0e21e 4286 }
cf2a2b69
YO
4287 else if ((!sawopen || !RExC_sawback) &&
4288 (OP(first) == STAR &&
3dab1dad 4289 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4290 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4291 {
4292 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4293 const int type =
4294 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4295 ? RXf_ANCH_MBOL
4296 : RXf_ANCH_SBOL;
4297 r->extflags |= type;
4298 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4299 first = NEXTOPER(first);
774d564b 4300 goto again;
a0d0e21e 4301 }
b81d288d 4302 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4303 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4304 /* x+ must match at the 1st pos of run of x's */
bbe252da 4305 r->intflags |= PREGf_SKIP;
a0d0e21e 4306
c277df42 4307 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4308#ifdef TRIE_STUDY_OPT
81714fb9 4309 DEBUG_PARSE_r(
be8e71aa
YO
4310 if (!restudied)
4311 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4312 (IV)(first - scan + 1))
4313 );
4314#else
81714fb9 4315 DEBUG_PARSE_r(
be8e71aa
YO
4316 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4317 (IV)(first - scan + 1))
4318 );
4319#endif
4320
4321
a0d0e21e
LW
4322 /*
4323 * If there's something expensive in the r.e., find the
4324 * longest literal string that must appear and make it the
4325 * regmust. Resolve ties in favor of later strings, since
4326 * the regstart check works with the beginning of the r.e.
4327 * and avoiding duplication strengthens checking. Not a
4328 * strong reason, but sufficient in the absence of others.
4329 * [Now we resolve ties in favor of the earlier string if
c277df42 4330 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4331 * earlier string may buy us something the later one won't.]
4332 */
de8c5301 4333
396482e1
GA
4334 data.longest_fixed = newSVpvs("");
4335 data.longest_float = newSVpvs("");
4336 data.last_found = newSVpvs("");
c277df42
IZ
4337 data.longest = &(data.longest_fixed);
4338 first = scan;
f8fc2ecf 4339 if (!ri->regstclass) {
830247a4 4340 cl_init(pRExC_state, &ch_class);
653099ff
GS
4341 data.start_class = &ch_class;
4342 stclass_flag = SCF_DO_STCLASS_AND;
4343 } else /* XXXX Check for BOUND? */
4344 stclass_flag = 0;
cb434fcc 4345 data.last_closep = &last_close;
de8c5301 4346
1de06328 4347 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4348 &data, -1, NULL, NULL,
4349 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4350
07be1b83 4351
786e8c11
YO
4352 CHECK_RESTUDY_GOTO;
4353
4354
830247a4 4355 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4356 && data.last_start_min == 0 && data.last_end > 0
830247a4 4357 && !RExC_seen_zerolen
2bf803e2 4358 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4359 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4360 r->extflags |= RXf_CHECK_ALL;
304ee84b 4361 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4362 SvREFCNT_dec(data.last_found);
4363
1de06328
YO
4364 /* Note that code very similar to this but for anchored string
4365 follows immediately below, changes may need to be made to both.
4366 Be careful.
4367 */
a0ed51b3 4368 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4369 if (longest_float_length
c277df42
IZ
4370 || (data.flags & SF_FL_BEFORE_EOL
4371 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4372 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4373 {
1182767e 4374 I32 t,ml;
cf93c79d 4375
1de06328 4376 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4377 && data.offset_fixed == data.offset_float_min
4378 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4379 goto remove_float; /* As in (a)+. */
4380
1de06328
YO
4381 /* copy the information about the longest float from the reg_scan_data
4382 over to the program. */
33b8afdf
JH
4383 if (SvUTF8(data.longest_float)) {
4384 r->float_utf8 = data.longest_float;
c445ea15 4385 r->float_substr = NULL;
33b8afdf
JH
4386 } else {
4387 r->float_substr = data.longest_float;
c445ea15 4388 r->float_utf8 = NULL;
33b8afdf 4389 }
1de06328
YO
4390 /* float_end_shift is how many chars that must be matched that
4391 follow this item. We calculate it ahead of time as once the
4392 lookbehind offset is added in we lose the ability to correctly
4393 calculate it.*/
4394 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4395 : (I32)longest_float_length;
1de06328
YO
4396 r->float_end_shift = ml - data.offset_float_min
4397 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4398 + data.lookbehind_float;
4399 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4400 r->float_max_offset = data.offset_float_max;
1182767e 4401 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4402 r->float_max_offset -= data.lookbehind_float;
4403
cf93c79d
IZ
4404 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4405 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4406 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4407 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4408 }
4409 else {
aca2d497 4410 remove_float:
c445ea15 4411 r->float_substr = r->float_utf8 = NULL;
c277df42 4412 SvREFCNT_dec(data.longest_float);
c5254dd6 4413 longest_float_length = 0;
a0d0e21e 4414 }
c277df42 4415
1de06328
YO
4416 /* Note that code very similar to this but for floating string
4417 is immediately above, changes may need to be made to both.
4418 Be careful.
4419 */
a0ed51b3 4420 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4421 if (longest_fixed_length
c277df42
IZ
4422 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4423 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4424 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4425 {
1182767e 4426 I32 t,ml;
cf93c79d 4427
1de06328
YO
4428 /* copy the information about the longest fixed
4429 from the reg_scan_data over to the program. */
33b8afdf
JH
4430 if (SvUTF8(data.longest_fixed)) {
4431 r->anchored_utf8 = data.longest_fixed;
c445ea15 4432 r->anchored_substr = NULL;
33b8afdf
JH
4433 } else {
4434 r->anchored_substr = data.longest_fixed;
c445ea15 4435 r->anchored_utf8 = NULL;
33b8afdf 4436 }
1de06328
YO
4437 /* fixed_end_shift is how many chars that must be matched that
4438 follow this item. We calculate it ahead of time as once the
4439 lookbehind offset is added in we lose the ability to correctly
4440 calculate it.*/
4441 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4442 : (I32)longest_fixed_length;
1de06328
YO
4443 r->anchored_end_shift = ml - data.offset_fixed
4444 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4445 + data.lookbehind_fixed;
4446 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4447
cf93c79d
IZ
4448 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4449 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4450 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4451 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4452 }
4453 else {
c445ea15 4454 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4455 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4456 longest_fixed_length = 0;
a0d0e21e 4457 }
f8fc2ecf
YO
4458 if (ri->regstclass
4459 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4460 ri->regstclass = NULL;
33b8afdf
JH
4461 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4462 && stclass_flag
653099ff 4463 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4464 && !cl_is_anything(data.start_class))
4465 {
2eccd3b2 4466 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4467
f8fc2ecf 4468 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4469 struct regnode_charclass_class);
4470 StructCopy(data.start_class,
f8fc2ecf 4471 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4472 struct regnode_charclass_class);
f8fc2ecf 4473 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4474 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4475 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4476 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4477 PerlIO_printf(Perl_debug_log,
a0288114 4478 "synthetic stclass \"%s\".\n",
3f7c398e 4479 SvPVX_const(sv));});
653099ff 4480 }
c277df42
IZ
4481
4482 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4483 if (longest_fixed_length > longest_float_length) {
1de06328 4484 r->check_end_shift = r->anchored_end_shift;
c277df42 4485 r->check_substr = r->anchored_substr;
33b8afdf 4486 r->check_utf8 = r->anchored_utf8;
c277df42 4487 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4488 if (r->extflags & RXf_ANCH_SINGLE)
4489 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4490 }
4491 else {
1de06328 4492 r->check_end_shift = r->float_end_shift;
c277df42 4493 r->check_substr = r->float_substr;
33b8afdf 4494 r->check_utf8 = r->float_utf8;
1de06328
YO
4495 r->check_offset_min = r->float_min_offset;
4496 r->check_offset_max = r->float_max_offset;
a0d0e21e 4497 }
30382c73
IZ
4498 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4499 This should be changed ASAP! */
bbe252da
YO
4500 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4501 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4502 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4503 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4504 }
1de06328
YO
4505 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4506 if ( (STRLEN)minlen < longest_float_length )
4507 minlen= longest_float_length;
4508 if ( (STRLEN)minlen < longest_fixed_length )
4509 minlen= longest_fixed_length;
4510 */
a0ed51b3
LW
4511 }
4512 else {
c277df42
IZ
4513 /* Several toplevels. Best we can is to set minlen. */
4514 I32 fake;
653099ff 4515 struct regnode_charclass_class ch_class;
cb434fcc 4516 I32 last_close = 0;
c277df42 4517
5d458dd8 4518 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4519
f8fc2ecf 4520 scan = ri->program + 1;
830247a4 4521 cl_init(pRExC_state, &ch_class);
653099ff 4522 data.start_class = &ch_class;
cb434fcc 4523 data.last_closep = &last_close;
07be1b83 4524
de8c5301 4525
1de06328 4526 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4527 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4528
786e8c11 4529 CHECK_RESTUDY_GOTO;
07be1b83 4530
33b8afdf 4531 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4532 = r->float_substr = r->float_utf8 = NULL;
653099ff 4533 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4534 && !cl_is_anything(data.start_class))
4535 {
2eccd3b2 4536 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4537
f8fc2ecf 4538 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4539 struct regnode_charclass_class);
4540 StructCopy(data.start_class,
f8fc2ecf 4541 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4542 struct regnode_charclass_class);
f8fc2ecf 4543 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4544 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4545 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4546 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4547 PerlIO_printf(Perl_debug_log,
a0288114 4548 "synthetic stclass \"%s\".\n",
3f7c398e 4549 SvPVX_const(sv));});
653099ff 4550 }
a0d0e21e
LW
4551 }
4552
1de06328
YO
4553 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4554 the "real" pattern. */
cf9788e3
RGS
4555 DEBUG_OPTIMISE_r({
4556 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4557 (IV)minlen, (IV)r->minlen);
cf9788e3 4558 });
de8c5301 4559 r->minlenret = minlen;
1de06328
YO
4560 if (r->minlen < minlen)
4561 r->minlen = minlen;
4562
b81d288d 4563 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4564 r->extflags |= RXf_GPOS_SEEN;
830247a4 4565 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4566 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4567 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4568 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4569 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4570 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4571 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4572 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4573 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4574 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4575 if (RExC_paren_names)
4576 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4577 else
4578 r->paren_names = NULL;
4579
40d049e4
YO
4580 if (RExC_recurse_count) {
4581 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4582 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4583 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4584 }
4585 }
a02a5408
JC
4586 Newxz(r->startp, RExC_npar, I32);
4587 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4588 /* assume we don't need to swap parens around before we match */
4589
be8e71aa
YO
4590 DEBUG_DUMP_r({
4591 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4592 regdump(r);
4593 });
f8fc2ecf
YO
4594 DEBUG_OFFSETS_r(if (ri->offsets) {
4595 const U32 len = ri->offsets[0];
8e9a8a48
YO
4596 U32 i;
4597 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4598 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4599 for (i = 1; i <= len; i++) {
f8fc2ecf 4600 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4601 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4602 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4603 }
4604 PerlIO_printf(Perl_debug_log, "\n");
4605 });
a0d0e21e 4606 return(r);
a687059c
LW
4607}
4608
f9f4320a 4609#undef CORE_ONLY_BLOCK
f9f4320a 4610#undef RE_ENGINE_PTR
3dab1dad 4611
9af228c6 4612#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4613SV*
4614Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4615{
4616 I32 parno = 0; /* no match */
4617 if (PL_curpm) {
4618 const REGEXP * const rx = PM_GETRE(PL_curpm);
4619 if (rx && rx->paren_names) {
4620 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4621 if (he_str) {
4622 IV i;
4623 SV* sv_dat=HeVAL(he_str);
4624 I32 *nums=(I32*)SvPVX(sv_dat);
4625 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4626 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4627 rx->endp[nums[i]] != -1)
4628 {
4629 parno = nums[i];
4630 break;
4631 }
4632 }
4633 }
4634 }
4635 }
4636 if ( !parno ) {
4637 return 0;
4638 } else {
4639 GV *gv_paren;
4640 SV *sv= sv_newmortal();
4641 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4642 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4643 return GvSVn(gv_paren);
4644 }
4645}
9af228c6 4646#endif
0a4db386 4647
894be9b7 4648/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4649 * If flags is REG_RSN_RETURN_NULL returns null.
4650 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4651 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4652 * to the parsed name as looked up in the RExC_paren_names hash.
4653 * If there is an error throws a vFAIL().. type exception.
894be9b7 4654 */
0a4db386
YO
4655
4656#define REG_RSN_RETURN_NULL 0
4657#define REG_RSN_RETURN_NAME 1
4658#define REG_RSN_RETURN_DATA 2
4659
894be9b7
YO
4660STATIC SV*
4661S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4662 char *name_start = RExC_parse;
0a4db386 4663 if ( UTF ) {
894be9b7 4664 STRLEN numlen;
0a4db386
YO
4665 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4666 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4667 {
4668 RExC_parse += numlen;
4669 }
4670 } else {
4671 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4672 RExC_parse++;
4673 }
0a4db386
YO
4674 if ( flags ) {
4675 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4676 (int)(RExC_parse - name_start)));
894be9b7 4677 if (UTF)
0a4db386
YO
4678 SvUTF8_on(sv_name);
4679 if ( flags == REG_RSN_RETURN_NAME)
4680 return sv_name;
4681 else if (flags==REG_RSN_RETURN_DATA) {
4682 HE *he_str = NULL;
4683 SV *sv_dat = NULL;
4684 if ( ! sv_name ) /* should not happen*/
4685 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4686 if (RExC_paren_names)
4687 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4688 if ( he_str )
4689 sv_dat = HeVAL(he_str);
4690 if ( ! sv_dat )
4691 vFAIL("Reference to nonexistent named group");
4692 return sv_dat;
4693 }
4694 else {
4695 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4696 }
4697 /* NOT REACHED */
894be9b7 4698 }
0a4db386 4699 return NULL;
894be9b7
YO
4700}
4701
3dab1dad
YO
4702#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4703 int rem=(int)(RExC_end - RExC_parse); \
4704 int cut; \
4705 int num; \
4706 int iscut=0; \
4707 if (rem>10) { \
4708 rem=10; \
4709 iscut=1; \
4710 } \
4711 cut=10-rem; \
4712 if (RExC_lastparse!=RExC_parse) \
4713 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4714 rem, RExC_parse, \
4715 cut + 4, \
4716 iscut ? "..." : "<" \
4717 ); \
4718 else \
4719 PerlIO_printf(Perl_debug_log,"%16s",""); \
4720 \
4721 if (SIZE_ONLY) \
4722 num=RExC_size; \
4723 else \
4724 num=REG_NODE_NUM(RExC_emit); \
4725 if (RExC_lastnum!=num) \
0a4db386 4726 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4727 else \
0a4db386 4728 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4729 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4730 (int)((depth*2)), "", \
3dab1dad
YO
4731 (funcname) \
4732 ); \
4733 RExC_lastnum=num; \
4734 RExC_lastparse=RExC_parse; \
4735})
4736
07be1b83
YO
4737
4738
3dab1dad
YO
4739#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4740 DEBUG_PARSE_MSG((funcname)); \
4741 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4742})
6bda09f9
YO
4743#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4744 DEBUG_PARSE_MSG((funcname)); \
4745 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4746})
a687059c
LW
4747/*
4748 - reg - regular expression, i.e. main body or parenthesized thing
4749 *
4750 * Caller must absorb opening parenthesis.
4751 *
4752 * Combining parenthesis handling with the base level of regular expression
4753 * is a trifle forced, but the need to tie the tails of the branches to what
4754 * follows makes it hard to avoid.
4755 */
07be1b83
YO
4756#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4757#ifdef DEBUGGING
4758#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4759#else
4760#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4761#endif
3dab1dad 4762
e2e6a0f1
YO
4763/* this idea is borrowed from STR_WITH_LEN in handy.h */
4764#define CHECK_WORD(s,v,l) \
4765 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4766
76e3520e 4767STATIC regnode *
3dab1dad 4768S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4769 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4770{
27da23d5 4771 dVAR;
c277df42
IZ
4772 register regnode *ret; /* Will be the head of the group. */
4773 register regnode *br;
4774 register regnode *lastbr;
cbbf8932 4775 register regnode *ender = NULL;
a0d0e21e 4776 register I32 parno = 0;
cbbf8932
AL
4777 I32 flags;
4778 const I32 oregflags = RExC_flags;
6136c704
AL
4779 bool have_branch = 0;
4780 bool is_open = 0;
9d1d55b5
JP
4781
4782 /* for (?g), (?gc), and (?o) warnings; warning
4783 about (?c) will warn about (?g) -- japhy */
4784
6136c704
AL
4785#define WASTED_O 0x01
4786#define WASTED_G 0x02
4787#define WASTED_C 0x04
4788#define WASTED_GC (0x02|0x04)
cbbf8932 4789 I32 wastedflags = 0x00;
9d1d55b5 4790
fac92740 4791 char * parse_start = RExC_parse; /* MJD */
a28509cc 4792 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4793
3dab1dad
YO
4794 GET_RE_DEBUG_FLAGS_DECL;
4795 DEBUG_PARSE("reg ");
4796
4797
821b33a5 4798 *flagp = 0; /* Tentatively. */
a0d0e21e 4799
9d1d55b5 4800
a0d0e21e
LW
4801 /* Make an OPEN node, if parenthesized. */
4802 if (paren) {
e2e6a0f1
YO
4803 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4804 char *start_verb = RExC_parse;
4805 STRLEN verb_len = 0;
4806 char *start_arg = NULL;
4807 unsigned char op = 0;
4808 int argok = 1;
4809 int internal_argval = 0; /* internal_argval is only useful if !argok */
4810 while ( *RExC_parse && *RExC_parse != ')' ) {
4811 if ( *RExC_parse == ':' ) {
4812 start_arg = RExC_parse + 1;
4813 break;
4814 }
4815 RExC_parse++;
4816 }
4817 ++start_verb;
4818 verb_len = RExC_parse - start_verb;
4819 if ( start_arg ) {
4820 RExC_parse++;
4821 while ( *RExC_parse && *RExC_parse != ')' )
4822 RExC_parse++;
4823 if ( *RExC_parse != ')' )
4824 vFAIL("Unterminated verb pattern argument");
4825 if ( RExC_parse == start_arg )
4826 start_arg = NULL;
4827 } else {
4828 if ( *RExC_parse != ')' )
4829 vFAIL("Unterminated verb pattern");
4830 }
5d458dd8 4831
e2e6a0f1
YO
4832 switch ( *start_verb ) {
4833 case 'A': /* (*ACCEPT) */
4834 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4835 op = ACCEPT;
4836 internal_argval = RExC_nestroot;
4837 }
4838 break;
4839 case 'C': /* (*COMMIT) */
4840 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4841 op = COMMIT;
e2e6a0f1
YO
4842 break;
4843 case 'F': /* (*FAIL) */
4844 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4845 op = OPFAIL;
4846 argok = 0;
4847 }
4848 break;
5d458dd8
YO
4849 case ':': /* (*:NAME) */
4850 case 'M': /* (*MARK:NAME) */
4851 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4852 op = MARKPOINT;
5d458dd8
YO
4853 argok = -1;
4854 }
4855 break;
4856 case 'P': /* (*PRUNE) */
4857 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4858 op = PRUNE;
e2e6a0f1 4859 break;
5d458dd8
YO
4860 case 'S': /* (*SKIP) */
4861 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4862 op = SKIP;
4863 break;
4864 case 'T': /* (*THEN) */
4865 /* [19:06] <TimToady> :: is then */
4866 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4867 op = CUTGROUP;
4868 RExC_seen |= REG_SEEN_CUTGROUP;
4869 }
e2e6a0f1
YO
4870 break;
4871 }
4872 if ( ! op ) {
4873 RExC_parse++;
4874 vFAIL3("Unknown verb pattern '%.*s'",
4875 verb_len, start_verb);
4876 }
4877 if ( argok ) {
4878 if ( start_arg && internal_argval ) {
4879 vFAIL3("Verb pattern '%.*s' may not have an argument",
4880 verb_len, start_verb);
4881 } else if ( argok < 0 && !start_arg ) {
4882 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4883 verb_len, start_verb);
4884 } else {
4885 ret = reganode(pRExC_state, op, internal_argval);
4886 if ( ! internal_argval && ! SIZE_ONLY ) {
4887 if (start_arg) {
4888 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4889 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4890 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4891 ret->flags = 0;
4892 } else {
4893 ret->flags = 1;
4894 }
4895 }
4896 }
4897 if (!internal_argval)
4898 RExC_seen |= REG_SEEN_VERBARG;
4899 } else if ( start_arg ) {
4900 vFAIL3("Verb pattern '%.*s' may not have an argument",
4901 verb_len, start_verb);
4902 } else {
4903 ret = reg_node(pRExC_state, op);
4904 }
4905 nextchar(pRExC_state);
4906 return ret;
4907 } else
fac92740 4908 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4909 U32 posflags = 0, negflags = 0;
4910 U32 *flagsp = &posflags;
6136c704 4911 bool is_logical = 0;
a28509cc 4912 const char * const seqstart = RExC_parse;
ca9dfc88 4913
830247a4
IZ
4914 RExC_parse++;
4915 paren = *RExC_parse++;
c277df42 4916 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4917 switch (paren) {
894be9b7 4918
fac92740 4919 case '<': /* (?<...) */
b81d288d 4920 if (*RExC_parse == '!')
c277df42 4921 paren = ',';
0a4db386
YO
4922 else if (*RExC_parse != '=')
4923 { /* (?<...>) */
81714fb9 4924 char *name_start;
894be9b7 4925 SV *svname;
81714fb9
YO
4926 paren= '>';
4927 case '\'': /* (?'...') */
4928 name_start= RExC_parse;
0a4db386
YO
4929 svname = reg_scan_name(pRExC_state,
4930 SIZE_ONLY ? /* reverse test from the others */
4931 REG_RSN_RETURN_NAME :
4932 REG_RSN_RETURN_NULL);
81714fb9
YO
4933 if (RExC_parse == name_start)
4934 goto unknown;
4935 if (*RExC_parse != paren)
4936 vFAIL2("Sequence (?%c... not terminated",
4937 paren=='>' ? '<' : paren);
4938 if (SIZE_ONLY) {
e62cc96a
YO
4939 HE *he_str;
4940 SV *sv_dat = NULL;
894be9b7
YO
4941 if (!svname) /* shouldnt happen */
4942 Perl_croak(aTHX_
4943 "panic: reg_scan_name returned NULL");
81714fb9
YO
4944 if (!RExC_paren_names) {
4945 RExC_paren_names= newHV();
4946 sv_2mortal((SV*)RExC_paren_names);
4947 }
4948 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4949 if ( he_str )
81714fb9 4950 sv_dat = HeVAL(he_str);
e62cc96a 4951 if ( ! sv_dat ) {
81714fb9 4952 /* croak baby croak */
e62cc96a
YO
4953 Perl_croak(aTHX_
4954 "panic: paren_name hash element allocation failed");
4955 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4956 IV count=SvIV(sv_dat);
4957 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4958 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4959 pv[count]=RExC_npar;
4960 SvIVX(sv_dat)++;
4961 } else {
4962 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4963 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4964 SvIOK_on(sv_dat);
4965 SvIVX(sv_dat)= 1;
e62cc96a
YO
4966 }
4967
81714fb9
YO
4968 /*sv_dump(sv_dat);*/
4969 }
4970 nextchar(pRExC_state);
4971 paren = 1;
4972 goto capturing_parens;
4973 }
4974 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4975 RExC_parse++;
fac92740
MJD
4976 case '=': /* (?=...) */
4977 case '!': /* (?!...) */
830247a4 4978 RExC_seen_zerolen++;
e2e6a0f1
YO
4979 if (*RExC_parse == ')') {
4980 ret=reg_node(pRExC_state, OPFAIL);
4981 nextchar(pRExC_state);
4982 return ret;
4983 }
fac92740
MJD
4984 case ':': /* (?:...) */
4985 case '>': /* (?>...) */
a0d0e21e 4986 break;
fac92740
MJD
4987 case '$': /* (?$...) */
4988 case '@': /* (?@...) */
8615cb43 4989 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4990 break;
fac92740 4991 case '#': /* (?#...) */
830247a4
IZ
4992 while (*RExC_parse && *RExC_parse != ')')
4993 RExC_parse++;
4994 if (*RExC_parse != ')')
c277df42 4995 FAIL("Sequence (?#... not terminated");
830247a4 4996 nextchar(pRExC_state);
a0d0e21e
LW
4997 *flagp = TRYAGAIN;
4998 return NULL;
894be9b7
YO
4999 case '0' : /* (?0) */
5000 case 'R' : /* (?R) */
5001 if (*RExC_parse != ')')
6bda09f9 5002 FAIL("Sequence (?R) not terminated");
1a147d38 5003 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5004 nextchar(pRExC_state);
5005 return ret;
5006 /*notreached*/
894be9b7
YO
5007 { /* named and numeric backreferences */
5008 I32 num;
5009 char * parse_start;
5010 case '&': /* (?&NAME) */
5011 parse_start = RExC_parse - 1;
5012 {
0a4db386
YO
5013 SV *sv_dat = reg_scan_name(pRExC_state,
5014 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5015 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5016 }
5017 goto gen_recurse_regop;
5018 /* NOT REACHED */
542fa716
YO
5019 case '+':
5020 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5021 RExC_parse++;
5022 vFAIL("Illegal pattern");
5023 }
5024 goto parse_recursion;
5025 /* NOT REACHED*/
5026 case '-': /* (?-1) */
5027 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5028 RExC_parse--; /* rewind to let it be handled later */
5029 goto parse_flags;
5030 }
5031 /*FALLTHROUGH */
6bda09f9
YO
5032 case '1': case '2': case '3': case '4': /* (?1) */
5033 case '5': case '6': case '7': case '8': case '9':
5034 RExC_parse--;
542fa716 5035 parse_recursion:
894be9b7
YO
5036 num = atoi(RExC_parse);
5037 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5038 if (*RExC_parse == '-')
5039 RExC_parse++;
6bda09f9
YO
5040 while (isDIGIT(*RExC_parse))
5041 RExC_parse++;
5042 if (*RExC_parse!=')')
5043 vFAIL("Expecting close bracket");
894be9b7
YO
5044
5045 gen_recurse_regop:
542fa716
YO
5046 if ( paren == '-' ) {
5047 /*
5048 Diagram of capture buffer numbering.
5049 Top line is the normal capture buffer numbers
5050 Botton line is the negative indexing as from
5051 the X (the (?-2))
5052
5053 + 1 2 3 4 5 X 6 7
5054 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5055 - 5 4 3 2 1 X x x
5056
5057 */
5058 num = RExC_npar + num;
5059 if (num < 1) {
5060 RExC_parse++;
5061 vFAIL("Reference to nonexistent group");
5062 }
5063 } else if ( paren == '+' ) {
5064 num = RExC_npar + num - 1;
5065 }
5066
1a147d38 5067 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5068 if (!SIZE_ONLY) {
5069 if (num > (I32)RExC_rx->nparens) {
5070 RExC_parse++;
5071 vFAIL("Reference to nonexistent group");
5072 }
40d049e4 5073 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5074 RExC_emit++;
226de585 5075 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5076 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5077 } else {
6bda09f9 5078 RExC_size++;
6bda09f9 5079 }
0a4db386 5080 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5081 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5082 Set_Node_Offset(ret, parse_start); /* MJD */
5083
6bda09f9
YO
5084 nextchar(pRExC_state);
5085 return ret;
894be9b7
YO
5086 } /* named and numeric backreferences */
5087 /* NOT REACHED */
5088
fac92740 5089 case 'p': /* (?p...) */
9014280d 5090 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5091 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5092 /* FALL THROUGH*/
fac92740 5093 case '?': /* (??...) */
6136c704 5094 is_logical = 1;
438a3801
YST
5095 if (*RExC_parse != '{')
5096 goto unknown;
830247a4 5097 paren = *RExC_parse++;
0f5d15d6 5098 /* FALL THROUGH */
fac92740 5099 case '{': /* (?{...}) */
c277df42 5100 {
2eccd3b2
NC
5101 I32 count = 1;
5102 U32 n = 0;
c277df42 5103 char c;
830247a4 5104 char *s = RExC_parse;
c277df42 5105
830247a4
IZ
5106 RExC_seen_zerolen++;
5107 RExC_seen |= REG_SEEN_EVAL;
5108 while (count && (c = *RExC_parse)) {
6136c704
AL
5109 if (c == '\\') {
5110 if (RExC_parse[1])
5111 RExC_parse++;
5112 }
b81d288d 5113 else if (c == '{')
c277df42 5114 count++;
b81d288d 5115 else if (c == '}')
c277df42 5116 count--;
830247a4 5117 RExC_parse++;
c277df42 5118 }
6136c704 5119 if (*RExC_parse != ')') {
b81d288d 5120 RExC_parse = s;
b45f050a
JF
5121 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5122 }
c277df42 5123 if (!SIZE_ONLY) {
f3548bdc 5124 PAD *pad;
6136c704
AL
5125 OP_4tree *sop, *rop;
5126 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5127
569233ed
SB
5128 ENTER;
5129 Perl_save_re_context(aTHX);
f3548bdc 5130 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5131 sop->op_private |= OPpREFCOUNTED;
5132 /* re_dup will OpREFCNT_inc */
5133 OpREFCNT_set(sop, 1);
569233ed 5134 LEAVE;
c277df42 5135
830247a4 5136 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5137 RExC_rxi->data->data[n] = (void*)rop;
5138 RExC_rxi->data->data[n+1] = (void*)sop;
5139 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5140 SvREFCNT_dec(sv);
a0ed51b3 5141 }
e24b16f9 5142 else { /* First pass */
830247a4 5143 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5144 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5145 /* No compiled RE interpolated, has runtime
5146 components ===> unsafe. */
5147 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5148 if (PL_tainting && PL_tainted)
cc6b7395 5149 FAIL("Eval-group in insecure regular expression");
54df2634 5150#if PERL_VERSION > 8
923e4eb5 5151 if (IN_PERL_COMPILETIME)
b5c19bd7 5152 PL_cv_has_eval = 1;
54df2634 5153#endif
c277df42 5154 }
b5c19bd7 5155
830247a4 5156 nextchar(pRExC_state);
6136c704 5157 if (is_logical) {
830247a4 5158 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5159 if (!SIZE_ONLY)
5160 ret->flags = 2;
3dab1dad 5161 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5162 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5163 return ret;
5164 }
ccb2c380
MP
5165 ret = reganode(pRExC_state, EVAL, n);
5166 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5167 Set_Node_Offset(ret, parse_start);
5168 return ret;
c277df42 5169 }
fac92740 5170 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5171 {
0a4db386 5172 int is_define= 0;
fac92740 5173 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5174 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5175 || RExC_parse[1] == '<'
830247a4 5176 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5177 I32 flag;
5178
830247a4 5179 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5180 if (!SIZE_ONLY)
5181 ret->flags = 1;
3dab1dad 5182 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5183 goto insert_if;
b81d288d 5184 }
a0ed51b3 5185 }
0a4db386
YO
5186 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5187 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5188 {
5189 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5190 char *name_start= RExC_parse++;
2eccd3b2 5191 U32 num = 0;
0a4db386
YO
5192 SV *sv_dat=reg_scan_name(pRExC_state,
5193 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5194 if (RExC_parse == name_start || *RExC_parse != ch)
5195 vFAIL2("Sequence (?(%c... not terminated",
5196 (ch == '>' ? '<' : ch));
5197 RExC_parse++;
5198 if (!SIZE_ONLY) {
5199 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5200 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5201 SvREFCNT_inc(sv_dat);
5202 }
5203 ret = reganode(pRExC_state,NGROUPP,num);
5204 goto insert_if_check_paren;
5205 }
5206 else if (RExC_parse[0] == 'D' &&
5207 RExC_parse[1] == 'E' &&
5208 RExC_parse[2] == 'F' &&
5209 RExC_parse[3] == 'I' &&
5210 RExC_parse[4] == 'N' &&
5211 RExC_parse[5] == 'E')
5212 {
5213 ret = reganode(pRExC_state,DEFINEP,0);
5214 RExC_parse +=6 ;
5215 is_define = 1;
5216 goto insert_if_check_paren;
5217 }
5218 else if (RExC_parse[0] == 'R') {
5219 RExC_parse++;
5220 parno = 0;
5221 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5222 parno = atoi(RExC_parse++);
5223 while (isDIGIT(*RExC_parse))
5224 RExC_parse++;
5225 } else if (RExC_parse[0] == '&') {
5226 SV *sv_dat;
5227 RExC_parse++;
5228 sv_dat = reg_scan_name(pRExC_state,
5229 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5230 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5231 }
1a147d38 5232 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5233 goto insert_if_check_paren;
5234 }
830247a4 5235 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5236 /* (?(1)...) */
6136c704 5237 char c;
830247a4 5238 parno = atoi(RExC_parse++);
c277df42 5239
830247a4
IZ
5240 while (isDIGIT(*RExC_parse))
5241 RExC_parse++;
fac92740 5242 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5243
0a4db386 5244 insert_if_check_paren:
830247a4 5245 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5246 vFAIL("Switch condition not recognized");
c277df42 5247 insert_if:
3dab1dad
YO
5248 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5249 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5250 if (br == NULL)
830247a4 5251 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5252 else
3dab1dad 5253 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5254 c = *nextchar(pRExC_state);
d1b80229
IZ
5255 if (flags&HASWIDTH)
5256 *flagp |= HASWIDTH;
c277df42 5257 if (c == '|') {
0a4db386
YO
5258 if (is_define)
5259 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5260 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5261 regbranch(pRExC_state, &flags, 1,depth+1);
5262 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5263 if (flags&HASWIDTH)
5264 *flagp |= HASWIDTH;
830247a4 5265 c = *nextchar(pRExC_state);
a0ed51b3
LW
5266 }
5267 else
c277df42
IZ
5268 lastbr = NULL;
5269 if (c != ')')
8615cb43 5270 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5271 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5272 REGTAIL(pRExC_state, br, ender);
c277df42 5273 if (lastbr) {
3dab1dad
YO
5274 REGTAIL(pRExC_state, lastbr, ender);
5275 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5276 }
5277 else
3dab1dad 5278 REGTAIL(pRExC_state, ret, ender);
c277df42 5279 return ret;
a0ed51b3
LW
5280 }
5281 else {
830247a4 5282 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5283 }
5284 }
1b1626e4 5285 case 0:
830247a4 5286 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5287 vFAIL("Sequence (? incomplete");
1b1626e4 5288 break;
a0d0e21e 5289 default:
830247a4 5290 --RExC_parse;
fac92740 5291 parse_flags: /* (?i) */
830247a4 5292 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5293 /* (?g), (?gc) and (?o) are useless here
5294 and must be globally applied -- japhy */
5295
5296 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5297 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5298 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5299 if (! (wastedflags & wflagbit) ) {
5300 wastedflags |= wflagbit;
5301 vWARN5(
5302 RExC_parse + 1,
5303 "Useless (%s%c) - %suse /%c modifier",
5304 flagsp == &negflags ? "?-" : "?",
5305 *RExC_parse,
5306 flagsp == &negflags ? "don't " : "",
5307 *RExC_parse
5308 );
5309 }
5310 }
5311 }
5312 else if (*RExC_parse == 'c') {
5313 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5314 if (! (wastedflags & WASTED_C) ) {
5315 wastedflags |= WASTED_GC;
9d1d55b5
JP
5316 vWARN3(
5317 RExC_parse + 1,
5318 "Useless (%sc) - %suse /gc modifier",
5319 flagsp == &negflags ? "?-" : "?",
5320 flagsp == &negflags ? "don't " : ""
5321 );
5322 }
5323 }
5324 }
5325 else { pmflag(flagsp, *RExC_parse); }
5326
830247a4 5327 ++RExC_parse;
ca9dfc88 5328 }
830247a4 5329 if (*RExC_parse == '-') {
ca9dfc88 5330 flagsp = &negflags;
9d1d55b5 5331 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5332 ++RExC_parse;
ca9dfc88 5333 goto parse_flags;
48c036b1 5334 }
e2509266
JH
5335 RExC_flags |= posflags;
5336 RExC_flags &= ~negflags;
830247a4
IZ
5337 if (*RExC_parse == ':') {
5338 RExC_parse++;
ca9dfc88
IZ
5339 paren = ':';
5340 break;
5341 }
c277df42 5342 unknown:
830247a4
IZ
5343 if (*RExC_parse != ')') {
5344 RExC_parse++;
5345 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5346 }
830247a4 5347 nextchar(pRExC_state);
a0d0e21e
LW
5348 *flagp = TRYAGAIN;
5349 return NULL;
5350 }
5351 }
fac92740 5352 else { /* (...) */
81714fb9 5353 capturing_parens:
830247a4
IZ
5354 parno = RExC_npar;
5355 RExC_npar++;
e2e6a0f1 5356
830247a4 5357 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5358 if (!SIZE_ONLY ){
5359 if (!RExC_nestroot)
5360 RExC_nestroot = parno;
5361 if (RExC_seen & REG_SEEN_RECURSE) {
5362 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5363 "Setting open paren #%"IVdf" to %d\n",
5364 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5365 RExC_open_parens[parno-1]= ret;
5366 }
6bda09f9 5367 }
fac92740
MJD
5368 Set_Node_Length(ret, 1); /* MJD */
5369 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5370 is_open = 1;
a0d0e21e 5371 }
a0ed51b3 5372 }
fac92740 5373 else /* ! paren */
a0d0e21e
LW
5374 ret = NULL;
5375
5376 /* Pick up the branches, linking them together. */
fac92740 5377 parse_start = RExC_parse; /* MJD */
3dab1dad 5378 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5379 /* branch_len = (paren != 0); */
2af232bd 5380
a0d0e21e
LW
5381 if (br == NULL)
5382 return(NULL);
830247a4
IZ
5383 if (*RExC_parse == '|') {
5384 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5385 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5386 }
fac92740 5387 else { /* MJD */
6bda09f9 5388 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5389 Set_Node_Length(br, paren != 0);
5390 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5391 }
c277df42
IZ
5392 have_branch = 1;
5393 if (SIZE_ONLY)
830247a4 5394 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5395 }
5396 else if (paren == ':') {
c277df42
IZ
5397 *flagp |= flags&SIMPLE;
5398 }
6136c704 5399 if (is_open) { /* Starts with OPEN. */
3dab1dad 5400 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5401 }
5402 else if (paren != '?') /* Not Conditional */
a0d0e21e 5403 ret = br;
32a0ca98 5404 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5405 lastbr = br;
830247a4
IZ
5406 while (*RExC_parse == '|') {
5407 if (!SIZE_ONLY && RExC_extralen) {
5408 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5409 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5410 }
5411 if (SIZE_ONLY)
830247a4
IZ
5412 RExC_extralen += 2; /* Account for LONGJMP. */
5413 nextchar(pRExC_state);
3dab1dad 5414 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5415
a687059c 5416 if (br == NULL)
a0d0e21e 5417 return(NULL);
3dab1dad 5418 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5419 lastbr = br;
821b33a5
IZ
5420 if (flags&HASWIDTH)
5421 *flagp |= HASWIDTH;
a687059c 5422 *flagp |= flags&SPSTART;
a0d0e21e
LW
5423 }
5424
c277df42
IZ
5425 if (have_branch || paren != ':') {
5426 /* Make a closing node, and hook it on the end. */
5427 switch (paren) {
5428 case ':':
830247a4 5429 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5430 break;
5431 case 1:
830247a4 5432 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5433 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5434 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5435 "Setting close paren #%"IVdf" to %d\n",
5436 (IV)parno, REG_NODE_NUM(ender)));
5437 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5438 if (RExC_nestroot == parno)
5439 RExC_nestroot = 0;
40d049e4 5440 }
fac92740
MJD
5441 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5442 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5443 break;
5444 case '<':
c277df42
IZ
5445 case ',':
5446 case '=':
5447 case '!':
c277df42 5448 *flagp &= ~HASWIDTH;
821b33a5
IZ
5449 /* FALL THROUGH */
5450 case '>':
830247a4 5451 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5452 break;
5453 case 0:
830247a4 5454 ender = reg_node(pRExC_state, END);
40d049e4
YO
5455 if (!SIZE_ONLY) {
5456 assert(!RExC_opend); /* there can only be one! */
5457 RExC_opend = ender;
5458 }
c277df42
IZ
5459 break;
5460 }
eaf3ca90 5461 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5462
9674d46a 5463 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5464 if (depth==1)
5465 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5466
c277df42 5467 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5468 for (br = ret; br; br = regnext(br)) {
5469 const U8 op = PL_regkind[OP(br)];
5470 if (op == BRANCH) {
07be1b83 5471 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5472 }
5473 else if (op == BRANCHJ) {
07be1b83 5474 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5475 }
c277df42
IZ
5476 }
5477 }
a0d0e21e 5478 }
c277df42
IZ
5479
5480 {
e1ec3a88
AL
5481 const char *p;
5482 static const char parens[] = "=!<,>";
c277df42
IZ
5483
5484 if (paren && (p = strchr(parens, paren))) {
eb160463 5485 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5486 int flag = (p - parens) > 1;
5487
5488 if (paren == '>')
5489 node = SUSPEND, flag = 0;
6bda09f9 5490 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5491 Set_Node_Cur_Length(ret);
5492 Set_Node_Offset(ret, parse_start + 1);
c277df42 5493 ret->flags = flag;
07be1b83 5494 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5495 }
a0d0e21e
LW
5496 }
5497
5498 /* Check for proper termination. */
ce3e6498 5499 if (paren) {
e2509266 5500 RExC_flags = oregflags;
830247a4
IZ
5501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5502 RExC_parse = oregcomp_parse;
380a0633 5503 vFAIL("Unmatched (");
ce3e6498 5504 }
a0ed51b3 5505 }
830247a4
IZ
5506 else if (!paren && RExC_parse < RExC_end) {
5507 if (*RExC_parse == ')') {
5508 RExC_parse++;
380a0633 5509 vFAIL("Unmatched )");
a0ed51b3
LW
5510 }
5511 else
b45f050a 5512 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5513 /* NOTREACHED */
5514 }
a687059c 5515
a0d0e21e 5516 return(ret);
a687059c
LW
5517}
5518
5519/*
5520 - regbranch - one alternative of an | operator
5521 *
5522 * Implements the concatenation operator.
5523 */
76e3520e 5524STATIC regnode *
3dab1dad 5525S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5526{
97aff369 5527 dVAR;
c277df42
IZ
5528 register regnode *ret;
5529 register regnode *chain = NULL;
5530 register regnode *latest;
5531 I32 flags = 0, c = 0;
3dab1dad
YO
5532 GET_RE_DEBUG_FLAGS_DECL;
5533 DEBUG_PARSE("brnc");
b81d288d 5534 if (first)
c277df42
IZ
5535 ret = NULL;
5536 else {
b81d288d 5537 if (!SIZE_ONLY && RExC_extralen)
830247a4 5538 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5539 else {
830247a4 5540 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5541 Set_Node_Length(ret, 1);
5542 }
c277df42
IZ
5543 }
5544
b81d288d 5545 if (!first && SIZE_ONLY)
830247a4 5546 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5547
c277df42 5548 *flagp = WORST; /* Tentatively. */
a0d0e21e 5549
830247a4
IZ
5550 RExC_parse--;
5551 nextchar(pRExC_state);
5552 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5553 flags &= ~TRYAGAIN;
3dab1dad 5554 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5555 if (latest == NULL) {
5556 if (flags & TRYAGAIN)
5557 continue;
5558 return(NULL);
a0ed51b3
LW
5559 }
5560 else if (ret == NULL)
c277df42 5561 ret = latest;
a0d0e21e 5562 *flagp |= flags&HASWIDTH;
c277df42 5563 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5564 *flagp |= flags&SPSTART;
5565 else {
830247a4 5566 RExC_naughty++;
3dab1dad 5567 REGTAIL(pRExC_state, chain, latest);
a687059c 5568 }
a0d0e21e 5569 chain = latest;
c277df42
IZ
5570 c++;
5571 }
5572 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5573 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5574 if (ret == NULL)
5575 ret = chain;
5576 }
5577 if (c == 1) {
5578 *flagp |= flags&SIMPLE;
a0d0e21e 5579 }
a687059c 5580
d4c19fe8 5581 return ret;
a687059c
LW
5582}
5583
5584/*
5585 - regpiece - something followed by possible [*+?]
5586 *
5587 * Note that the branching code sequences used for ? and the general cases
5588 * of * and + are somewhat optimized: they use the same NOTHING node as
5589 * both the endmarker for their branch list and the body of the last branch.
5590 * It might seem that this node could be dispensed with entirely, but the
5591 * endmarker role is not redundant.
5592 */
76e3520e 5593STATIC regnode *
3dab1dad 5594S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5595{
97aff369 5596 dVAR;
c277df42 5597 register regnode *ret;
a0d0e21e
LW
5598 register char op;
5599 register char *next;
5600 I32 flags;
1df70142 5601 const char * const origparse = RExC_parse;
a0d0e21e 5602 I32 min;
c277df42 5603 I32 max = REG_INFTY;
fac92740 5604 char *parse_start;
10edeb5d 5605 const char *maxpos = NULL;
3dab1dad
YO
5606 GET_RE_DEBUG_FLAGS_DECL;
5607 DEBUG_PARSE("piec");
a0d0e21e 5608
3dab1dad 5609 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5610 if (ret == NULL) {
5611 if (flags & TRYAGAIN)
5612 *flagp |= TRYAGAIN;
5613 return(NULL);
5614 }
5615
830247a4 5616 op = *RExC_parse;
a0d0e21e 5617
830247a4 5618 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5619 maxpos = NULL;
fac92740 5620 parse_start = RExC_parse; /* MJD */
830247a4 5621 next = RExC_parse + 1;
a0d0e21e
LW
5622 while (isDIGIT(*next) || *next == ',') {
5623 if (*next == ',') {
5624 if (maxpos)
5625 break;
5626 else
5627 maxpos = next;
a687059c 5628 }
a0d0e21e
LW
5629 next++;
5630 }
5631 if (*next == '}') { /* got one */
5632 if (!maxpos)
5633 maxpos = next;
830247a4
IZ
5634 RExC_parse++;
5635 min = atoi(RExC_parse);
a0d0e21e
LW
5636 if (*maxpos == ',')
5637 maxpos++;
5638 else
830247a4 5639 maxpos = RExC_parse;
a0d0e21e
LW
5640 max = atoi(maxpos);
5641 if (!max && *maxpos != '0')
c277df42
IZ
5642 max = REG_INFTY; /* meaning "infinity" */
5643 else if (max >= REG_INFTY)
8615cb43 5644 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5645 RExC_parse = next;
5646 nextchar(pRExC_state);
a0d0e21e
LW
5647
5648 do_curly:
5649 if ((flags&SIMPLE)) {
830247a4 5650 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5651 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5652 Set_Node_Offset(ret, parse_start+1); /* MJD */
5653 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5654 }
5655 else {
3dab1dad 5656 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5657
5658 w->flags = 0;
3dab1dad 5659 REGTAIL(pRExC_state, ret, w);
830247a4 5660 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5661 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5662 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5663 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5664 }
6bda09f9 5665 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5666 /* MJD hk */
5667 Set_Node_Offset(ret, parse_start+1);
2af232bd 5668 Set_Node_Length(ret,
fac92740 5669 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5670
830247a4 5671 if (!SIZE_ONLY && RExC_extralen)
c277df42 5672 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5673 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5674 if (SIZE_ONLY)
830247a4
IZ
5675 RExC_whilem_seen++, RExC_extralen += 3;
5676 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5677 }
c277df42 5678 ret->flags = 0;
a0d0e21e
LW
5679
5680 if (min > 0)
821b33a5
IZ
5681 *flagp = WORST;
5682 if (max > 0)
5683 *flagp |= HASWIDTH;
a0d0e21e 5684 if (max && max < min)
8615cb43 5685 vFAIL("Can't do {n,m} with n > m");
c277df42 5686 if (!SIZE_ONLY) {
eb160463
GS
5687 ARG1_SET(ret, (U16)min);
5688 ARG2_SET(ret, (U16)max);
a687059c 5689 }
a687059c 5690
a0d0e21e 5691 goto nest_check;
a687059c 5692 }
a0d0e21e 5693 }
a687059c 5694
a0d0e21e
LW
5695 if (!ISMULT1(op)) {
5696 *flagp = flags;
a687059c 5697 return(ret);
a0d0e21e 5698 }
bb20fd44 5699
c277df42 5700#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5701
5702 /* if this is reinstated, don't forget to put this back into perldiag:
5703
5704 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5705
5706 (F) The part of the regexp subject to either the * or + quantifier
5707 could match an empty string. The {#} shows in the regular
5708 expression about where the problem was discovered.
5709
5710 */
5711
bb20fd44 5712 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5713 vFAIL("Regexp *+ operand could be empty");
b81d288d 5714#endif
bb20fd44 5715
fac92740 5716 parse_start = RExC_parse;
830247a4 5717 nextchar(pRExC_state);
a0d0e21e 5718
821b33a5 5719 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5720
5721 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5722 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5723 ret->flags = 0;
830247a4 5724 RExC_naughty += 4;
a0d0e21e
LW
5725 }
5726 else if (op == '*') {
5727 min = 0;
5728 goto do_curly;
a0ed51b3
LW
5729 }
5730 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5731 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5732 ret->flags = 0;
830247a4 5733 RExC_naughty += 3;
a0d0e21e
LW
5734 }
5735 else if (op == '+') {
5736 min = 1;
5737 goto do_curly;
a0ed51b3
LW
5738 }
5739 else if (op == '?') {
a0d0e21e
LW
5740 min = 0; max = 1;
5741 goto do_curly;
5742 }
5743 nest_check:
041457d9 5744 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5745 vWARN3(RExC_parse,
b45f050a 5746 "%.*s matches null string many times",
afd78fd5 5747 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5748 origparse);
a0d0e21e
LW
5749 }
5750
b9b4dddf 5751 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5752 nextchar(pRExC_state);
6bda09f9 5753 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5754 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5755 }
b9b4dddf
YO
5756#ifndef REG_ALLOW_MINMOD_SUSPEND
5757 else
5758#endif
5759 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5760 regnode *ender;
5761 nextchar(pRExC_state);
5762 ender = reg_node(pRExC_state, SUCCEED);
5763 REGTAIL(pRExC_state, ret, ender);
5764 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5765 ret->flags = 0;
5766 ender = reg_node(pRExC_state, TAIL);
5767 REGTAIL(pRExC_state, ret, ender);
5768 /*ret= ender;*/
5769 }
5770
5771 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5772 RExC_parse++;
b45f050a
JF
5773 vFAIL("Nested quantifiers");
5774 }
a0d0e21e
LW
5775
5776 return(ret);
a687059c
LW
5777}
5778
fc8cd66c
YO
5779
5780/* reg_namedseq(pRExC_state,UVp)
5781
5782 This is expected to be called by a parser routine that has
5783 recognized'\N' and needs to handle the rest. RExC_parse is
5784 expected to point at the first char following the N at the time
5785 of the call.
5786
5787 If valuep is non-null then it is assumed that we are parsing inside
5788 of a charclass definition and the first codepoint in the resolved
5789 string is returned via *valuep and the routine will return NULL.
5790 In this mode if a multichar string is returned from the charnames
5791 handler a warning will be issued, and only the first char in the
5792 sequence will be examined. If the string returned is zero length
5793 then the value of *valuep is undefined and NON-NULL will
5794 be returned to indicate failure. (This will NOT be a valid pointer
5795 to a regnode.)
5796
5797 If value is null then it is assumed that we are parsing normal text
5798 and inserts a new EXACT node into the program containing the resolved
5799 string and returns a pointer to the new node. If the string is
5800 zerolength a NOTHING node is emitted.
5801
5802 On success RExC_parse is set to the char following the endbrace.
5803 Parsing failures will generate a fatal errorvia vFAIL(...)
5804
5805 NOTE: We cache all results from the charnames handler locally in
5806 the RExC_charnames hash (created on first use) to prevent a charnames
5807 handler from playing silly-buggers and returning a short string and
5808 then a long string for a given pattern. Since the regexp program
5809 size is calculated during an initial parse this would result
5810 in a buffer overrun so we cache to prevent the charname result from
5811 changing during the course of the parse.
5812
5813 */
5814STATIC regnode *
5815S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5816{
5817 char * name; /* start of the content of the name */
5818 char * endbrace; /* endbrace following the name */
5819 SV *sv_str = NULL;
5820 SV *sv_name = NULL;
5821 STRLEN len; /* this has various purposes throughout the code */
5822 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5823 regnode *ret = NULL;
5824
5825 if (*RExC_parse != '{') {
5826 vFAIL("Missing braces on \\N{}");
5827 }
5828 name = RExC_parse+1;
5829 endbrace = strchr(RExC_parse, '}');
5830 if ( ! endbrace ) {
5831 RExC_parse++;
5832 vFAIL("Missing right brace on \\N{}");
5833 }
5834 RExC_parse = endbrace + 1;
5835
5836
5837 /* RExC_parse points at the beginning brace,
5838 endbrace points at the last */
5839 if ( name[0]=='U' && name[1]=='+' ) {
5840 /* its a "unicode hex" notation {U+89AB} */
5841 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5842 | PERL_SCAN_DISALLOW_PREFIX
5843 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5844 UV cp;
196f1508 5845 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5846 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5847 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5848 cp = 0xFFFD;
5849 }
5850 if (cp > 0xff)
5851 RExC_utf8 = 1;
5852 if ( valuep ) {
5853 *valuep = cp;
5854 return NULL;
5855 }
5856 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5857 } else {
5858 /* fetch the charnames handler for this scope */
5859 HV * const table = GvHV(PL_hintgv);
5860 SV **cvp= table ?
5861 hv_fetchs(table, "charnames", FALSE) :
5862 NULL;
5863 SV *cv= cvp ? *cvp : NULL;
5864 HE *he_str;
5865 int count;
5866 /* create an SV with the name as argument */
5867 sv_name = newSVpvn(name, endbrace - name);
5868
5869 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5870 vFAIL2("Constant(\\N{%s}) unknown: "
5871 "(possibly a missing \"use charnames ...\")",
5872 SvPVX(sv_name));
5873 }
5874 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5875 vFAIL2("Constant(\\N{%s}): "
5876 "$^H{charnames} is not defined",SvPVX(sv_name));
5877 }
5878
5879
5880
5881 if (!RExC_charnames) {
5882 /* make sure our cache is allocated */
5883 RExC_charnames = newHV();
6bda09f9 5884 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5885 }
5886 /* see if we have looked this one up before */
5887 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5888 if ( he_str ) {
5889 sv_str = HeVAL(he_str);
5890 cached = 1;
5891 } else {
5892 dSP ;
5893
5894 ENTER ;
5895 SAVETMPS ;
5896 PUSHMARK(SP) ;
5897
5898 XPUSHs(sv_name);
5899
5900 PUTBACK ;
5901
5902 count= call_sv(cv, G_SCALAR);
5903
5904 if (count == 1) { /* XXXX is this right? dmq */
5905 sv_str = POPs;
5906 SvREFCNT_inc_simple_void(sv_str);
5907 }
5908
5909 SPAGAIN ;
5910 PUTBACK ;
5911 FREETMPS ;
5912 LEAVE ;
5913
5914 if ( !sv_str || !SvOK(sv_str) ) {
5915 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5916 "did not return a defined value",SvPVX(sv_name));
5917 }
5918 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5919 cached = 1;
5920 }
5921 }
5922 if (valuep) {
5923 char *p = SvPV(sv_str, len);
5924 if (len) {
5925 STRLEN numlen = 1;
5926 if ( SvUTF8(sv_str) ) {
196f1508 5927 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5928 if (*valuep > 0x7F)
5929 RExC_utf8 = 1;
5930 /* XXXX
5931 We have to turn on utf8 for high bit chars otherwise
5932 we get failures with
5933
5934 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5935 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5936
5937 This is different from what \x{} would do with the same
5938 codepoint, where the condition is > 0xFF.
5939 - dmq
5940 */
5941
5942
5943 } else {
5944 *valuep = (UV)*p;
5945 /* warn if we havent used the whole string? */
5946 }
5947 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5948 vWARN2(RExC_parse,
5949 "Ignoring excess chars from \\N{%s} in character class",
5950 SvPVX(sv_name)
5951 );
5952 }
5953 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5954 vWARN2(RExC_parse,
5955 "Ignoring zero length \\N{%s} in character class",
5956 SvPVX(sv_name)
5957 );
5958 }
5959 if (sv_name)
5960 SvREFCNT_dec(sv_name);
5961 if (!cached)
5962 SvREFCNT_dec(sv_str);
5963 return len ? NULL : (regnode *)&len;
5964 } else if(SvCUR(sv_str)) {
5965
5966 char *s;
5967 char *p, *pend;
5968 STRLEN charlen = 1;
5969 char * parse_start = name-3; /* needed for the offsets */
5970 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5971
5972 ret = reg_node(pRExC_state,
5973 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5974 s= STRING(ret);
5975
5976 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5977 sv_utf8_upgrade(sv_str);
5978 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5979 RExC_utf8= 1;
5980 }
5981
5982 p = SvPV(sv_str, len);
5983 pend = p + len;
5984 /* len is the length written, charlen is the size the char read */
5985 for ( len = 0; p < pend; p += charlen ) {
5986 if (UTF) {
196f1508 5987 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5988 if (FOLD) {
5989 STRLEN foldlen,numlen;
5990 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5991 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5992 /* Emit all the Unicode characters. */
5993
5994 for (foldbuf = tmpbuf;
5995 foldlen;
5996 foldlen -= numlen)
5997 {
5998 uvc = utf8_to_uvchr(foldbuf, &numlen);
5999 if (numlen > 0) {
6000 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6001 s += unilen;
6002 len += unilen;
6003 /* In EBCDIC the numlen
6004 * and unilen can differ. */
6005 foldbuf += numlen;
6006 if (numlen >= foldlen)
6007 break;
6008 }
6009 else
6010 break; /* "Can't happen." */
6011 }
6012 } else {
6013 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6014 if (unilen > 0) {
6015 s += unilen;
6016 len += unilen;
6017 }
6018 }
6019 } else {
6020 len++;
6021 REGC(*p, s++);
6022 }
6023 }
6024 if (SIZE_ONLY) {
6025 RExC_size += STR_SZ(len);
6026 } else {
6027 STR_LEN(ret) = len;
6028 RExC_emit += STR_SZ(len);
6029 }
6030 Set_Node_Cur_Length(ret); /* MJD */
6031 RExC_parse--;
6032 nextchar(pRExC_state);
6033 } else {
6034 ret = reg_node(pRExC_state,NOTHING);
6035 }
6036 if (!cached) {
6037 SvREFCNT_dec(sv_str);
6038 }
6039 if (sv_name) {
6040 SvREFCNT_dec(sv_name);
6041 }
6042 return ret;
6043
6044}
6045
6046
9e08bc66
TS
6047/*
6048 * reg_recode
6049 *
6050 * It returns the code point in utf8 for the value in *encp.
6051 * value: a code value in the source encoding
6052 * encp: a pointer to an Encode object
6053 *
6054 * If the result from Encode is not a single character,
6055 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6056 */
6057STATIC UV
6058S_reg_recode(pTHX_ const char value, SV **encp)
6059{
6060 STRLEN numlen = 1;
6061 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6062 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6063 : SvPVX(sv);
6064 const STRLEN newlen = SvCUR(sv);
6065 UV uv = UNICODE_REPLACEMENT;
6066
6067 if (newlen)
6068 uv = SvUTF8(sv)
6069 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6070 : *(U8*)s;
6071
6072 if (!newlen || numlen != newlen) {
6073 uv = UNICODE_REPLACEMENT;
6074 if (encp)
6075 *encp = NULL;
6076 }
6077 return uv;
6078}
6079
fc8cd66c 6080
a687059c
LW
6081/*
6082 - regatom - the lowest level
6083 *
6084 * Optimization: gobbles an entire sequence of ordinary characters so that
6085 * it can turn them into a single node, which is smaller to store and
6086 * faster to run. Backslashed characters are exceptions, each becoming a
6087 * separate node; the code is simpler that way and it's not worth fixing.
6088 *
7f6f358c
YO
6089 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6090 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6091 */
76e3520e 6092STATIC regnode *
3dab1dad 6093S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6094{
97aff369 6095 dVAR;
cbbf8932 6096 register regnode *ret = NULL;
a0d0e21e 6097 I32 flags;
45948336 6098 char *parse_start = RExC_parse;
3dab1dad
YO
6099 GET_RE_DEBUG_FLAGS_DECL;
6100 DEBUG_PARSE("atom");
a0d0e21e
LW
6101 *flagp = WORST; /* Tentatively. */
6102
6103tryagain:
830247a4 6104 switch (*RExC_parse) {
a0d0e21e 6105 case '^':
830247a4
IZ
6106 RExC_seen_zerolen++;
6107 nextchar(pRExC_state);
bbe252da 6108 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6109 ret = reg_node(pRExC_state, MBOL);
bbe252da 6110 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6111 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6112 else
830247a4 6113 ret = reg_node(pRExC_state, BOL);
fac92740 6114 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6115 break;
6116 case '$':
830247a4 6117 nextchar(pRExC_state);
b81d288d 6118 if (*RExC_parse)
830247a4 6119 RExC_seen_zerolen++;
bbe252da 6120 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6121 ret = reg_node(pRExC_state, MEOL);
bbe252da 6122 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6123 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6124 else
830247a4 6125 ret = reg_node(pRExC_state, EOL);
fac92740 6126 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6127 break;
6128 case '.':
830247a4 6129 nextchar(pRExC_state);
bbe252da 6130 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6131 ret = reg_node(pRExC_state, SANY);
6132 else
6133 ret = reg_node(pRExC_state, REG_ANY);
6134 *flagp |= HASWIDTH|SIMPLE;
830247a4 6135 RExC_naughty++;
fac92740 6136 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6137 break;
6138 case '[':
b45f050a 6139 {
3dab1dad
YO
6140 char * const oregcomp_parse = ++RExC_parse;
6141 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6142 if (*RExC_parse != ']') {
6143 RExC_parse = oregcomp_parse;
b45f050a
JF
6144 vFAIL("Unmatched [");
6145 }
830247a4 6146 nextchar(pRExC_state);
a0d0e21e 6147 *flagp |= HASWIDTH|SIMPLE;
fac92740 6148 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6149 break;
b45f050a 6150 }
a0d0e21e 6151 case '(':
830247a4 6152 nextchar(pRExC_state);
3dab1dad 6153 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6154 if (ret == NULL) {
bf93d4cc 6155 if (flags & TRYAGAIN) {
830247a4 6156 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6157 /* Make parent create an empty node if needed. */
6158 *flagp |= TRYAGAIN;
6159 return(NULL);
6160 }
a0d0e21e 6161 goto tryagain;
bf93d4cc 6162 }
a0d0e21e
LW
6163 return(NULL);
6164 }
c277df42 6165 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6166 break;
6167 case '|':
6168 case ')':
6169 if (flags & TRYAGAIN) {
6170 *flagp |= TRYAGAIN;
6171 return NULL;
6172 }
b45f050a 6173 vFAIL("Internal urp");
a0d0e21e
LW
6174 /* Supposed to be caught earlier. */
6175 break;
85afd4ae 6176 case '{':
830247a4
IZ
6177 if (!regcurly(RExC_parse)) {
6178 RExC_parse++;
85afd4ae
CS
6179 goto defchar;
6180 }
6181 /* FALL THROUGH */
a0d0e21e
LW
6182 case '?':
6183 case '+':
6184 case '*':
830247a4 6185 RExC_parse++;
b45f050a 6186 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6187 break;
6188 case '\\':
830247a4 6189 switch (*++RExC_parse) {
a0d0e21e 6190 case 'A':
830247a4
IZ
6191 RExC_seen_zerolen++;
6192 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6193 *flagp |= SIMPLE;
830247a4 6194 nextchar(pRExC_state);
fac92740 6195 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6196 break;
6197 case 'G':
830247a4
IZ
6198 ret = reg_node(pRExC_state, GPOS);
6199 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6200 *flagp |= SIMPLE;
830247a4 6201 nextchar(pRExC_state);
fac92740 6202 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6203 break;
6204 case 'Z':
830247a4 6205 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6206 *flagp |= SIMPLE;
a1917ab9 6207 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6208 nextchar(pRExC_state);
a0d0e21e 6209 break;
b85d18e9 6210 case 'z':
830247a4 6211 ret = reg_node(pRExC_state, EOS);
b85d18e9 6212 *flagp |= SIMPLE;
830247a4
IZ
6213 RExC_seen_zerolen++; /* Do not optimize RE away */
6214 nextchar(pRExC_state);
fac92740 6215 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6216 break;
4a2d328f 6217 case 'C':
f33976b4
DB
6218 ret = reg_node(pRExC_state, CANY);
6219 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6220 *flagp |= HASWIDTH|SIMPLE;
830247a4 6221 nextchar(pRExC_state);
fac92740 6222 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6223 break;
6224 case 'X':
830247a4 6225 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6226 *flagp |= HASWIDTH;
830247a4 6227 nextchar(pRExC_state);
fac92740 6228 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6229 break;
a0d0e21e 6230 case 'w':
eb160463 6231 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6232 *flagp |= HASWIDTH|SIMPLE;
830247a4 6233 nextchar(pRExC_state);
fac92740 6234 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6235 break;
6236 case 'W':
eb160463 6237 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6238 *flagp |= HASWIDTH|SIMPLE;
830247a4 6239 nextchar(pRExC_state);
fac92740 6240 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6241 break;
6242 case 'b':
830247a4
IZ
6243 RExC_seen_zerolen++;
6244 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6245 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6246 *flagp |= SIMPLE;
830247a4 6247 nextchar(pRExC_state);
fac92740 6248 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6249 break;
6250 case 'B':
830247a4
IZ
6251 RExC_seen_zerolen++;
6252 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6253 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6254 *flagp |= SIMPLE;
830247a4 6255 nextchar(pRExC_state);
fac92740 6256 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6257 break;
6258 case 's':
eb160463 6259 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6260 *flagp |= HASWIDTH|SIMPLE;
830247a4 6261 nextchar(pRExC_state);
fac92740 6262 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6263 break;
6264 case 'S':
eb160463 6265 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6266 *flagp |= HASWIDTH|SIMPLE;
830247a4 6267 nextchar(pRExC_state);
fac92740 6268 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6269 break;
6270 case 'd':
ffc61ed2 6271 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6272 *flagp |= HASWIDTH|SIMPLE;
830247a4 6273 nextchar(pRExC_state);
fac92740 6274 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6275 break;
6276 case 'D':
ffc61ed2 6277 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6278 *flagp |= HASWIDTH|SIMPLE;
830247a4 6279 nextchar(pRExC_state);
fac92740 6280 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6281 break;
a14b48bc
LW
6282 case 'p':
6283 case 'P':
3568d838 6284 {
3dab1dad 6285 char* const oldregxend = RExC_end;
ccb2c380 6286 char* parse_start = RExC_parse - 2;
a14b48bc 6287
830247a4 6288 if (RExC_parse[1] == '{') {
3568d838 6289 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6290 RExC_end = strchr(RExC_parse, '}');
6291 if (!RExC_end) {
3dab1dad 6292 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6293 RExC_parse += 2;
6294 RExC_end = oldregxend;
0da60cf5 6295 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6296 }
830247a4 6297 RExC_end++;
a14b48bc 6298 }
af6f566e 6299 else {
830247a4 6300 RExC_end = RExC_parse + 2;
af6f566e
HS
6301 if (RExC_end > oldregxend)
6302 RExC_end = oldregxend;
6303 }
830247a4 6304 RExC_parse--;
a14b48bc 6305
3dab1dad 6306 ret = regclass(pRExC_state,depth+1);
a14b48bc 6307
830247a4
IZ
6308 RExC_end = oldregxend;
6309 RExC_parse--;
ccb2c380
MP
6310
6311 Set_Node_Offset(ret, parse_start + 2);
6312 Set_Node_Cur_Length(ret);
830247a4 6313 nextchar(pRExC_state);
a14b48bc
LW
6314 *flagp |= HASWIDTH|SIMPLE;
6315 }
6316 break;
fc8cd66c
YO
6317 case 'N':
6318 /* Handle \N{NAME} here and not below because it can be
6319 multicharacter. join_exact() will join them up later on.
6320 Also this makes sure that things like /\N{BLAH}+/ and
6321 \N{BLAH} being multi char Just Happen. dmq*/
6322 ++RExC_parse;
6323 ret= reg_namedseq(pRExC_state, NULL);
6324 break;
0a4db386 6325 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6326 {
6327 char ch= RExC_parse[1];
6328 if (ch != '<' && ch != '\'') {
6329 if (SIZE_ONLY)
6330 vWARN( RExC_parse + 1,
6331 "Possible broken named back reference treated as literal k");
6332 parse_start--;
6333 goto defchar;
6334 } else {
6335 char* name_start = (RExC_parse += 2);
2eccd3b2 6336 U32 num = 0;
0a4db386
YO
6337 SV *sv_dat = reg_scan_name(pRExC_state,
6338 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6339 ch= (ch == '<') ? '>' : '\'';
6340
81714fb9
YO
6341 if (RExC_parse == name_start || *RExC_parse != ch)
6342 vFAIL2("Sequence \\k%c... not terminated",
6343 (ch == '>' ? '<' : ch));
6344
6345 RExC_sawback = 1;
6346 ret = reganode(pRExC_state,
6347 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6348 num);
6349 *flagp |= HASWIDTH;
6350
6351
6352 if (!SIZE_ONLY) {
81714fb9
YO
6353 num = add_data( pRExC_state, 1, "S" );
6354 ARG_SET(ret,num);
f8fc2ecf 6355 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6356 SvREFCNT_inc(sv_dat);
6357 }
6358 /* override incorrect value set in reganode MJD */
6359 Set_Node_Offset(ret, parse_start+1);
6360 Set_Node_Cur_Length(ret); /* MJD */
6361 nextchar(pRExC_state);
6362
6363 }
6364 break;
6365 }
a0d0e21e
LW
6366 case 'n':
6367 case 'r':
6368 case 't':
6369 case 'f':
6370 case 'e':
6371 case 'a':
6372 case 'x':
6373 case 'c':
6374 case '0':
6375 goto defchar;
2bf803e2 6376 case 'g':
a0d0e21e
LW
6377 case '1': case '2': case '3': case '4':
6378 case '5': case '6': case '7': case '8': case '9':
6379 {
c74340f9 6380 I32 num;
2bf803e2
YO
6381 bool isg = *RExC_parse == 'g';
6382 bool isrel = 0;
6383 bool hasbrace = 0;
6384 if (isg) {
c74340f9 6385 RExC_parse++;
2bf803e2
YO
6386 if (*RExC_parse == '{') {
6387 RExC_parse++;
6388 hasbrace = 1;
6389 }
6390 if (*RExC_parse == '-') {
6391 RExC_parse++;
6392 isrel = 1;
6393 }
6394 }
c74340f9
YO
6395 num = atoi(RExC_parse);
6396 if (isrel) {
5624f11d 6397 num = RExC_npar - num;
c74340f9
YO
6398 if (num < 1)
6399 vFAIL("Reference to nonexistent or unclosed group");
6400 }
2bf803e2 6401 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6402 goto defchar;
6403 else {
3dab1dad 6404 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6405 while (isDIGIT(*RExC_parse))
6406 RExC_parse++;
2bf803e2
YO
6407 if (hasbrace) {
6408 if (*RExC_parse != '}')
6409 vFAIL("Unterminated \\g{...} pattern");
6410 RExC_parse++;
6411 }
c74340f9
YO
6412 if (!SIZE_ONLY) {
6413 if (num > (I32)RExC_rx->nparens)
6414 vFAIL("Reference to nonexistent group");
c74340f9 6415 }
830247a4 6416 RExC_sawback = 1;
eb160463
GS
6417 ret = reganode(pRExC_state,
6418 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6419 num);
a0d0e21e 6420 *flagp |= HASWIDTH;
2af232bd 6421
fac92740 6422 /* override incorrect value set in reganode MJD */
2af232bd 6423 Set_Node_Offset(ret, parse_start+1);
fac92740 6424 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6425 RExC_parse--;
6426 nextchar(pRExC_state);
a0d0e21e
LW
6427 }
6428 }
6429 break;
6430 case '\0':
830247a4 6431 if (RExC_parse >= RExC_end)
b45f050a 6432 FAIL("Trailing \\");
a0d0e21e
LW
6433 /* FALL THROUGH */
6434 default:
a0288114 6435 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6436 back into the quick-grab loop below */
45948336 6437 parse_start--;
a0d0e21e
LW
6438 goto defchar;
6439 }
6440 break;
4633a7c4
LW
6441
6442 case '#':
bbe252da 6443 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6444 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6445 RExC_parse++;
830247a4 6446 if (RExC_parse < RExC_end)
4633a7c4
LW
6447 goto tryagain;
6448 }
6449 /* FALL THROUGH */
6450
a0d0e21e 6451 default: {
ba210ebe 6452 register STRLEN len;
58ae7d3f 6453 register UV ender;
a0d0e21e 6454 register char *p;
3dab1dad 6455 char *s;
80aecb99 6456 STRLEN foldlen;
89ebb4a3 6457 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6458
6459 parse_start = RExC_parse - 1;
a0d0e21e 6460
830247a4 6461 RExC_parse++;
a0d0e21e
LW
6462
6463 defchar:
58ae7d3f 6464 ender = 0;
eb160463
GS
6465 ret = reg_node(pRExC_state,
6466 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6467 s = STRING(ret);
830247a4
IZ
6468 for (len = 0, p = RExC_parse - 1;
6469 len < 127 && p < RExC_end;
a0d0e21e
LW
6470 len++)
6471 {
3dab1dad 6472 char * const oldp = p;
5b5a24f7 6473
bbe252da 6474 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6475 p = regwhite(p, RExC_end);
a0d0e21e
LW
6476 switch (*p) {
6477 case '^':
6478 case '$':
6479 case '.':
6480 case '[':
6481 case '(':
6482 case ')':
6483 case '|':
6484 goto loopdone;
6485 case '\\':
6486 switch (*++p) {
6487 case 'A':
1ed8eac0
JF
6488 case 'C':
6489 case 'X':
a0d0e21e 6490 case 'G':
2bf803e2 6491 case 'g':
a0d0e21e 6492 case 'Z':
b85d18e9 6493 case 'z':
a0d0e21e
LW
6494 case 'w':
6495 case 'W':
6496 case 'b':
6497 case 'B':
6498 case 's':
6499 case 'S':
6500 case 'd':
6501 case 'D':
a14b48bc
LW
6502 case 'p':
6503 case 'P':
fc8cd66c 6504 case 'N':
c74340f9 6505 case 'R':
99ed40ff 6506 case 'k':
a0d0e21e
LW
6507 --p;
6508 goto loopdone;
6509 case 'n':
6510 ender = '\n';
6511 p++;
a687059c 6512 break;
a0d0e21e
LW
6513 case 'r':
6514 ender = '\r';
6515 p++;
a687059c 6516 break;
a0d0e21e
LW
6517 case 't':
6518 ender = '\t';
6519 p++;
a687059c 6520 break;
a0d0e21e
LW
6521 case 'f':
6522 ender = '\f';
6523 p++;
a687059c 6524 break;
a0d0e21e 6525 case 'e':
c7f1f016 6526 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6527 p++;
a687059c 6528 break;
a0d0e21e 6529 case 'a':
c7f1f016 6530 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6531 p++;
a687059c 6532 break;
a0d0e21e 6533 case 'x':
a0ed51b3 6534 if (*++p == '{') {
1df70142 6535 char* const e = strchr(p, '}');
b81d288d 6536
b45f050a 6537 if (!e) {
830247a4 6538 RExC_parse = p + 1;
b45f050a
JF
6539 vFAIL("Missing right brace on \\x{}");
6540 }
de5f0749 6541 else {
a4c04bdc
NC
6542 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6543 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6544 STRLEN numlen = e - p - 1;
53305cf1 6545 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6546 if (ender > 0xff)
6547 RExC_utf8 = 1;
a0ed51b3
LW
6548 p = e + 1;
6549 }
a0ed51b3
LW
6550 }
6551 else {
a4c04bdc 6552 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6553 STRLEN numlen = 2;
53305cf1 6554 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6555 p += numlen;
6556 }
9e08bc66
TS
6557 if (PL_encoding && ender < 0x100)
6558 goto recode_encoding;
a687059c 6559 break;
a0d0e21e
LW
6560 case 'c':
6561 p++;
bbce6d69 6562 ender = UCHARAT(p++);
6563 ender = toCTRL(ender);
a687059c 6564 break;
a0d0e21e
LW
6565 case '0': case '1': case '2': case '3':case '4':
6566 case '5': case '6': case '7': case '8':case '9':
6567 if (*p == '0' ||
830247a4 6568 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6569 I32 flags = 0;
1df70142 6570 STRLEN numlen = 3;
53305cf1 6571 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6572 p += numlen;
6573 }
6574 else {
6575 --p;
6576 goto loopdone;
a687059c 6577 }
9e08bc66
TS
6578 if (PL_encoding && ender < 0x100)
6579 goto recode_encoding;
6580 break;
6581 recode_encoding:
6582 {
6583 SV* enc = PL_encoding;
6584 ender = reg_recode((const char)(U8)ender, &enc);
6585 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6586 vWARN(p, "Invalid escape in the specified encoding");
6587 RExC_utf8 = 1;
6588 }
a687059c 6589 break;
a0d0e21e 6590 case '\0':
830247a4 6591 if (p >= RExC_end)
b45f050a 6592 FAIL("Trailing \\");
a687059c 6593 /* FALL THROUGH */
a0d0e21e 6594 default:
041457d9 6595 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6596 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6597 goto normal_default;
a0d0e21e
LW
6598 }
6599 break;
a687059c 6600 default:
a0ed51b3 6601 normal_default:
fd400ab9 6602 if (UTF8_IS_START(*p) && UTF) {
1df70142 6603 STRLEN numlen;
5e12f4fb 6604 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6605 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6606 p += numlen;
6607 }
6608 else
6609 ender = *p++;
a0d0e21e 6610 break;
a687059c 6611 }
bbe252da 6612 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6613 p = regwhite(p, RExC_end);
60a8b682
JH
6614 if (UTF && FOLD) {
6615 /* Prime the casefolded buffer. */
ac7e0132 6616 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6617 }
a0d0e21e
LW
6618 if (ISMULT2(p)) { /* Back off on ?+*. */
6619 if (len)
6620 p = oldp;
16ea2a2e 6621 else if (UTF) {
80aecb99 6622 if (FOLD) {
60a8b682 6623 /* Emit all the Unicode characters. */
1df70142 6624 STRLEN numlen;
80aecb99
JH
6625 for (foldbuf = tmpbuf;
6626 foldlen;
6627 foldlen -= numlen) {
6628 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6629 if (numlen > 0) {
71207a34 6630 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6631 s += unilen;
6632 len += unilen;
6633 /* In EBCDIC the numlen
6634 * and unilen can differ. */
9dc45d57 6635 foldbuf += numlen;
47654450
JH
6636 if (numlen >= foldlen)
6637 break;
9dc45d57
JH
6638 }
6639 else
6640 break; /* "Can't happen." */
80aecb99
JH
6641 }
6642 }
6643 else {
71207a34 6644 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6645 if (unilen > 0) {
0ebc6274
JH
6646 s += unilen;
6647 len += unilen;
9dc45d57 6648 }
80aecb99 6649 }
a0ed51b3 6650 }
a0d0e21e
LW
6651 else {
6652 len++;
eb160463 6653 REGC((char)ender, s++);
a0d0e21e
LW
6654 }
6655 break;
a687059c 6656 }
16ea2a2e 6657 if (UTF) {
80aecb99 6658 if (FOLD) {
60a8b682 6659 /* Emit all the Unicode characters. */
1df70142 6660 STRLEN numlen;
80aecb99
JH
6661 for (foldbuf = tmpbuf;
6662 foldlen;
6663 foldlen -= numlen) {
6664 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6665 if (numlen > 0) {
71207a34 6666 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6667 len += unilen;
6668 s += unilen;
6669 /* In EBCDIC the numlen
6670 * and unilen can differ. */
9dc45d57 6671 foldbuf += numlen;
47654450
JH
6672 if (numlen >= foldlen)
6673 break;
9dc45d57
JH
6674 }
6675 else
6676 break;
80aecb99
JH
6677 }
6678 }
6679 else {
71207a34 6680 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6681 if (unilen > 0) {
0ebc6274
JH
6682 s += unilen;
6683 len += unilen;
9dc45d57 6684 }
80aecb99
JH
6685 }
6686 len--;
a0ed51b3
LW
6687 }
6688 else
eb160463 6689 REGC((char)ender, s++);
a0d0e21e
LW
6690 }
6691 loopdone:
830247a4 6692 RExC_parse = p - 1;
fac92740 6693 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6694 nextchar(pRExC_state);
793db0cb
JH
6695 {
6696 /* len is STRLEN which is unsigned, need to copy to signed */
6697 IV iv = len;
6698 if (iv < 0)
6699 vFAIL("Internal disaster");
6700 }
a0d0e21e
LW
6701 if (len > 0)
6702 *flagp |= HASWIDTH;
090f7165 6703 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6704 *flagp |= SIMPLE;
3dab1dad 6705
cd439c50 6706 if (SIZE_ONLY)
830247a4 6707 RExC_size += STR_SZ(len);
3dab1dad
YO
6708 else {
6709 STR_LEN(ret) = len;
830247a4 6710 RExC_emit += STR_SZ(len);
07be1b83 6711 }
3dab1dad 6712 }
a0d0e21e
LW
6713 break;
6714 }
a687059c 6715
a0d0e21e 6716 return(ret);
a687059c
LW
6717}
6718
873ef191 6719STATIC char *
5f66b61c 6720S_regwhite(char *p, const char *e)
5b5a24f7
CS
6721{
6722 while (p < e) {
6723 if (isSPACE(*p))
6724 ++p;
6725 else if (*p == '#') {
6726 do {
6727 p++;
6728 } while (p < e && *p != '\n');
6729 }
6730 else
6731 break;
6732 }
6733 return p;
6734}
6735
b8c5462f
JH
6736/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6737 Character classes ([:foo:]) can also be negated ([:^foo:]).
6738 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6739 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6740 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6741
6742#define POSIXCC_DONE(c) ((c) == ':')
6743#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6744#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6745
b8c5462f 6746STATIC I32
830247a4 6747S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6748{
97aff369 6749 dVAR;
936ed897 6750 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6751
830247a4 6752 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6753 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6754 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6755 const char c = UCHARAT(RExC_parse);
097eb12c 6756 char* const s = RExC_parse++;
b81d288d 6757
9a86a77b 6758 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6759 RExC_parse++;
6760 if (RExC_parse == RExC_end)
620e46c5 6761 /* Grandfather lone [:, [=, [. */
830247a4 6762 RExC_parse = s;
620e46c5 6763 else {
3dab1dad 6764 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6765 assert(*t == c);
6766
9a86a77b 6767 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6768 const char *posixcc = s + 1;
830247a4 6769 RExC_parse++; /* skip over the ending ] */
3dab1dad 6770
b8c5462f 6771 if (*s == ':') {
1df70142
AL
6772 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6773 const I32 skip = t - posixcc;
80916619
NC
6774
6775 /* Initially switch on the length of the name. */
6776 switch (skip) {
6777 case 4:
3dab1dad
YO
6778 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6779 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6780 break;
80916619
NC
6781 case 5:
6782 /* Names all of length 5. */
6783 /* alnum alpha ascii blank cntrl digit graph lower
6784 print punct space upper */
6785 /* Offset 4 gives the best switch position. */
6786 switch (posixcc[4]) {
6787 case 'a':
3dab1dad
YO
6788 if (memEQ(posixcc, "alph", 4)) /* alpha */
6789 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6790 break;
6791 case 'e':
3dab1dad
YO
6792 if (memEQ(posixcc, "spac", 4)) /* space */
6793 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6794 break;
6795 case 'h':
3dab1dad
YO
6796 if (memEQ(posixcc, "grap", 4)) /* graph */
6797 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6798 break;
6799 case 'i':
3dab1dad
YO
6800 if (memEQ(posixcc, "asci", 4)) /* ascii */
6801 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6802 break;
6803 case 'k':
3dab1dad
YO
6804 if (memEQ(posixcc, "blan", 4)) /* blank */
6805 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6806 break;
6807 case 'l':
3dab1dad
YO
6808 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6809 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6810 break;
6811 case 'm':
3dab1dad
YO
6812 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6813 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6814 break;
6815 case 'r':
3dab1dad
YO
6816 if (memEQ(posixcc, "lowe", 4)) /* lower */
6817 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6818 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6819 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6820 break;
6821 case 't':
3dab1dad
YO
6822 if (memEQ(posixcc, "digi", 4)) /* digit */
6823 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6824 else if (memEQ(posixcc, "prin", 4)) /* print */
6825 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6826 else if (memEQ(posixcc, "punc", 4)) /* punct */
6827 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6828 break;
b8c5462f
JH
6829 }
6830 break;
80916619 6831 case 6:
3dab1dad
YO
6832 if (memEQ(posixcc, "xdigit", 6))
6833 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6834 break;
6835 }
80916619
NC
6836
6837 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6838 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6839 t - s - 1, s + 1);
80916619
NC
6840 assert (posixcc[skip] == ':');
6841 assert (posixcc[skip+1] == ']');
b45f050a 6842 } else if (!SIZE_ONLY) {
b8c5462f 6843 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6844
830247a4 6845 /* adjust RExC_parse so the warning shows after
b45f050a 6846 the class closes */
9a86a77b 6847 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6848 RExC_parse++;
b45f050a
JF
6849 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6850 }
b8c5462f
JH
6851 } else {
6852 /* Maternal grandfather:
6853 * "[:" ending in ":" but not in ":]" */
830247a4 6854 RExC_parse = s;
767d463e 6855 }
620e46c5
JH
6856 }
6857 }
6858
b8c5462f
JH
6859 return namedclass;
6860}
6861
6862STATIC void
830247a4 6863S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6864{
97aff369 6865 dVAR;
3dab1dad 6866 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6867 const char *s = RExC_parse;
6868 const char c = *s++;
b8c5462f 6869
3dab1dad 6870 while (isALNUM(*s))
b8c5462f
JH
6871 s++;
6872 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6873 if (ckWARN(WARN_REGEXP))
6874 vWARN3(s+2,
6875 "POSIX syntax [%c %c] belongs inside character classes",
6876 c, c);
b45f050a
JF
6877
6878 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6879 if (POSIXCC_NOTYET(c)) {
830247a4 6880 /* adjust RExC_parse so the error shows after
b45f050a 6881 the class closes */
9a86a77b 6882 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6883 NOOP;
b45f050a
JF
6884 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6885 }
b8c5462f
JH
6886 }
6887 }
620e46c5
JH
6888}
6889
7f6f358c
YO
6890
6891/*
6892 parse a class specification and produce either an ANYOF node that
6893 matches the pattern. If the pattern matches a single char only and
6894 that char is < 256 then we produce an EXACT node instead.
6895*/
76e3520e 6896STATIC regnode *
3dab1dad 6897S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6898{
97aff369 6899 dVAR;
9ef43ace 6900 register UV value = 0;
9a86a77b 6901 register UV nextvalue;
3568d838 6902 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6903 register IV range = 0;
c277df42 6904 register regnode *ret;
ba210ebe 6905 STRLEN numlen;
ffc61ed2 6906 IV namedclass;
cbbf8932 6907 char *rangebegin = NULL;
936ed897 6908 bool need_class = 0;
c445ea15 6909 SV *listsv = NULL;
ffc61ed2 6910 UV n;
9e55ce06 6911 bool optimize_invert = TRUE;
cbbf8932 6912 AV* unicode_alternate = NULL;
1b2d223b
JH
6913#ifdef EBCDIC
6914 UV literal_endpoint = 0;
6915#endif
7f6f358c 6916 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6917
3dab1dad 6918 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6919 case we need to change the emitted regop to an EXACT. */
07be1b83 6920 const char * orig_parse = RExC_parse;
72f13be8 6921 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6922#ifndef DEBUGGING
6923 PERL_UNUSED_ARG(depth);
6924#endif
72f13be8 6925
3dab1dad 6926 DEBUG_PARSE("clas");
7f6f358c
YO
6927
6928 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6929 ret = reganode(pRExC_state, ANYOF, 0);
6930
6931 if (!SIZE_ONLY)
6932 ANYOF_FLAGS(ret) = 0;
6933
9a86a77b 6934 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6935 RExC_naughty++;
6936 RExC_parse++;
6937 if (!SIZE_ONLY)
6938 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6939 }
a0d0e21e 6940
73060fc4 6941 if (SIZE_ONLY) {
830247a4 6942 RExC_size += ANYOF_SKIP;
73060fc4
JH
6943 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6944 }
936ed897 6945 else {
830247a4 6946 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6947 if (FOLD)
6948 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6949 if (LOC)
6950 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6951 ANYOF_BITMAP_ZERO(ret);
396482e1 6952 listsv = newSVpvs("# comment\n");
a0d0e21e 6953 }
b8c5462f 6954
9a86a77b
JH
6955 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6956
b938889d 6957 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6958 checkposixcc(pRExC_state);
b8c5462f 6959
f064b6ad
HS
6960 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6961 if (UCHARAT(RExC_parse) == ']')
6962 goto charclassloop;
ffc61ed2 6963
fc8cd66c 6964parseit:
9a86a77b 6965 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6966
6967 charclassloop:
6968
6969 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6970
73b437c8 6971 if (!range)
830247a4 6972 rangebegin = RExC_parse;
ffc61ed2 6973 if (UTF) {
5e12f4fb 6974 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6975 RExC_end - RExC_parse,
9f7f3913 6976 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6977 RExC_parse += numlen;
6978 }
6979 else
6980 value = UCHARAT(RExC_parse++);
7f6f358c 6981
9a86a77b
JH
6982 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6983 if (value == '[' && POSIXCC(nextvalue))
830247a4 6984 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6985 else if (value == '\\') {
ffc61ed2 6986 if (UTF) {
5e12f4fb 6987 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6988 RExC_end - RExC_parse,
9f7f3913 6989 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6990 RExC_parse += numlen;
6991 }
6992 else
6993 value = UCHARAT(RExC_parse++);
470c3474 6994 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6995 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6996 * be a problem later if we want switch on Unicode.
6997 * A similar issue a little bit later when switching on
6998 * namedclass. --jhi */
ffc61ed2 6999 switch ((I32)value) {
b8c5462f
JH
7000 case 'w': namedclass = ANYOF_ALNUM; break;
7001 case 'W': namedclass = ANYOF_NALNUM; break;
7002 case 's': namedclass = ANYOF_SPACE; break;
7003 case 'S': namedclass = ANYOF_NSPACE; break;
7004 case 'd': namedclass = ANYOF_DIGIT; break;
7005 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7006 case 'N': /* Handle \N{NAME} in class */
7007 {
7008 /* We only pay attention to the first char of
7009 multichar strings being returned. I kinda wonder
7010 if this makes sense as it does change the behaviour
7011 from earlier versions, OTOH that behaviour was broken
7012 as well. */
7013 UV v; /* value is register so we cant & it /grrr */
7014 if (reg_namedseq(pRExC_state, &v)) {
7015 goto parseit;
7016 }
7017 value= v;
7018 }
7019 break;
ffc61ed2
JH
7020 case 'p':
7021 case 'P':
3dab1dad
YO
7022 {
7023 char *e;
af6f566e 7024 if (RExC_parse >= RExC_end)
2a4859cd 7025 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7026 if (*RExC_parse == '{') {
1df70142 7027 const U8 c = (U8)value;
ffc61ed2
JH
7028 e = strchr(RExC_parse++, '}');
7029 if (!e)
0da60cf5 7030 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7031 while (isSPACE(UCHARAT(RExC_parse)))
7032 RExC_parse++;
7033 if (e == RExC_parse)
0da60cf5 7034 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7035 n = e - RExC_parse;
ab13f0c7
JH
7036 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7037 n--;
ffc61ed2
JH
7038 }
7039 else {
7040 e = RExC_parse;
7041 n = 1;
7042 }
7043 if (!SIZE_ONLY) {
ab13f0c7
JH
7044 if (UCHARAT(RExC_parse) == '^') {
7045 RExC_parse++;
7046 n--;
7047 value = value == 'p' ? 'P' : 'p'; /* toggle */
7048 while (isSPACE(UCHARAT(RExC_parse))) {
7049 RExC_parse++;
7050 n--;
7051 }
7052 }
097eb12c
AL
7053 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7054 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7055 }
7056 RExC_parse = e + 1;
7057 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7058 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7059 }
f81125e2 7060 break;
b8c5462f
JH
7061 case 'n': value = '\n'; break;
7062 case 'r': value = '\r'; break;
7063 case 't': value = '\t'; break;
7064 case 'f': value = '\f'; break;
7065 case 'b': value = '\b'; break;
c7f1f016
NIS
7066 case 'e': value = ASCII_TO_NATIVE('\033');break;
7067 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7068 case 'x':
ffc61ed2 7069 if (*RExC_parse == '{') {
a4c04bdc
NC
7070 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7071 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7072 char * const e = strchr(RExC_parse++, '}');
b81d288d 7073 if (!e)
ffc61ed2 7074 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7075
7076 numlen = e - RExC_parse;
7077 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7078 RExC_parse = e + 1;
7079 }
7080 else {
a4c04bdc 7081 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7082 numlen = 2;
7083 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7084 RExC_parse += numlen;
7085 }
9e08bc66
TS
7086 if (PL_encoding && value < 0x100)
7087 goto recode_encoding;
b8c5462f
JH
7088 break;
7089 case 'c':
830247a4 7090 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7091 value = toCTRL(value);
7092 break;
7093 case '0': case '1': case '2': case '3': case '4':
7094 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7095 {
7096 I32 flags = 0;
7097 numlen = 3;
7098 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7099 RExC_parse += numlen;
7100 if (PL_encoding && value < 0x100)
7101 goto recode_encoding;
7102 break;
7103 }
7104 recode_encoding:
7105 {
7106 SV* enc = PL_encoding;
7107 value = reg_recode((const char)(U8)value, &enc);
7108 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7109 vWARN(RExC_parse,
7110 "Invalid escape in the specified encoding");
7111 break;
7112 }
1028017a 7113 default:
041457d9 7114 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7115 vWARN2(RExC_parse,
7116 "Unrecognized escape \\%c in character class passed through",
7117 (int)value);
1028017a 7118 break;
b8c5462f 7119 }
ffc61ed2 7120 } /* end of \blah */
1b2d223b
JH
7121#ifdef EBCDIC
7122 else
7123 literal_endpoint++;
7124#endif
ffc61ed2
JH
7125
7126 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7127
7128 if (!SIZE_ONLY && !need_class)
936ed897 7129 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7130
936ed897 7131 need_class = 1;
ffc61ed2
JH
7132
7133 /* a bad range like a-\d, a-[:digit:] ? */
7134 if (range) {
73b437c8 7135 if (!SIZE_ONLY) {
afd78fd5 7136 if (ckWARN(WARN_REGEXP)) {
097eb12c 7137 const int w =
afd78fd5
JH
7138 RExC_parse >= rangebegin ?
7139 RExC_parse - rangebegin : 0;
830247a4 7140 vWARN4(RExC_parse,
b45f050a 7141 "False [] range \"%*.*s\"",
097eb12c 7142 w, w, rangebegin);
afd78fd5 7143 }
3568d838
JH
7144 if (prevvalue < 256) {
7145 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7146 ANYOF_BITMAP_SET(ret, '-');
7147 }
7148 else {
7149 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7150 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7151 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7152 }
b8c5462f 7153 }
ffc61ed2
JH
7154
7155 range = 0; /* this was not a true range */
73b437c8 7156 }
ffc61ed2 7157
73b437c8 7158 if (!SIZE_ONLY) {
c49a72a9
NC
7159 const char *what = NULL;
7160 char yesno = 0;
7161
3568d838
JH
7162 if (namedclass > OOB_NAMEDCLASS)
7163 optimize_invert = FALSE;
e2962f66
JH
7164 /* Possible truncation here but in some 64-bit environments
7165 * the compiler gets heartburn about switch on 64-bit values.
7166 * A similar issue a little earlier when switching on value.
98f323fa 7167 * --jhi */
e2962f66 7168 switch ((I32)namedclass) {
73b437c8
JH
7169 case ANYOF_ALNUM:
7170 if (LOC)
936ed897 7171 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7172 else {
7173 for (value = 0; value < 256; value++)
7174 if (isALNUM(value))
936ed897 7175 ANYOF_BITMAP_SET(ret, value);
73b437c8 7176 }
c49a72a9
NC
7177 yesno = '+';
7178 what = "Word";
73b437c8
JH
7179 break;
7180 case ANYOF_NALNUM:
7181 if (LOC)
936ed897 7182 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7183 else {
7184 for (value = 0; value < 256; value++)
7185 if (!isALNUM(value))
936ed897 7186 ANYOF_BITMAP_SET(ret, value);
73b437c8 7187 }
c49a72a9
NC
7188 yesno = '!';
7189 what = "Word";
73b437c8 7190 break;
ffc61ed2 7191 case ANYOF_ALNUMC:
73b437c8 7192 if (LOC)
ffc61ed2 7193 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7194 else {
7195 for (value = 0; value < 256; value++)
ffc61ed2 7196 if (isALNUMC(value))
936ed897 7197 ANYOF_BITMAP_SET(ret, value);
73b437c8 7198 }
c49a72a9
NC
7199 yesno = '+';
7200 what = "Alnum";
73b437c8
JH
7201 break;
7202 case ANYOF_NALNUMC:
7203 if (LOC)
936ed897 7204 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7205 else {
7206 for (value = 0; value < 256; value++)
7207 if (!isALNUMC(value))
936ed897 7208 ANYOF_BITMAP_SET(ret, value);
73b437c8 7209 }
c49a72a9
NC
7210 yesno = '!';
7211 what = "Alnum";
73b437c8
JH
7212 break;
7213 case ANYOF_ALPHA:
7214 if (LOC)
936ed897 7215 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7216 else {
7217 for (value = 0; value < 256; value++)
7218 if (isALPHA(value))
936ed897 7219 ANYOF_BITMAP_SET(ret, value);
73b437c8 7220 }
c49a72a9
NC
7221 yesno = '+';
7222 what = "Alpha";
73b437c8
JH
7223 break;
7224 case ANYOF_NALPHA:
7225 if (LOC)
936ed897 7226 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7227 else {
7228 for (value = 0; value < 256; value++)
7229 if (!isALPHA(value))
936ed897 7230 ANYOF_BITMAP_SET(ret, value);
73b437c8 7231 }
c49a72a9
NC
7232 yesno = '!';
7233 what = "Alpha";
73b437c8
JH
7234 break;
7235 case ANYOF_ASCII:
7236 if (LOC)
936ed897 7237 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7238 else {
c7f1f016 7239#ifndef EBCDIC
1ba5c669
JH
7240 for (value = 0; value < 128; value++)
7241 ANYOF_BITMAP_SET(ret, value);
7242#else /* EBCDIC */
ffbc6a93 7243 for (value = 0; value < 256; value++) {
3a3c4447
JH
7244 if (isASCII(value))
7245 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7246 }
1ba5c669 7247#endif /* EBCDIC */
73b437c8 7248 }
c49a72a9
NC
7249 yesno = '+';
7250 what = "ASCII";
73b437c8
JH
7251 break;
7252 case ANYOF_NASCII:
7253 if (LOC)
936ed897 7254 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7255 else {
c7f1f016 7256#ifndef EBCDIC
1ba5c669
JH
7257 for (value = 128; value < 256; value++)
7258 ANYOF_BITMAP_SET(ret, value);
7259#else /* EBCDIC */
ffbc6a93 7260 for (value = 0; value < 256; value++) {
3a3c4447
JH
7261 if (!isASCII(value))
7262 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7263 }
1ba5c669 7264#endif /* EBCDIC */
73b437c8 7265 }
c49a72a9
NC
7266 yesno = '!';
7267 what = "ASCII";
73b437c8 7268 break;
aaa51d5e
JF
7269 case ANYOF_BLANK:
7270 if (LOC)
7271 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7272 else {
7273 for (value = 0; value < 256; value++)
7274 if (isBLANK(value))
7275 ANYOF_BITMAP_SET(ret, value);
7276 }
c49a72a9
NC
7277 yesno = '+';
7278 what = "Blank";
aaa51d5e
JF
7279 break;
7280 case ANYOF_NBLANK:
7281 if (LOC)
7282 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7283 else {
7284 for (value = 0; value < 256; value++)
7285 if (!isBLANK(value))
7286 ANYOF_BITMAP_SET(ret, value);
7287 }
c49a72a9
NC
7288 yesno = '!';
7289 what = "Blank";
aaa51d5e 7290 break;
73b437c8
JH
7291 case ANYOF_CNTRL:
7292 if (LOC)
936ed897 7293 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7294 else {
7295 for (value = 0; value < 256; value++)
7296 if (isCNTRL(value))
936ed897 7297 ANYOF_BITMAP_SET(ret, value);
73b437c8 7298 }
c49a72a9
NC
7299 yesno = '+';
7300 what = "Cntrl";
73b437c8
JH
7301 break;
7302 case ANYOF_NCNTRL:
7303 if (LOC)
936ed897 7304 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7305 else {
7306 for (value = 0; value < 256; value++)
7307 if (!isCNTRL(value))
936ed897 7308 ANYOF_BITMAP_SET(ret, value);
73b437c8 7309 }
c49a72a9
NC
7310 yesno = '!';
7311 what = "Cntrl";
ffc61ed2
JH
7312 break;
7313 case ANYOF_DIGIT:
7314 if (LOC)
7315 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7316 else {
7317 /* consecutive digits assumed */
7318 for (value = '0'; value <= '9'; value++)
7319 ANYOF_BITMAP_SET(ret, value);
7320 }
c49a72a9
NC
7321 yesno = '+';
7322 what = "Digit";
ffc61ed2
JH
7323 break;
7324 case ANYOF_NDIGIT:
7325 if (LOC)
7326 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7327 else {
7328 /* consecutive digits assumed */
7329 for (value = 0; value < '0'; value++)
7330 ANYOF_BITMAP_SET(ret, value);
7331 for (value = '9' + 1; value < 256; value++)
7332 ANYOF_BITMAP_SET(ret, value);
7333 }
c49a72a9
NC
7334 yesno = '!';
7335 what = "Digit";
73b437c8
JH
7336 break;
7337 case ANYOF_GRAPH:
7338 if (LOC)
936ed897 7339 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7340 else {
7341 for (value = 0; value < 256; value++)
7342 if (isGRAPH(value))
936ed897 7343 ANYOF_BITMAP_SET(ret, value);
73b437c8 7344 }
c49a72a9
NC
7345 yesno = '+';
7346 what = "Graph";
73b437c8
JH
7347 break;
7348 case ANYOF_NGRAPH:
7349 if (LOC)
936ed897 7350 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7351 else {
7352 for (value = 0; value < 256; value++)
7353 if (!isGRAPH(value))
936ed897 7354 ANYOF_BITMAP_SET(ret, value);
73b437c8 7355 }
c49a72a9
NC
7356 yesno = '!';
7357 what = "Graph";
73b437c8
JH
7358 break;
7359 case ANYOF_LOWER:
7360 if (LOC)
936ed897 7361 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7362 else {
7363 for (value = 0; value < 256; value++)
7364 if (isLOWER(value))
936ed897 7365 ANYOF_BITMAP_SET(ret, value);
73b437c8 7366 }
c49a72a9
NC
7367 yesno = '+';
7368 what = "Lower";
73b437c8
JH
7369 break;
7370 case ANYOF_NLOWER:
7371 if (LOC)
936ed897 7372 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7373 else {
7374 for (value = 0; value < 256; value++)
7375 if (!isLOWER(value))
936ed897 7376 ANYOF_BITMAP_SET(ret, value);
73b437c8 7377 }
c49a72a9
NC
7378 yesno = '!';
7379 what = "Lower";
73b437c8
JH
7380 break;
7381 case ANYOF_PRINT:
7382 if (LOC)
936ed897 7383 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7384 else {
7385 for (value = 0; value < 256; value++)
7386 if (isPRINT(value))
936ed897 7387 ANYOF_BITMAP_SET(ret, value);
73b437c8 7388 }
c49a72a9
NC
7389 yesno = '+';
7390 what = "Print";
73b437c8
JH
7391 break;
7392 case ANYOF_NPRINT:
7393 if (LOC)
936ed897 7394 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7395 else {
7396 for (value = 0; value < 256; value++)
7397 if (!isPRINT(value))
936ed897 7398 ANYOF_BITMAP_SET(ret, value);
73b437c8 7399 }
c49a72a9
NC
7400 yesno = '!';
7401 what = "Print";
73b437c8 7402 break;
aaa51d5e
JF
7403 case ANYOF_PSXSPC:
7404 if (LOC)
7405 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7406 else {
7407 for (value = 0; value < 256; value++)
7408 if (isPSXSPC(value))
7409 ANYOF_BITMAP_SET(ret, value);
7410 }
c49a72a9
NC
7411 yesno = '+';
7412 what = "Space";
aaa51d5e
JF
7413 break;
7414 case ANYOF_NPSXSPC:
7415 if (LOC)
7416 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7417 else {
7418 for (value = 0; value < 256; value++)
7419 if (!isPSXSPC(value))
7420 ANYOF_BITMAP_SET(ret, value);
7421 }
c49a72a9
NC
7422 yesno = '!';
7423 what = "Space";
aaa51d5e 7424 break;
73b437c8
JH
7425 case ANYOF_PUNCT:
7426 if (LOC)
936ed897 7427 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7428 else {
7429 for (value = 0; value < 256; value++)
7430 if (isPUNCT(value))
936ed897 7431 ANYOF_BITMAP_SET(ret, value);
73b437c8 7432 }
c49a72a9
NC
7433 yesno = '+';
7434 what = "Punct";
73b437c8
JH
7435 break;
7436 case ANYOF_NPUNCT:
7437 if (LOC)
936ed897 7438 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7439 else {
7440 for (value = 0; value < 256; value++)
7441 if (!isPUNCT(value))
936ed897 7442 ANYOF_BITMAP_SET(ret, value);
73b437c8 7443 }
c49a72a9
NC
7444 yesno = '!';
7445 what = "Punct";
ffc61ed2
JH
7446 break;
7447 case ANYOF_SPACE:
7448 if (LOC)
7449 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7450 else {
7451 for (value = 0; value < 256; value++)
7452 if (isSPACE(value))
7453 ANYOF_BITMAP_SET(ret, value);
7454 }
c49a72a9
NC
7455 yesno = '+';
7456 what = "SpacePerl";
ffc61ed2
JH
7457 break;
7458 case ANYOF_NSPACE:
7459 if (LOC)
7460 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7461 else {
7462 for (value = 0; value < 256; value++)
7463 if (!isSPACE(value))
7464 ANYOF_BITMAP_SET(ret, value);
7465 }
c49a72a9
NC
7466 yesno = '!';
7467 what = "SpacePerl";
73b437c8
JH
7468 break;
7469 case ANYOF_UPPER:
7470 if (LOC)
936ed897 7471 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7472 else {
7473 for (value = 0; value < 256; value++)
7474 if (isUPPER(value))
936ed897 7475 ANYOF_BITMAP_SET(ret, value);
73b437c8 7476 }
c49a72a9
NC
7477 yesno = '+';
7478 what = "Upper";
73b437c8
JH
7479 break;
7480 case ANYOF_NUPPER:
7481 if (LOC)
936ed897 7482 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7483 else {
7484 for (value = 0; value < 256; value++)
7485 if (!isUPPER(value))
936ed897 7486 ANYOF_BITMAP_SET(ret, value);
73b437c8 7487 }
c49a72a9
NC
7488 yesno = '!';
7489 what = "Upper";
73b437c8
JH
7490 break;
7491 case ANYOF_XDIGIT:
7492 if (LOC)
936ed897 7493 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7494 else {
7495 for (value = 0; value < 256; value++)
7496 if (isXDIGIT(value))
936ed897 7497 ANYOF_BITMAP_SET(ret, value);
73b437c8 7498 }
c49a72a9
NC
7499 yesno = '+';
7500 what = "XDigit";
73b437c8
JH
7501 break;
7502 case ANYOF_NXDIGIT:
7503 if (LOC)
936ed897 7504 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7505 else {
7506 for (value = 0; value < 256; value++)
7507 if (!isXDIGIT(value))
936ed897 7508 ANYOF_BITMAP_SET(ret, value);
73b437c8 7509 }
c49a72a9
NC
7510 yesno = '!';
7511 what = "XDigit";
73b437c8 7512 break;
f81125e2
JP
7513 case ANYOF_MAX:
7514 /* this is to handle \p and \P */
7515 break;
73b437c8 7516 default:
b45f050a 7517 vFAIL("Invalid [::] class");
73b437c8 7518 break;
b8c5462f 7519 }
c49a72a9
NC
7520 if (what) {
7521 /* Strings such as "+utf8::isWord\n" */
7522 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7523 }
b8c5462f 7524 if (LOC)
936ed897 7525 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7526 continue;
a0d0e21e 7527 }
ffc61ed2
JH
7528 } /* end of namedclass \blah */
7529
a0d0e21e 7530 if (range) {
eb160463 7531 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7532 const int w = RExC_parse - rangebegin;
7533 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7534 range = 0; /* not a valid range */
73b437c8 7535 }
a0d0e21e
LW
7536 }
7537 else {
3568d838 7538 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7539 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7540 RExC_parse[1] != ']') {
7541 RExC_parse++;
ffc61ed2
JH
7542
7543 /* a bad range like \w-, [:word:]- ? */
7544 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7545 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7546 const int w =
afd78fd5
JH
7547 RExC_parse >= rangebegin ?
7548 RExC_parse - rangebegin : 0;
830247a4 7549 vWARN4(RExC_parse,
b45f050a 7550 "False [] range \"%*.*s\"",
097eb12c 7551 w, w, rangebegin);
afd78fd5 7552 }
73b437c8 7553 if (!SIZE_ONLY)
936ed897 7554 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7555 } else
ffc61ed2
JH
7556 range = 1; /* yeah, it's a range! */
7557 continue; /* but do it the next time */
a0d0e21e 7558 }
a687059c 7559 }
ffc61ed2 7560
93733859 7561 /* now is the next time */
07be1b83 7562 /*stored += (value - prevvalue + 1);*/
ae5c130c 7563 if (!SIZE_ONLY) {
3568d838 7564 if (prevvalue < 256) {
1df70142 7565 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7566 IV i;
3568d838 7567#ifdef EBCDIC
1b2d223b
JH
7568 /* In EBCDIC [\x89-\x91] should include
7569 * the \x8e but [i-j] should not. */
7570 if (literal_endpoint == 2 &&
7571 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7572 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7573 {
3568d838
JH
7574 if (isLOWER(prevvalue)) {
7575 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7576 if (isLOWER(i))
7577 ANYOF_BITMAP_SET(ret, i);
7578 } else {
3568d838 7579 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7580 if (isUPPER(i))
7581 ANYOF_BITMAP_SET(ret, i);
7582 }
8ada0baa 7583 }
ffc61ed2 7584 else
8ada0baa 7585#endif
07be1b83
YO
7586 for (i = prevvalue; i <= ceilvalue; i++) {
7587 if (!ANYOF_BITMAP_TEST(ret,i)) {
7588 stored++;
7589 ANYOF_BITMAP_SET(ret, i);
7590 }
7591 }
3568d838 7592 }
a5961de5 7593 if (value > 255 || UTF) {
1df70142
AL
7594 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7595 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7596 stored+=2; /* can't optimize this class */
ffc61ed2 7597 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7598 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7599 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7600 prevnatvalue, natvalue);
7601 }
7602 else if (prevnatvalue == natvalue) {
7603 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7604 if (FOLD) {
89ebb4a3 7605 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7606 STRLEN foldlen;
1df70142 7607 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7608
e294cc5d
JH
7609#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7610 if (RExC_precomp[0] == ':' &&
7611 RExC_precomp[1] == '[' &&
7612 (f == 0xDF || f == 0x92)) {
7613 f = NATIVE_TO_UNI(f);
7614 }
7615#endif
c840d2a2
JH
7616 /* If folding and foldable and a single
7617 * character, insert also the folded version
7618 * to the charclass. */
9e55ce06 7619 if (f != value) {
e294cc5d
JH
7620#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7621 if ((RExC_precomp[0] == ':' &&
7622 RExC_precomp[1] == '[' &&
7623 (f == 0xA2 &&
7624 (value == 0xFB05 || value == 0xFB06))) ?
7625 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7626 foldlen == (STRLEN)UNISKIP(f) )
7627#else
eb160463 7628 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7629#endif
9e55ce06
JH
7630 Perl_sv_catpvf(aTHX_ listsv,
7631 "%04"UVxf"\n", f);
7632 else {
7633 /* Any multicharacter foldings
7634 * require the following transform:
7635 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7636 * where E folds into "pq" and F folds
7637 * into "rst", all other characters
7638 * fold to single characters. We save
7639 * away these multicharacter foldings,
7640 * to be later saved as part of the
7641 * additional "s" data. */
7642 SV *sv;
7643
7644 if (!unicode_alternate)
7645 unicode_alternate = newAV();
7646 sv = newSVpvn((char*)foldbuf, foldlen);
7647 SvUTF8_on(sv);
7648 av_push(unicode_alternate, sv);
7649 }
7650 }
254ba52a 7651
60a8b682
JH
7652 /* If folding and the value is one of the Greek
7653 * sigmas insert a few more sigmas to make the
7654 * folding rules of the sigmas to work right.
7655 * Note that not all the possible combinations
7656 * are handled here: some of them are handled
9e55ce06
JH
7657 * by the standard folding rules, and some of
7658 * them (literal or EXACTF cases) are handled
7659 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7660 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7661 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7662 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7663 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7664 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7665 }
7666 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7667 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7668 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7669 }
7670 }
ffc61ed2 7671 }
1b2d223b
JH
7672#ifdef EBCDIC
7673 literal_endpoint = 0;
7674#endif
8ada0baa 7675 }
ffc61ed2
JH
7676
7677 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7678 }
ffc61ed2 7679
936ed897 7680 if (need_class) {
4f66b38d 7681 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7682 if (SIZE_ONLY)
830247a4 7683 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7684 else
830247a4 7685 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7686 }
ffc61ed2 7687
7f6f358c
YO
7688
7689 if (SIZE_ONLY)
7690 return ret;
7691 /****** !SIZE_ONLY AFTER HERE *********/
7692
7693 if( stored == 1 && value < 256
7694 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7695 ) {
7696 /* optimize single char class to an EXACT node
7697 but *only* when its not a UTF/high char */
07be1b83
YO
7698 const char * cur_parse= RExC_parse;
7699 RExC_emit = (regnode *)orig_emit;
7700 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7701 ret = reg_node(pRExC_state,
7702 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7703 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7704 *STRING(ret)= (char)value;
7705 STR_LEN(ret)= 1;
7706 RExC_emit += STR_SZ(1);
7707 return ret;
7708 }
ae5c130c 7709 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7710 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7711 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7712 ) {
a0ed51b3 7713 for (value = 0; value < 256; ++value) {
936ed897 7714 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7715 UV fold = PL_fold[value];
ffc61ed2
JH
7716
7717 if (fold != value)
7718 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7719 }
7720 }
936ed897 7721 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7722 }
ffc61ed2 7723
ae5c130c 7724 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7725 if (optimize_invert &&
ffc61ed2
JH
7726 /* If the only flag is inversion. */
7727 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7728 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7729 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7730 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7731 }
7f6f358c 7732 {
097eb12c 7733 AV * const av = newAV();
ffc61ed2 7734 SV *rv;
9e55ce06 7735 /* The 0th element stores the character class description
6a0407ee 7736 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7737 * to initialize the appropriate swash (which gets stored in
7738 * the 1st element), and also useful for dumping the regnode.
7739 * The 2nd element stores the multicharacter foldings,
6a0407ee 7740 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7741 av_store(av, 0, listsv);
7742 av_store(av, 1, NULL);
9e55ce06 7743 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7744 rv = newRV_noinc((SV*)av);
19860706 7745 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7746 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7747 ARG_SET(ret, n);
a0ed51b3 7748 }
a0ed51b3
LW
7749 return ret;
7750}
7751
76e3520e 7752STATIC char*
830247a4 7753S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7754{
097eb12c 7755 char* const retval = RExC_parse++;
a0d0e21e 7756
4633a7c4 7757 for (;;) {
830247a4
IZ
7758 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7759 RExC_parse[2] == '#') {
e994fd66
AE
7760 while (*RExC_parse != ')') {
7761 if (RExC_parse == RExC_end)
7762 FAIL("Sequence (?#... not terminated");
830247a4 7763 RExC_parse++;
e994fd66 7764 }
830247a4 7765 RExC_parse++;
4633a7c4
LW
7766 continue;
7767 }
bbe252da 7768 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7769 if (isSPACE(*RExC_parse)) {
7770 RExC_parse++;
748a9306
LW
7771 continue;
7772 }
830247a4 7773 else if (*RExC_parse == '#') {
e994fd66
AE
7774 while (RExC_parse < RExC_end)
7775 if (*RExC_parse++ == '\n') break;
748a9306
LW
7776 continue;
7777 }
748a9306 7778 }
4633a7c4 7779 return retval;
a0d0e21e 7780 }
a687059c
LW
7781}
7782
7783/*
c277df42 7784- reg_node - emit a node
a0d0e21e 7785*/
76e3520e 7786STATIC regnode * /* Location. */
830247a4 7787S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7788{
97aff369 7789 dVAR;
c277df42 7790 register regnode *ptr;
504618e9 7791 regnode * const ret = RExC_emit;
07be1b83 7792 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7793
c277df42 7794 if (SIZE_ONLY) {
830247a4
IZ
7795 SIZE_ALIGN(RExC_size);
7796 RExC_size += 1;
a0d0e21e
LW
7797 return(ret);
7798 }
e2e6a0f1
YO
7799#ifdef DEBUGGING
7800 if (OP(RExC_emit) == 255)
7801 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7802 reg_name[op], OP(RExC_emit));
7803#endif
c277df42 7804 NODE_ALIGN_FILL(ret);
a0d0e21e 7805 ptr = ret;
c277df42 7806 FILL_ADVANCE_NODE(ptr, op);
fac92740 7807 if (RExC_offsets) { /* MJD */
07be1b83 7808 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7809 "reg_node", __LINE__,
7810 reg_name[op],
07be1b83
YO
7811 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7812 ? "Overwriting end of array!\n" : "OK",
7813 (UV)(RExC_emit - RExC_emit_start),
7814 (UV)(RExC_parse - RExC_start),
7815 (UV)RExC_offsets[0]));
ccb2c380 7816 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7817 }
07be1b83 7818
830247a4 7819 RExC_emit = ptr;
a0d0e21e 7820 return(ret);
a687059c
LW
7821}
7822
7823/*
a0d0e21e
LW
7824- reganode - emit a node with an argument
7825*/
76e3520e 7826STATIC regnode * /* Location. */
830247a4 7827S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7828{
97aff369 7829 dVAR;
c277df42 7830 register regnode *ptr;
504618e9 7831 regnode * const ret = RExC_emit;
07be1b83 7832 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7833
c277df42 7834 if (SIZE_ONLY) {
830247a4
IZ
7835 SIZE_ALIGN(RExC_size);
7836 RExC_size += 2;
6bda09f9
YO
7837 /*
7838 We can't do this:
7839
7840 assert(2==regarglen[op]+1);
7841
7842 Anything larger than this has to allocate the extra amount.
7843 If we changed this to be:
7844
7845 RExC_size += (1 + regarglen[op]);
7846
7847 then it wouldn't matter. Its not clear what side effect
7848 might come from that so its not done so far.
7849 -- dmq
7850 */
a0d0e21e
LW
7851 return(ret);
7852 }
e2e6a0f1
YO
7853#ifdef DEBUGGING
7854 if (OP(RExC_emit) == 255)
7855 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7856#endif
c277df42 7857 NODE_ALIGN_FILL(ret);
a0d0e21e 7858 ptr = ret;
c277df42 7859 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7860 if (RExC_offsets) { /* MJD */
07be1b83 7861 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7862 "reganode",
ccb2c380
MP
7863 __LINE__,
7864 reg_name[op],
07be1b83 7865 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7866 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7867 (UV)(RExC_emit - RExC_emit_start),
7868 (UV)(RExC_parse - RExC_start),
7869 (UV)RExC_offsets[0]));
ccb2c380 7870 Set_Cur_Node_Offset;
fac92740
MJD
7871 }
7872
830247a4 7873 RExC_emit = ptr;
a0d0e21e 7874 return(ret);
fe14fcc3
LW
7875}
7876
7877/*
cd439c50 7878- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7879*/
71207a34
AL
7880STATIC STRLEN
7881S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7882{
97aff369 7883 dVAR;
71207a34 7884 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7885}
7886
7887/*
a0d0e21e
LW
7888- reginsert - insert an operator in front of already-emitted operand
7889*
7890* Means relocating the operand.
7891*/
76e3520e 7892STATIC void
6bda09f9 7893S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7894{
97aff369 7895 dVAR;
c277df42
IZ
7896 register regnode *src;
7897 register regnode *dst;
7898 register regnode *place;
504618e9 7899 const int offset = regarglen[(U8)op];
6bda09f9 7900 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7901 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7902/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7903 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7904 if (SIZE_ONLY) {
6bda09f9 7905 RExC_size += size;
a0d0e21e
LW
7906 return;
7907 }
a687059c 7908
830247a4 7909 src = RExC_emit;
6bda09f9 7910 RExC_emit += size;
830247a4 7911 dst = RExC_emit;
40d049e4 7912 if (RExC_open_parens) {
6bda09f9 7913 int paren;
6d99fb9b 7914 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7915 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7916 if ( RExC_open_parens[paren] >= opnd ) {
7917 DEBUG_PARSE_FMT("open"," - %d",size);
7918 RExC_open_parens[paren] += size;
7919 } else {
7920 DEBUG_PARSE_FMT("open"," - %s","ok");
7921 }
7922 if ( RExC_close_parens[paren] >= opnd ) {
7923 DEBUG_PARSE_FMT("close"," - %d",size);
7924 RExC_close_parens[paren] += size;
7925 } else {
7926 DEBUG_PARSE_FMT("close"," - %s","ok");
7927 }
7928 }
6bda09f9 7929 }
40d049e4 7930
fac92740 7931 while (src > opnd) {
c277df42 7932 StructCopy(--src, --dst, regnode);
fac92740 7933 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7934 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7935 "reg_insert",
ccb2c380
MP
7936 __LINE__,
7937 reg_name[op],
07be1b83
YO
7938 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7939 ? "Overwriting end of array!\n" : "OK",
7940 (UV)(src - RExC_emit_start),
7941 (UV)(dst - RExC_emit_start),
7942 (UV)RExC_offsets[0]));
ccb2c380
MP
7943 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7944 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7945 }
7946 }
7947
a0d0e21e
LW
7948
7949 place = opnd; /* Op node, where operand used to be. */
fac92740 7950 if (RExC_offsets) { /* MJD */
07be1b83 7951 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7952 "reginsert",
ccb2c380
MP
7953 __LINE__,
7954 reg_name[op],
07be1b83 7955 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7956 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7957 (UV)(place - RExC_emit_start),
7958 (UV)(RExC_parse - RExC_start),
786e8c11 7959 (UV)RExC_offsets[0]));
ccb2c380 7960 Set_Node_Offset(place, RExC_parse);
45948336 7961 Set_Node_Length(place, 1);
fac92740 7962 }
c277df42
IZ
7963 src = NEXTOPER(place);
7964 FILL_ADVANCE_NODE(place, op);
7965 Zero(src, offset, regnode);
a687059c
LW
7966}
7967
7968/*
c277df42 7969- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7970- SEE ALSO: regtail_study
a0d0e21e 7971*/
097eb12c 7972/* TODO: All three parms should be const */
76e3520e 7973STATIC void
3dab1dad 7974S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7975{
97aff369 7976 dVAR;
c277df42 7977 register regnode *scan;
72f13be8 7978 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7979#ifndef DEBUGGING
7980 PERL_UNUSED_ARG(depth);
7981#endif
a0d0e21e 7982
c277df42 7983 if (SIZE_ONLY)
a0d0e21e
LW
7984 return;
7985
7986 /* Find last node. */
7987 scan = p;
7988 for (;;) {
504618e9 7989 regnode * const temp = regnext(scan);
3dab1dad
YO
7990 DEBUG_PARSE_r({
7991 SV * const mysv=sv_newmortal();
7992 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7993 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7994 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7995 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7996 (temp == NULL ? "->" : ""),
7997 (temp == NULL ? reg_name[OP(val)] : "")
7998 );
3dab1dad
YO
7999 });
8000 if (temp == NULL)
8001 break;
8002 scan = temp;
8003 }
8004
8005 if (reg_off_by_arg[OP(scan)]) {
8006 ARG_SET(scan, val - scan);
8007 }
8008 else {
8009 NEXT_OFF(scan) = val - scan;
8010 }
8011}
8012
07be1b83 8013#ifdef DEBUGGING
3dab1dad
YO
8014/*
8015- regtail_study - set the next-pointer at the end of a node chain of p to val.
8016- Look for optimizable sequences at the same time.
8017- currently only looks for EXACT chains.
07be1b83
YO
8018
8019This is expermental code. The idea is to use this routine to perform
8020in place optimizations on branches and groups as they are constructed,
8021with the long term intention of removing optimization from study_chunk so
8022that it is purely analytical.
8023
8024Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8025to control which is which.
8026
3dab1dad
YO
8027*/
8028/* TODO: All four parms should be const */
07be1b83 8029
3dab1dad
YO
8030STATIC U8
8031S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8032{
8033 dVAR;
8034 register regnode *scan;
07be1b83
YO
8035 U8 exact = PSEUDO;
8036#ifdef EXPERIMENTAL_INPLACESCAN
8037 I32 min = 0;
8038#endif
8039
3dab1dad
YO
8040 GET_RE_DEBUG_FLAGS_DECL;
8041
07be1b83 8042
3dab1dad
YO
8043 if (SIZE_ONLY)
8044 return exact;
8045
8046 /* Find last node. */
8047
8048 scan = p;
8049 for (;;) {
8050 regnode * const temp = regnext(scan);
07be1b83
YO
8051#ifdef EXPERIMENTAL_INPLACESCAN
8052 if (PL_regkind[OP(scan)] == EXACT)
8053 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8054 return EXACT;
8055#endif
3dab1dad
YO
8056 if ( exact ) {
8057 switch (OP(scan)) {
8058 case EXACT:
8059 case EXACTF:
8060 case EXACTFL:
8061 if( exact == PSEUDO )
8062 exact= OP(scan);
07be1b83
YO
8063 else if ( exact != OP(scan) )
8064 exact= 0;
3dab1dad
YO
8065 case NOTHING:
8066 break;
8067 default:
8068 exact= 0;
8069 }
8070 }
8071 DEBUG_PARSE_r({
8072 SV * const mysv=sv_newmortal();
8073 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8074 regprop(RExC_rx, mysv, scan);
eaf3ca90 8075 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8076 SvPV_nolen_const(mysv),
eaf3ca90
YO
8077 REG_NODE_NUM(scan),
8078 reg_name[exact]);
3dab1dad 8079 });
a0d0e21e
LW
8080 if (temp == NULL)
8081 break;
8082 scan = temp;
8083 }
07be1b83
YO
8084 DEBUG_PARSE_r({
8085 SV * const mysv_val=sv_newmortal();
8086 DEBUG_PARSE_MSG("");
8087 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8088 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8089 SvPV_nolen_const(mysv_val),
8090 (IV)REG_NODE_NUM(val),
8091 (IV)(val - scan)
07be1b83
YO
8092 );
8093 });
c277df42
IZ
8094 if (reg_off_by_arg[OP(scan)]) {
8095 ARG_SET(scan, val - scan);
a0ed51b3
LW
8096 }
8097 else {
c277df42
IZ
8098 NEXT_OFF(scan) = val - scan;
8099 }
3dab1dad
YO
8100
8101 return exact;
a687059c 8102}
07be1b83 8103#endif
a687059c
LW
8104
8105/*
a687059c
LW
8106 - regcurly - a little FSA that accepts {\d+,?\d*}
8107 */
79072805 8108STATIC I32
5f66b61c 8109S_regcurly(register const char *s)
a687059c
LW
8110{
8111 if (*s++ != '{')
8112 return FALSE;
f0fcb552 8113 if (!isDIGIT(*s))
a687059c 8114 return FALSE;
f0fcb552 8115 while (isDIGIT(*s))
a687059c
LW
8116 s++;
8117 if (*s == ',')
8118 s++;
f0fcb552 8119 while (isDIGIT(*s))
a687059c
LW
8120 s++;
8121 if (*s != '}')
8122 return FALSE;
8123 return TRUE;
8124}
8125
a687059c
LW
8126
8127/*
fd181c75 8128 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8129 */
8130void
097eb12c 8131Perl_regdump(pTHX_ const regexp *r)
a687059c 8132{
35ff7856 8133#ifdef DEBUGGING
97aff369 8134 dVAR;
c445ea15 8135 SV * const sv = sv_newmortal();
ab3bbdeb 8136 SV *dsv= sv_newmortal();
f8fc2ecf 8137 RXi_GET_DECL(r,ri);
a687059c 8138
f8fc2ecf 8139 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8140
8141 /* Header fields of interest. */
ab3bbdeb
YO
8142 if (r->anchored_substr) {
8143 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8144 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8145 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8146 "anchored %s%s at %"IVdf" ",
8147 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8148 (IV)r->anchored_offset);
ab3bbdeb
YO
8149 } else if (r->anchored_utf8) {
8150 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8151 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8152 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8153 "anchored utf8 %s%s at %"IVdf" ",
8154 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8155 (IV)r->anchored_offset);
ab3bbdeb
YO
8156 }
8157 if (r->float_substr) {
8158 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8159 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8160 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8161 "floating %s%s at %"IVdf"..%"UVuf" ",
8162 s, RE_SV_TAIL(r->float_substr),
7b0972df 8163 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8164 } else if (r->float_utf8) {
8165 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8166 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8167 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8168 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8169 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8170 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8171 }
33b8afdf 8172 if (r->check_substr || r->check_utf8)
b81d288d 8173 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8174 (const char *)
8175 (r->check_substr == r->float_substr
8176 && r->check_utf8 == r->float_utf8
8177 ? "(checking floating" : "(checking anchored"));
bbe252da 8178 if (r->extflags & RXf_NOSCAN)
c277df42 8179 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8180 if (r->extflags & RXf_CHECK_ALL)
c277df42 8181 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8182 if (r->check_substr || r->check_utf8)
c277df42
IZ
8183 PerlIO_printf(Perl_debug_log, ") ");
8184
f8fc2ecf
YO
8185 if (ri->regstclass) {
8186 regprop(r, sv, ri->regstclass);
1de06328 8187 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8188 }
bbe252da 8189 if (r->extflags & RXf_ANCH) {
774d564b 8190 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8191 if (r->extflags & RXf_ANCH_BOL)
774d564b 8192 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8193 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8194 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8195 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8196 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8197 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8198 PerlIO_printf(Perl_debug_log, "(GPOS)");
8199 PerlIO_putc(Perl_debug_log, ' ');
8200 }
bbe252da 8201 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8202 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8203 if (r->intflags & PREGf_SKIP)
760ac839 8204 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8205 if (r->intflags & PREGf_IMPLICIT)
760ac839 8206 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8207 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8208 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8209 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8210 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8211#else
96a5add6 8212 PERL_UNUSED_CONTEXT;
65e66c80 8213 PERL_UNUSED_ARG(r);
17c3b450 8214#endif /* DEBUGGING */
a687059c
LW
8215}
8216
8217/*
a0d0e21e
LW
8218- regprop - printable representation of opcode
8219*/
46fc3d4c 8220void
32fc9b6a 8221Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8222{
35ff7856 8223#ifdef DEBUGGING
97aff369 8224 dVAR;
9b155405 8225 register int k;
f8fc2ecf 8226 RXi_GET_DECL(prog,progi);
1de06328 8227 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8228
a0d0e21e 8229
54dc92de 8230 sv_setpvn(sv, "", 0);
8aa23a47 8231
03363afd 8232 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8233 /* It would be nice to FAIL() here, but this may be called from
8234 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8235 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8236 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8237
3dab1dad 8238 k = PL_regkind[OP(o)];
9b155405 8239
2a782b5b 8240 if (k == EXACT) {
396482e1 8241 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8242 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8243 * is a crude hack but it may be the best for now since
8244 * we have no flag "this EXACTish node was UTF-8"
8245 * --jhi */
8246 const char * const s =
ddc5bc0f 8247 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8248 PL_colors[0], PL_colors[1],
8249 PERL_PV_ESCAPE_UNI_DETECT |
8250 PERL_PV_PRETTY_ELIPSES |
8251 PERL_PV_PRETTY_LTGT
8252 );
8253 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8254 } else if (k == TRIE) {
3dab1dad 8255 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8256 * progi->data isn't available here */
1de06328
YO
8257 const char op = OP(o);
8258 const I32 n = ARG(o);
8259 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8260 (reg_ac_data *)progi->data->data[n] :
1de06328 8261 NULL;
3251b653
NC
8262 const reg_trie_data * const trie
8263 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8264
8265 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8266 DEBUG_TRIE_COMPILE_r(
8267 Perl_sv_catpvf(aTHX_ sv,
8268 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8269 (UV)trie->startstate,
1e2e3d02 8270 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8271 (UV)trie->wordcount,
8272 (UV)trie->minlen,
8273 (UV)trie->maxlen,
8274 (UV)TRIE_CHARCOUNT(trie),
8275 (UV)trie->uniquecharcount
8276 )
8277 );
8278 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8279 int i;
8280 int rangestart = -1;
f46cb337 8281 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8282 Perl_sv_catpvf(aTHX_ sv, "[");
8283 for (i = 0; i <= 256; i++) {
8284 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8285 if (rangestart == -1)
8286 rangestart = i;
8287 } else if (rangestart != -1) {
8288 if (i <= rangestart + 3)
8289 for (; rangestart < i; rangestart++)
8290 put_byte(sv, rangestart);
8291 else {
8292 put_byte(sv, rangestart);
8293 sv_catpvs(sv, "-");
8294 put_byte(sv, i - 1);
8295 }
8296 rangestart = -1;
8297 }
8298 }
8299 Perl_sv_catpvf(aTHX_ sv, "]");
8300 }
8301
a3621e74 8302 } else if (k == CURLY) {
cb434fcc 8303 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8304 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8305 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8306 }
2c2d71f5
JH
8307 else if (k == WHILEM && o->flags) /* Ordinal/of */
8308 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8309 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8310 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8311 else if (k == GOSUB)
6bda09f9 8312 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8313 else if (k == VERB) {
8314 if (!o->flags)
8315 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8316 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8317 } else if (k == LOGICAL)
04ebc1ab 8318 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8319 else if (k == ANYOF) {
8320 int i, rangestart = -1;
2d03de9c 8321 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8322
8323 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8324 static const char * const anyofs[] = {
653099ff
GS
8325 "\\w",
8326 "\\W",
8327 "\\s",
8328 "\\S",
8329 "\\d",
8330 "\\D",
8331 "[:alnum:]",
8332 "[:^alnum:]",
8333 "[:alpha:]",
8334 "[:^alpha:]",
8335 "[:ascii:]",
8336 "[:^ascii:]",
8337 "[:ctrl:]",
8338 "[:^ctrl:]",
8339 "[:graph:]",
8340 "[:^graph:]",
8341 "[:lower:]",
8342 "[:^lower:]",
8343 "[:print:]",
8344 "[:^print:]",
8345 "[:punct:]",
8346 "[:^punct:]",
8347 "[:upper:]",
aaa51d5e 8348 "[:^upper:]",
653099ff 8349 "[:xdigit:]",
aaa51d5e
JF
8350 "[:^xdigit:]",
8351 "[:space:]",
8352 "[:^space:]",
8353 "[:blank:]",
8354 "[:^blank:]"
653099ff
GS
8355 };
8356
19860706 8357 if (flags & ANYOF_LOCALE)
396482e1 8358 sv_catpvs(sv, "{loc}");
19860706 8359 if (flags & ANYOF_FOLD)
396482e1 8360 sv_catpvs(sv, "{i}");
653099ff 8361 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8362 if (flags & ANYOF_INVERT)
396482e1 8363 sv_catpvs(sv, "^");
ffc61ed2
JH
8364 for (i = 0; i <= 256; i++) {
8365 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8366 if (rangestart == -1)
8367 rangestart = i;
8368 } else if (rangestart != -1) {
8369 if (i <= rangestart + 3)
8370 for (; rangestart < i; rangestart++)
653099ff 8371 put_byte(sv, rangestart);
ffc61ed2
JH
8372 else {
8373 put_byte(sv, rangestart);
396482e1 8374 sv_catpvs(sv, "-");
ffc61ed2 8375 put_byte(sv, i - 1);
653099ff 8376 }
ffc61ed2 8377 rangestart = -1;
653099ff 8378 }
847a199f 8379 }
ffc61ed2
JH
8380
8381 if (o->flags & ANYOF_CLASS)
bb7a0f54 8382 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8383 if (ANYOF_CLASS_TEST(o,i))
8384 sv_catpv(sv, anyofs[i]);
8385
8386 if (flags & ANYOF_UNICODE)
396482e1 8387 sv_catpvs(sv, "{unicode}");
1aa99e6b 8388 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8389 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8390
8391 {
8392 SV *lv;
32fc9b6a 8393 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8394
ffc61ed2
JH
8395 if (lv) {
8396 if (sw) {
89ebb4a3 8397 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8398
ffc61ed2 8399 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8400 uvchr_to_utf8(s, i);
ffc61ed2 8401
3568d838 8402 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8403 if (rangestart == -1)
8404 rangestart = i;
8405 } else if (rangestart != -1) {
ffc61ed2
JH
8406 if (i <= rangestart + 3)
8407 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8408 const U8 * const e = uvchr_to_utf8(s,rangestart);
8409 U8 *p;
8410 for(p = s; p < e; p++)
ffc61ed2
JH
8411 put_byte(sv, *p);
8412 }
8413 else {
2d03de9c
AL
8414 const U8 *e = uvchr_to_utf8(s,rangestart);
8415 U8 *p;
8416 for (p = s; p < e; p++)
ffc61ed2 8417 put_byte(sv, *p);
396482e1 8418 sv_catpvs(sv, "-");
2d03de9c
AL
8419 e = uvchr_to_utf8(s, i-1);
8420 for (p = s; p < e; p++)
1df70142 8421 put_byte(sv, *p);
ffc61ed2
JH
8422 }
8423 rangestart = -1;
8424 }
19860706 8425 }
ffc61ed2 8426
396482e1 8427 sv_catpvs(sv, "..."); /* et cetera */
19860706 8428 }
fde631ed 8429
ffc61ed2 8430 {
2e0de35c 8431 char *s = savesvpv(lv);
c445ea15 8432 char * const origs = s;
b81d288d 8433
3dab1dad
YO
8434 while (*s && *s != '\n')
8435 s++;
b81d288d 8436
ffc61ed2 8437 if (*s == '\n') {
2d03de9c 8438 const char * const t = ++s;
ffc61ed2
JH
8439
8440 while (*s) {
8441 if (*s == '\n')
8442 *s = ' ';
8443 s++;
8444 }
8445 if (s[-1] == ' ')
8446 s[-1] = 0;
8447
8448 sv_catpv(sv, t);
fde631ed 8449 }
b81d288d 8450
ffc61ed2 8451 Safefree(origs);
fde631ed
JH
8452 }
8453 }
653099ff 8454 }
ffc61ed2 8455
653099ff
GS
8456 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8457 }
9b155405 8458 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8459 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8460#else
96a5add6 8461 PERL_UNUSED_CONTEXT;
65e66c80
SP
8462 PERL_UNUSED_ARG(sv);
8463 PERL_UNUSED_ARG(o);
f9049ba1 8464 PERL_UNUSED_ARG(prog);
17c3b450 8465#endif /* DEBUGGING */
35ff7856 8466}
a687059c 8467
cad2e5aa
JH
8468SV *
8469Perl_re_intuit_string(pTHX_ regexp *prog)
8470{ /* Assume that RE_INTUIT is set */
97aff369 8471 dVAR;
a3621e74 8472 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8473 PERL_UNUSED_CONTEXT;
8474
a3621e74 8475 DEBUG_COMPILE_r(
cfd0369c 8476 {
2d03de9c 8477 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8478 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8479
8480 if (!PL_colorset) reginitcolors();
8481 PerlIO_printf(Perl_debug_log,
a0288114 8482 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8483 PL_colors[4],
8484 prog->check_substr ? "" : "utf8 ",
8485 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8486 s,
8487 PL_colors[1],
8488 (strlen(s) > 60 ? "..." : ""));
8489 } );
8490
33b8afdf 8491 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8492}
8493
84da74a7 8494/*
f8149455 8495 pregfree()
84da74a7 8496
f8149455
YO
8497 handles refcounting and freeing the perl core regexp structure. When
8498 it is necessary to actually free the structure the first thing it
8499 does is call the 'free' method of the regexp_engine associated to to
8500 the regexp, allowing the handling of the void *pprivate; member
8501 first. (This routine is not overridable by extensions, which is why
8502 the extensions free is called first.)
8503
8504 See regdupe and regdupe_internal if you change anything here.
84da74a7 8505*/
f8149455 8506#ifndef PERL_IN_XSUB_RE
2b69d0c2 8507void
864dbfa3 8508Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8509{
27da23d5 8510 dVAR;
fc32ee4a 8511 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8512
7821416a
IZ
8513 if (!r || (--r->refcnt > 0))
8514 return;
f8149455
YO
8515
8516 CALLREGFREE_PVT(r); /* free the private data */
8517
43c5f42d
NC
8518 /* gcov results gave these as non-null 100% of the time, so there's no
8519 optimisation in checking them before calling Safefree */
8520 Safefree(r->precomp);
ed252734 8521 RX_MATCH_COPY_FREE(r);
f8c7b90f 8522#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8523 if (r->saved_copy)
8524 SvREFCNT_dec(r->saved_copy);
8525#endif
a193d654
GS
8526 if (r->substrs) {
8527 if (r->anchored_substr)
8528 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8529 if (r->anchored_utf8)
8530 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8531 if (r->float_substr)
8532 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8533 if (r->float_utf8)
8534 SvREFCNT_dec(r->float_utf8);
2779dcf1 8535 Safefree(r->substrs);
a193d654 8536 }
81714fb9
YO
8537 if (r->paren_names)
8538 SvREFCNT_dec(r->paren_names);
f8149455
YO
8539
8540 Safefree(r->startp);
8541 Safefree(r->endp);
8542 Safefree(r);
8543}
8544#endif
8545
8546/* regfree_internal()
8547
8548 Free the private data in a regexp. This is overloadable by
8549 extensions. Perl takes care of the regexp structure in pregfree(),
8550 this covers the *pprivate pointer which technically perldoesnt
8551 know about, however of course we have to handle the
8552 regexp_internal structure when no extension is in use.
8553
8554 Note this is called before freeing anything in the regexp
8555 structure.
8556 */
8557
8558void
8559Perl_regfree_internal(pTHX_ struct regexp *r)
8560{
8561 dVAR;
8562 RXi_GET_DECL(r,ri);
8563 GET_RE_DEBUG_FLAGS_DECL;
8564
8565 DEBUG_COMPILE_r({
8566 if (!PL_colorset)
8567 reginitcolors();
8568 {
8569 SV *dsv= sv_newmortal();
8570 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8571 dsv, r->precomp, r->prelen, 60);
8572 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8573 PL_colors[4],PL_colors[5],s);
8574 }
8575 });
8576
8577 Safefree(ri->offsets); /* 20010421 MJD */
f8fc2ecf
YO
8578 if (ri->data) {
8579 int n = ri->data->count;
f3548bdc
DM
8580 PAD* new_comppad = NULL;
8581 PAD* old_comppad;
4026c95a 8582 PADOFFSET refcnt;
dfad63ad 8583
c277df42 8584 while (--n >= 0) {
261faec3 8585 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8586 switch (ri->data->what[n]) {
c277df42 8587 case 's':
81714fb9 8588 case 'S':
55eed653 8589 case 'u':
f8fc2ecf 8590 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8591 break;
653099ff 8592 case 'f':
f8fc2ecf 8593 Safefree(ri->data->data[n]);
653099ff 8594 break;
dfad63ad 8595 case 'p':
f8fc2ecf 8596 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8597 break;
c277df42 8598 case 'o':
dfad63ad 8599 if (new_comppad == NULL)
cea2e8a9 8600 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8601 PAD_SAVE_LOCAL(old_comppad,
8602 /* Watch out for global destruction's random ordering. */
c445ea15 8603 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8604 );
b34c0dd4 8605 OP_REFCNT_LOCK;
f8fc2ecf 8606 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8607 OP_REFCNT_UNLOCK;
8608 if (!refcnt)
f8fc2ecf 8609 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8610
f3548bdc 8611 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8612 SvREFCNT_dec((SV*)new_comppad);
8613 new_comppad = NULL;
c277df42
IZ
8614 break;
8615 case 'n':
9e55ce06 8616 break;
07be1b83 8617 case 'T':
be8e71aa
YO
8618 { /* Aho Corasick add-on structure for a trie node.
8619 Used in stclass optimization only */
07be1b83 8620 U32 refcount;
f8fc2ecf 8621 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8622 OP_REFCNT_LOCK;
8623 refcount = --aho->refcount;
8624 OP_REFCNT_UNLOCK;
8625 if ( !refcount ) {
446bd890
NC
8626 PerlMemShared_free(aho->states);
8627 PerlMemShared_free(aho->fail);
446bd890
NC
8628 /* do this last!!!! */
8629 PerlMemShared_free(ri->data->data[n]);
8630 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8631 }
8632 }
8633 break;
a3621e74 8634 case 't':
07be1b83 8635 {
be8e71aa 8636 /* trie structure. */
07be1b83 8637 U32 refcount;
f8fc2ecf 8638 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8639 OP_REFCNT_LOCK;
8640 refcount = --trie->refcount;
8641 OP_REFCNT_UNLOCK;
8642 if ( !refcount ) {
446bd890 8643 PerlMemShared_free(trie->charmap);
446bd890
NC
8644 PerlMemShared_free(trie->states);
8645 PerlMemShared_free(trie->trans);
07be1b83 8646 if (trie->bitmap)
446bd890 8647 PerlMemShared_free(trie->bitmap);
07be1b83 8648 if (trie->wordlen)
446bd890 8649 PerlMemShared_free(trie->wordlen);
786e8c11 8650 if (trie->jump)
446bd890 8651 PerlMemShared_free(trie->jump);
786e8c11 8652 if (trie->nextword)
446bd890 8653 PerlMemShared_free(trie->nextword);
446bd890
NC
8654 /* do this last!!!! */
8655 PerlMemShared_free(ri->data->data[n]);
a3621e74 8656 }
07be1b83
YO
8657 }
8658 break;
c277df42 8659 default:
f8fc2ecf 8660 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8661 }
8662 }
f8fc2ecf
YO
8663 Safefree(ri->data->what);
8664 Safefree(ri->data);
a0d0e21e 8665 }
f8fc2ecf
YO
8666 if (ri->swap) {
8667 Safefree(ri->swap->startp);
8668 Safefree(ri->swap->endp);
8669 Safefree(ri->swap);
c74340f9 8670 }
f8fc2ecf 8671 Safefree(ri);
a687059c 8672}
c277df42 8673
84da74a7
YO
8674#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8675#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8676#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8677#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8678
8679/*
8680 regdupe - duplicate a regexp.
8681
8682 This routine is called by sv.c's re_dup and is expected to clone a
8683 given regexp structure. It is a no-op when not under USE_ITHREADS.
8684 (Originally this *was* re_dup() for change history see sv.c)
8685
f8149455
YO
8686 After all of the core data stored in struct regexp is duplicated
8687 the regexp_engine.dupe method is used to copy any private data
8688 stored in the *pprivate pointer. This allows extensions to handle
8689 any duplication it needs to do.
8690
8691 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8692*/
a3c0e9ca 8693#if defined(USE_ITHREADS)
f8149455 8694#ifndef PERL_IN_XSUB_RE
84da74a7 8695regexp *
f8149455 8696Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8697{
84da74a7 8698 dVAR;
f8fc2ecf 8699 regexp *ret;
f8149455 8700 int i, npar;
84da74a7 8701 struct reg_substr_datum *s;
644c02aa 8702
84da74a7
YO
8703 if (!r)
8704 return (REGEXP *)NULL;
8705
8706 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8707 return ret;
8708
f8149455 8709
84da74a7 8710 npar = r->nparens+1;
f8fc2ecf 8711 Newxz(ret, 1, regexp);
84da74a7
YO
8712 Newx(ret->startp, npar, I32);
8713 Copy(r->startp, ret->startp, npar, I32);
8714 Newx(ret->endp, npar, I32);
f8149455 8715 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8716
c945c181 8717 if (r->substrs) {
785a26d5
YO
8718 Newx(ret->substrs, 1, struct reg_substr_data);
8719 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8720 s->min_offset = r->substrs->data[i].min_offset;
8721 s->max_offset = r->substrs->data[i].max_offset;
8722 s->end_shift = r->substrs->data[i].end_shift;
8723 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8724 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8725 }
8726 } else
8727 ret->substrs = NULL;
f8149455
YO
8728
8729 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8730 ret->refcnt = r->refcnt;
8731 ret->minlen = r->minlen;
8732 ret->minlenret = r->minlenret;
8733 ret->prelen = r->prelen;
8734 ret->nparens = r->nparens;
8735 ret->lastparen = r->lastparen;
8736 ret->lastcloseparen = r->lastcloseparen;
8737 ret->intflags = r->intflags;
8738 ret->extflags = r->extflags;
8739
8740 ret->sublen = r->sublen;
8741
8742 ret->engine = r->engine;
8743
8744 ret->paren_names = hv_dup_inc(r->paren_names, param);
8745
8746 if (RX_MATCH_COPIED(ret))
8747 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8748 else
8749 ret->subbeg = NULL;
8750#ifdef PERL_OLD_COPY_ON_WRITE
8751 ret->saved_copy = NULL;
8752#endif
8753
8754 ret->pprivate = r->pprivate;
785a26d5
YO
8755 if (ret->pprivate)
8756 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8757
8758 ptr_table_store(PL_ptr_table, r, ret);
8759 return ret;
8760}
8761#endif /* PERL_IN_XSUB_RE */
8762
8763/*
8764 regdupe_internal()
8765
8766 This is the internal complement to regdupe() which is used to copy
8767 the structure pointed to by the *pprivate pointer in the regexp.
8768 This is the core version of the extension overridable cloning hook.
8769 The regexp structure being duplicated will be copied by perl prior
8770 to this and will be provided as the regexp *r argument, however
8771 with the /old/ structures pprivate pointer value. Thus this routine
8772 may override any copying normally done by perl.
8773
8774 It returns a pointer to the new regexp_internal structure.
8775*/
8776
8777void *
8778Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8779{
8780 dVAR;
8781 regexp_internal *reti;
8782 int len, npar;
8783 RXi_GET_DECL(r,ri);
8784
8785 npar = r->nparens+1;
8786 len = ri->offsets[0];
8787
8788 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8789 Copy(ri->program, reti->program, len+1, regnode);
8790
8791 if(ri->swap) {
8792 Newx(reti->swap, 1, regexp_paren_ofs);
8793 /* no need to copy these */
8794 Newx(reti->swap->startp, npar, I32);
8795 Newx(reti->swap->endp, npar, I32);
8796 } else {
8797 reti->swap = NULL;
8798 }
8799
84da74a7 8800
f8fc2ecf
YO
8801 reti->regstclass = NULL;
8802 if (ri->data) {
84da74a7 8803 struct reg_data *d;
f8fc2ecf 8804 const int count = ri->data->count;
84da74a7
YO
8805 int i;
8806
8807 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8808 char, struct reg_data);
8809 Newx(d->what, count, U8);
8810
8811 d->count = count;
8812 for (i = 0; i < count; i++) {
f8fc2ecf 8813 d->what[i] = ri->data->what[i];
84da74a7 8814 switch (d->what[i]) {
55eed653 8815 /* legal options are one of: sSfpontTu
84da74a7
YO
8816 see also regcomp.h and pregfree() */
8817 case 's':
81714fb9 8818 case 'S':
0536c0a7 8819 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8820 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8821 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8822 break;
84da74a7
YO
8823 case 'f':
8824 /* This is cheating. */
8825 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8826 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8827 struct regnode_charclass_class);
f8fc2ecf 8828 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8829 break;
8830 case 'o':
bbe252da
YO
8831 /* Compiled op trees are readonly and in shared memory,
8832 and can thus be shared without duplication. */
84da74a7 8833 OP_REFCNT_LOCK;
f8fc2ecf 8834 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8835 OP_REFCNT_UNLOCK;
8836 break;
23eab42c
NC
8837 case 'T':
8838 /* Trie stclasses are readonly and can thus be shared
8839 * without duplication. We free the stclass in pregfree
8840 * when the corresponding reg_ac_data struct is freed.
8841 */
8842 reti->regstclass= ri->regstclass;
8843 /* Fall through */
84da74a7 8844 case 't':
84da74a7 8845 OP_REFCNT_LOCK;
0536c0a7 8846 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8847 OP_REFCNT_UNLOCK;
0536c0a7
NC
8848 /* Fall through */
8849 case 'n':
8850 d->data[i] = ri->data->data[i];
84da74a7 8851 break;
84da74a7 8852 default:
f8fc2ecf 8853 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8854 }
8855 }
8856
f8fc2ecf 8857 reti->data = d;
84da74a7
YO
8858 }
8859 else
f8fc2ecf 8860 reti->data = NULL;
84da74a7 8861
f8fc2ecf
YO
8862 Newx(reti->offsets, 2*len+1, U32);
8863 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
81714fb9 8864
f8149455 8865 return (void*)reti;
84da74a7 8866}
f8149455
YO
8867
8868#endif /* USE_ITHREADS */
84da74a7 8869
de8c5301
YO
8870/*
8871 reg_stringify()
8872
8873 converts a regexp embedded in a MAGIC struct to its stringified form,
8874 caching the converted form in the struct and returns the cached
8875 string.
8876
8877 If lp is nonnull then it is used to return the length of the
8878 resulting string
8879
8880 If flags is nonnull and the returned string contains UTF8 then
f8149455 8881 (*flags & 1) will be true.
de8c5301
YO
8882
8883 If haseval is nonnull then it is used to return whether the pattern
8884 contains evals.
8885
8886 Normally called via macro:
8887
f8149455 8888 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8889
8890 And internally with
8891
f8149455 8892 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8893
8894 See sv_2pv_flags() in sv.c for an example of internal usage.
8895
8896 */
f8149455 8897#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8898char *
8899Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8900 dVAR;
8901 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8902
de8c5301
YO
8903 if (!mg->mg_ptr) {
8904 const char *fptr = "msix";
8905 char reflags[6];
8906 char ch;
8907 int left = 0;
8908 int right = 4;
8909 bool need_newline = 0;
bbe252da 8910 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8911
8912 while((ch = *fptr++)) {
8913 if(reganch & 1) {
8914 reflags[left++] = ch;
8915 }
8916 else {
8917 reflags[right--] = ch;
8918 }
8919 reganch >>= 1;
8920 }
8921 if(left != 4) {
8922 reflags[left] = '-';
8923 left = 5;
8924 }
8925
8926 mg->mg_len = re->prelen + 4 + left;
8927 /*
8928 * If /x was used, we have to worry about a regex ending with a
8929 * comment later being embedded within another regex. If so, we don't
8930 * want this regex's "commentization" to leak out to the right part of
8931 * the enclosing regex, we must cap it with a newline.
8932 *
8933 * So, if /x was used, we scan backwards from the end of the regex. If
8934 * we find a '#' before we find a newline, we need to add a newline
8935 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8936 * we don't need to add anything. -jfriedl
8937 */
bbe252da 8938 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8939 const char *endptr = re->precomp + re->prelen;
8940 while (endptr >= re->precomp) {
8941 const char c = *(endptr--);
8942 if (c == '\n')
8943 break; /* don't need another */
8944 if (c == '#') {
8945 /* we end while in a comment, so we need a newline */
8946 mg->mg_len++; /* save space for it */
8947 need_newline = 1; /* note to add it */
8948 break;
8949 }
8950 }
8951 }
8952
8953 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8954 mg->mg_ptr[0] = '(';
8955 mg->mg_ptr[1] = '?';
8956 Copy(reflags, mg->mg_ptr+2, left, char);
8957 *(mg->mg_ptr+left+2) = ':';
8958 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8959 if (need_newline)
8960 mg->mg_ptr[mg->mg_len - 2] = '\n';
8961 mg->mg_ptr[mg->mg_len - 1] = ')';
8962 mg->mg_ptr[mg->mg_len] = 0;
8963 }
8964 if (haseval)
f8149455 8965 *haseval = re->seen_evals;
de8c5301 8966 if (flags)
bbe252da 8967 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8968
8969 if (lp)
8970 *lp = mg->mg_len;
8971 return mg->mg_ptr;
8972}
8973
c277df42
IZ
8974/*
8975 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8976 */
8977regnode *
864dbfa3 8978Perl_regnext(pTHX_ register regnode *p)
c277df42 8979{
97aff369 8980 dVAR;
c277df42
IZ
8981 register I32 offset;
8982
f8fc2ecf 8983 if (!p)
c277df42
IZ
8984 return(NULL);
8985
8986 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8987 if (offset == 0)
8988 return(NULL);
8989
c277df42 8990 return(p+offset);
c277df42 8991}
76234dfb 8992#endif
c277df42 8993
01f988be 8994STATIC void
cea2e8a9 8995S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8996{
8997 va_list args;
8998 STRLEN l1 = strlen(pat1);
8999 STRLEN l2 = strlen(pat2);
9000 char buf[512];
06bf62c7 9001 SV *msv;
73d840c0 9002 const char *message;
c277df42
IZ
9003
9004 if (l1 > 510)
9005 l1 = 510;
9006 if (l1 + l2 > 510)
9007 l2 = 510 - l1;
9008 Copy(pat1, buf, l1 , char);
9009 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9010 buf[l1 + l2] = '\n';
9011 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9012#ifdef I_STDARG
9013 /* ANSI variant takes additional second argument */
c277df42 9014 va_start(args, pat2);
8736538c
AS
9015#else
9016 va_start(args);
9017#endif
5a844595 9018 msv = vmess(buf, &args);
c277df42 9019 va_end(args);
cfd0369c 9020 message = SvPV_const(msv,l1);
c277df42
IZ
9021 if (l1 > 512)
9022 l1 = 512;
9023 Copy(message, buf, l1 , char);
197cf9b9 9024 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9025 Perl_croak(aTHX_ "%s", buf);
c277df42 9026}
a0ed51b3
LW
9027
9028/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9029
76234dfb 9030#ifndef PERL_IN_XSUB_RE
a0ed51b3 9031void
864dbfa3 9032Perl_save_re_context(pTHX)
b81d288d 9033{
97aff369 9034 dVAR;
1ade1aa1
NC
9035
9036 struct re_save_state *state;
9037
9038 SAVEVPTR(PL_curcop);
9039 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9040
9041 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9042 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9043 SSPUSHINT(SAVEt_RE_STATE);
9044
46ab3289 9045 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9046
a0ed51b3 9047 PL_reg_start_tmp = 0;
a0ed51b3 9048 PL_reg_start_tmpl = 0;
c445ea15 9049 PL_reg_oldsaved = NULL;
a5db57d6 9050 PL_reg_oldsavedlen = 0;
a5db57d6 9051 PL_reg_maxiter = 0;
a5db57d6 9052 PL_reg_leftiter = 0;
c445ea15 9053 PL_reg_poscache = NULL;
a5db57d6 9054 PL_reg_poscache_size = 0;
1ade1aa1
NC
9055#ifdef PERL_OLD_COPY_ON_WRITE
9056 PL_nrs = NULL;
9057#endif
ada6e8a9 9058
c445ea15
AL
9059 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9060 if (PL_curpm) {
9061 const REGEXP * const rx = PM_GETRE(PL_curpm);
9062 if (rx) {
1df70142 9063 U32 i;
ada6e8a9 9064 for (i = 1; i <= rx->nparens; i++) {
1df70142 9065 char digits[TYPE_CHARS(long)];
d9fad198 9066 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9067 GV *const *const gvp
9068 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9069
b37c2d43
AL
9070 if (gvp) {
9071 GV * const gv = *gvp;
9072 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9073 save_scalar(gv);
49f27e4b 9074 }
ada6e8a9
AMS
9075 }
9076 }
9077 }
a0ed51b3 9078}
76234dfb 9079#endif
51371543 9080
51371543 9081static void
acfe0abc 9082clear_re(pTHX_ void *r)
51371543 9083{
97aff369 9084 dVAR;
51371543
GS
9085 ReREFCNT_dec((regexp *)r);
9086}
ffbc6a93 9087
a28509cc
AL
9088#ifdef DEBUGGING
9089
9090STATIC void
9091S_put_byte(pTHX_ SV *sv, int c)
9092{
9093 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9094 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9095 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9096 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9097 else
9098 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9099}
9100
786e8c11 9101
3dab1dad
YO
9102#define CLEAR_OPTSTART \
9103 if (optstart) STMT_START { \
70685ca0 9104 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9105 optstart=NULL; \
9106 } STMT_END
9107
786e8c11 9108#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9109
b5a2f8d8
NC
9110STATIC const regnode *
9111S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9112 const regnode *last, const regnode *plast,
9113 SV* sv, I32 indent, U32 depth)
a28509cc 9114{
97aff369 9115 dVAR;
786e8c11 9116 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9117 register const regnode *next;
3dab1dad 9118 const regnode *optstart= NULL;
f8fc2ecf 9119 RXi_GET_DECL(r,ri);
3dab1dad 9120 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9121
786e8c11
YO
9122#ifdef DEBUG_DUMPUNTIL
9123 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9124 last ? last-start : 0,plast ? plast-start : 0);
9125#endif
9126
9127 if (plast && plast < last)
9128 last= plast;
9129
9130 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9131 /* While that wasn't END last time... */
9132
9133 NODE_ALIGN(node);
9134 op = OP(node);
de734bd5 9135 if (op == CLOSE || op == WHILEM)
786e8c11 9136 indent--;
b5a2f8d8 9137 next = regnext((regnode *)node);
07be1b83 9138
a28509cc 9139 /* Where, what. */
8e11feef 9140 if (OP(node) == OPTIMIZED) {
e68ec53f 9141 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9142 optstart = node;
3dab1dad 9143 else
8e11feef 9144 goto after_print;
3dab1dad
YO
9145 } else
9146 CLEAR_OPTSTART;
07be1b83 9147
32fc9b6a 9148 regprop(r, sv, node);
a28509cc 9149 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9150 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9151
9152 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9153 if (next == NULL) /* Next ptr. */
9154 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9155 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9156 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9157 else
9158 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9159
1de06328 9160 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9161 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9162 }
9163
a28509cc
AL
9164 after_print:
9165 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9166 assert(next);
9167 {
9168 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9169 ? regnext((regnode *)next)
9170 : next);
be8e71aa
YO
9171 if (last && nnode > last)
9172 nnode = last;
786e8c11 9173 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9174 }
a28509cc
AL
9175 }
9176 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9177 assert(next);
786e8c11 9178 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9179 }
9180 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9181 const regnode *this_trie = node;
1de06328 9182 const char op = OP(node);
a28509cc 9183 const I32 n = ARG(node);
1de06328 9184 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9185 (reg_ac_data *)ri->data->data[n] :
1de06328 9186 NULL;
3251b653
NC
9187 const reg_trie_data * const trie =
9188 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9189#ifdef DEBUGGING
9190 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9191#endif
786e8c11 9192 const regnode *nextbranch= NULL;
a28509cc 9193 I32 word_idx;
1de06328 9194 sv_setpvn(sv, "", 0);
786e8c11 9195 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9196 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9197
9198 PerlIO_printf(Perl_debug_log, "%*s%s ",
9199 (int)(2*(indent+3)), "",
9200 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9201 PL_colors[0], PL_colors[1],
9202 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9203 PERL_PV_PRETTY_ELIPSES |
7f69552c 9204 PERL_PV_PRETTY_LTGT
786e8c11
YO
9205 )
9206 : "???"
9207 );
9208 if (trie->jump) {
40d049e4 9209 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9210 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9211 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9212 if (dist) {
9213 if (!nextbranch)
24b23f37 9214 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9215 DUMPUNTIL(this_trie + dist, nextbranch);
9216 }
786e8c11
YO
9217 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9218 nextbranch= regnext((regnode *)nextbranch);
9219 } else {
9220 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9221 }
786e8c11
YO
9222 }
9223 if (last && next > last)
9224 node= last;
9225 else
9226 node= next;
a28509cc 9227 }
786e8c11
YO
9228 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9229 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9230 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9231 }
9232 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9233 assert(next);
786e8c11 9234 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9235 }
9236 else if ( op == PLUS || op == STAR) {
786e8c11 9237 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9238 }
9239 else if (op == ANYOF) {
9240 /* arglen 1 + class block */
9241 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9242 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9243 node = NEXTOPER(node);
9244 }
9245 else if (PL_regkind[(U8)op] == EXACT) {
9246 /* Literal string, where present. */
9247 node += NODE_SZ_STR(node) - 1;
9248 node = NEXTOPER(node);
9249 }
9250 else {
9251 node = NEXTOPER(node);
9252 node += regarglen[(U8)op];
9253 }
9254 if (op == CURLYX || op == OPEN)
786e8c11 9255 indent++;
a28509cc 9256 }
3dab1dad 9257 CLEAR_OPTSTART;
786e8c11 9258#ifdef DEBUG_DUMPUNTIL
70685ca0 9259 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9260#endif
1de06328 9261 return node;
a28509cc
AL
9262}
9263
9264#endif /* DEBUGGING */
9265
241d1a3b
NC
9266/*
9267 * Local variables:
9268 * c-indentation-style: bsd
9269 * c-basic-offset: 4
9270 * indent-tabs-mode: t
9271 * End:
9272 *
37442d52
RGS
9273 * ex: set ts=8 sts=4 sw=4 noet:
9274 */