This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Silence a warning from Module::CoreList that occurs when the module version is
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
e2e6a0f1 159#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 164#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
165#define RExC_open_parens (pRExC_state->open_parens)
166#define RExC_close_parens (pRExC_state->close_parens)
167#define RExC_opend (pRExC_state->opend)
81714fb9 168#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
169#define RExC_recurse (pRExC_state->recurse)
170#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 171
a687059c
LW
172#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
173#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
174 ((*s) == '{' && regcurly(s)))
a687059c 175
35c8bce7
LW
176#ifdef SPSTART
177#undef SPSTART /* dratted cpp namespace... */
178#endif
a687059c
LW
179/*
180 * Flags to be passed up and down.
181 */
a687059c 182#define WORST 0 /* Worst case. */
821b33a5 183#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
184#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
185#define SPSTART 0x4 /* Starts with * or +. */
186#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 187
3dab1dad
YO
188#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189
07be1b83
YO
190/* whether trie related optimizations are enabled */
191#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
192#define TRIE_STUDY_OPT
786e8c11 193#define FULL_TRIE_STUDY
07be1b83
YO
194#define TRIE_STCLASS
195#endif
1de06328
YO
196
197
40d049e4
YO
198
199#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
200#define PBITVAL(paren) (1 << ((paren) & 7))
201#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
202#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
203#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
204
205
1de06328
YO
206/* About scan_data_t.
207
208 During optimisation we recurse through the regexp program performing
209 various inplace (keyhole style) optimisations. In addition study_chunk
210 and scan_commit populate this data structure with information about
211 what strings MUST appear in the pattern. We look for the longest
212 string that must appear for at a fixed location, and we look for the
213 longest string that may appear at a floating location. So for instance
214 in the pattern:
215
216 /FOO[xX]A.*B[xX]BAR/
217
218 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
219 strings (because they follow a .* construct). study_chunk will identify
220 both FOO and BAR as being the longest fixed and floating strings respectively.
221
222 The strings can be composites, for instance
223
224 /(f)(o)(o)/
225
226 will result in a composite fixed substring 'foo'.
227
228 For each string some basic information is maintained:
229
230 - offset or min_offset
231 This is the position the string must appear at, or not before.
232 It also implicitly (when combined with minlenp) tells us how many
233 character must match before the string we are searching.
234 Likewise when combined with minlenp and the length of the string
235 tells us how many characters must appear after the string we have
236 found.
237
238 - max_offset
239 Only used for floating strings. This is the rightmost point that
240 the string can appear at. Ifset to I32 max it indicates that the
241 string can occur infinitely far to the right.
242
243 - minlenp
244 A pointer to the minimum length of the pattern that the string
245 was found inside. This is important as in the case of positive
246 lookahead or positive lookbehind we can have multiple patterns
247 involved. Consider
248
249 /(?=FOO).*F/
250
251 The minimum length of the pattern overall is 3, the minimum length
252 of the lookahead part is 3, but the minimum length of the part that
253 will actually match is 1. So 'FOO's minimum length is 3, but the
254 minimum length for the F is 1. This is important as the minimum length
255 is used to determine offsets in front of and behind the string being
256 looked for. Since strings can be composites this is the length of the
257 pattern at the time it was commited with a scan_commit. Note that
258 the length is calculated by study_chunk, so that the minimum lengths
259 are not known until the full pattern has been compiled, thus the
260 pointer to the value.
261
262 - lookbehind
263
264 In the case of lookbehind the string being searched for can be
265 offset past the start point of the final matching string.
266 If this value was just blithely removed from the min_offset it would
267 invalidate some of the calculations for how many chars must match
268 before or after (as they are derived from min_offset and minlen and
269 the length of the string being searched for).
270 When the final pattern is compiled and the data is moved from the
271 scan_data_t structure into the regexp structure the information
272 about lookbehind is factored in, with the information that would
273 have been lost precalculated in the end_shift field for the
274 associated string.
275
276 The fields pos_min and pos_delta are used to store the minimum offset
277 and the delta to the maximum offset at the current point in the pattern.
278
279*/
2c2d71f5
JH
280
281typedef struct scan_data_t {
1de06328
YO
282 /*I32 len_min; unused */
283 /*I32 len_delta; unused */
2c2d71f5
JH
284 I32 pos_min;
285 I32 pos_delta;
286 SV *last_found;
1de06328 287 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
288 I32 last_start_min;
289 I32 last_start_max;
1de06328
YO
290 SV **longest; /* Either &l_fixed, or &l_float. */
291 SV *longest_fixed; /* longest fixed string found in pattern */
292 I32 offset_fixed; /* offset where it starts */
293 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
295 SV *longest_float; /* longest floating string found in pattern */
296 I32 offset_float_min; /* earliest point in string it can appear */
297 I32 offset_float_max; /* latest point in string it can appear */
298 I32 *minlen_float; /* pointer to the minlen relevent to the string */
299 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
300 I32 flags;
301 I32 whilem_c;
cb434fcc 302 I32 *last_closep;
653099ff 303 struct regnode_charclass_class *start_class;
2c2d71f5
JH
304} scan_data_t;
305
a687059c 306/*
e50aee73 307 * Forward declarations for pregcomp()'s friends.
a687059c 308 */
a0d0e21e 309
27da23d5 310static const scan_data_t zero_scan_data =
1de06328 311 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
312
313#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
314#define SF_BEFORE_SEOL 0x0001
315#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
316#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
317#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
318
09b7f37c
CB
319#ifdef NO_UNARY_PLUS
320# define SF_FIX_SHIFT_EOL (0+2)
321# define SF_FL_SHIFT_EOL (0+4)
322#else
323# define SF_FIX_SHIFT_EOL (+2)
324# define SF_FL_SHIFT_EOL (+4)
325#endif
c277df42
IZ
326
327#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
328#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329
330#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
331#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
332#define SF_IS_INF 0x0040
333#define SF_HAS_PAR 0x0080
334#define SF_IN_PAR 0x0100
335#define SF_HAS_EVAL 0x0200
336#define SCF_DO_SUBSTR 0x0400
653099ff
GS
337#define SCF_DO_STCLASS_AND 0x0800
338#define SCF_DO_STCLASS_OR 0x1000
339#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 340#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 341
786e8c11 342#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 343#define SCF_SEEN_ACCEPT 0x8000
07be1b83 344
eb160463 345#define UTF (RExC_utf8 != 0)
bbe252da
YO
346#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
347#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 348
ffc61ed2 349#define OOB_UNICODE 12345678
93733859 350#define OOB_NAMEDCLASS -1
b8c5462f 351
a0ed51b3
LW
352#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
353#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
354
8615cb43 355
b45f050a
JF
356/* length of regex to show in messages that don't mark a position within */
357#define RegexLengthToShowInErrorMessages 127
358
359/*
360 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
361 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
362 * op/pragma/warn/regcomp.
363 */
7253e4e3
RK
364#define MARKER1 "<-- HERE" /* marker as it appears in the description */
365#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 366
7253e4e3 367#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
368
369/*
370 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
371 * arg. Show regex, up to a maximum length. If it's too long, chop and add
372 * "...".
373 */
58e23c8d 374#define _FAIL(code) STMT_START { \
bfed75c6 375 const char *ellipses = ""; \
ccb2c380
MP
376 IV len = RExC_end - RExC_precomp; \
377 \
378 if (!SIZE_ONLY) \
379 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
380 if (len > RegexLengthToShowInErrorMessages) { \
381 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
382 len = RegexLengthToShowInErrorMessages - 10; \
383 ellipses = "..."; \
384 } \
58e23c8d 385 code; \
ccb2c380 386} STMT_END
8615cb43 387
58e23c8d
YO
388#define FAIL(msg) _FAIL( \
389 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
390 msg, (int)len, RExC_precomp, ellipses))
391
392#define FAIL2(msg,arg) _FAIL( \
393 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
394 arg, (int)len, RExC_precomp, ellipses))
395
b45f050a 396/*
b45f050a
JF
397 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
398 */
ccb2c380 399#define Simple_vFAIL(m) STMT_START { \
a28509cc 400 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
401 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
402 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
403} STMT_END
b45f050a
JF
404
405/*
406 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
407 */
ccb2c380
MP
408#define vFAIL(m) STMT_START { \
409 if (!SIZE_ONLY) \
410 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
411 Simple_vFAIL(m); \
412} STMT_END
b45f050a
JF
413
414/*
415 * Like Simple_vFAIL(), but accepts two arguments.
416 */
ccb2c380 417#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
420 (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
425 */
ccb2c380
MP
426#define vFAIL2(m,a1) STMT_START { \
427 if (!SIZE_ONLY) \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
429 Simple_vFAIL2(m, a1); \
430} STMT_END
b45f050a
JF
431
432
433/*
434 * Like Simple_vFAIL(), but accepts three arguments.
435 */
ccb2c380 436#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 437 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
438 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
439 (int)offset, RExC_precomp, RExC_precomp + offset); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
444 */
ccb2c380
MP
445#define vFAIL3(m,a1,a2) STMT_START { \
446 if (!SIZE_ONLY) \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
448 Simple_vFAIL3(m, a1, a2); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Like Simple_vFAIL(), but accepts four arguments.
453 */
ccb2c380 454#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a 459
ccb2c380 460#define vWARN(loc,m) STMT_START { \
a28509cc 461 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
462 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
463 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
465
466#define vWARNdep(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
469 "%s" REPORT_LOCATION, \
470 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473
474#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
9d1d55b5 497
8615cb43 498
cd439c50 499/* Allow for side effects in s */
ccb2c380
MP
500#define REGC(c,s) STMT_START { \
501 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
502} STMT_END
cd439c50 503
fac92740
MJD
504/* Macros for recording node offsets. 20001227 mjd@plover.com
505 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
506 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
507 * Element 0 holds the number n.
07be1b83 508 * Position is 1 indexed.
fac92740
MJD
509 */
510
ccb2c380
MP
511#define Set_Node_Offset_To_R(node,byte) STMT_START { \
512 if (! SIZE_ONLY) { \
513 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 514 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 515 if((node) < 0) { \
551405c4 516 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
517 } else { \
518 RExC_offsets[2*(node)-1] = (byte); \
519 } \
520 } \
521} STMT_END
522
523#define Set_Node_Offset(node,byte) \
524 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
525#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
526
527#define Set_Node_Length_To_R(node,len) STMT_START { \
528 if (! SIZE_ONLY) { \
529 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 530 __LINE__, (int)(node), (int)(len))); \
ccb2c380 531 if((node) < 0) { \
551405c4 532 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
533 } else { \
534 RExC_offsets[2*(node)] = (len); \
535 } \
536 } \
537} STMT_END
538
539#define Set_Node_Length(node,len) \
540 Set_Node_Length_To_R((node)-RExC_emit_start, len)
541#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
542#define Set_Node_Cur_Length(node) \
543 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
544
545/* Get offsets and lengths */
546#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
547#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
548
07be1b83
YO
549#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
550 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
551 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
552} STMT_END
553
554
555#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
556#define EXPERIMENTAL_INPLACESCAN
557#endif
558
304ee84b
YO
559#define DEBUG_STUDYDATA(str,data,depth) \
560DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 561 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
562 "%*s" str "Pos:%"IVdf"/%"IVdf \
563 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
564 (int)(depth)*2, "", \
565 (IV)((data)->pos_min), \
566 (IV)((data)->pos_delta), \
304ee84b 567 (UV)((data)->flags), \
1de06328 568 (IV)((data)->whilem_c), \
304ee84b
YO
569 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
570 is_inf ? "INF " : "" \
1de06328
YO
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
304ee84b 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3 617 }
304ee84b 618 else { /* *data->longest == data->longest_float */
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
304ee84b 623 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
304ee84b 645 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
2b8b4781
NC
801 dump_trie(trie,widecharmap,revcharmap)
802 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
3dab1dad
YO
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
818*/
819
820STATIC void
2b8b4781
NC
821S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
822 AV *revcharmap, U32 depth)
3dab1dad
YO
823{
824 U32 state;
ab3bbdeb 825 SV *sv=sv_newmortal();
55eed653 826 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 835 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
3dab1dad
YO
897 Dumps a fully constructed but uncompressed trie in list form.
898 List tries normally only are used for construction when the number of
899 possible chars (trie->uniquecharcount) is very high.
900 Used for debugging make_trie().
901*/
902STATIC void
55eed653 903S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
904 HV *widecharmap, AV *revcharmap, U32 next_alloc,
905 U32 depth)
3dab1dad
YO
906{
907 U32 state;
ab3bbdeb 908 SV *sv=sv_newmortal();
55eed653 909 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
910 GET_RE_DEBUG_FLAGS_DECL;
911 /* print out the table precompression. */
ab3bbdeb
YO
912 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
913 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
914 "------:-----+-----------------\n" );
3dab1dad
YO
915
916 for( state=1 ; state < next_alloc ; state ++ ) {
917 U16 charid;
918
ab3bbdeb 919 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
920 (int)depth * 2 + 2,"", (UV)state );
921 if ( ! trie->states[ state ].wordnum ) {
922 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 } else {
924 PerlIO_printf( Perl_debug_log, "W%4x| ",
925 trie->states[ state ].wordnum
926 );
927 }
928 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 929 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
930 if ( tmp ) {
931 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 colwidth,
ddc5bc0f 933 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
934 PL_colors[0], PL_colors[1],
935 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
936 PERL_PV_ESCAPE_FIRSTCHAR
937 ) ,
1e2e3d02
YO
938 TRIE_LIST_ITEM(state,charid).forid,
939 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 );
941 if (!(charid % 10))
664e119d
RGS
942 PerlIO_printf(Perl_debug_log, "\n%*s| ",
943 (int)((depth * 2) + 14), "");
1e2e3d02 944 }
ab3bbdeb
YO
945 }
946 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
947 }
948}
949
950/*
3dab1dad
YO
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
55eed653 957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
958 HV *widecharmap, AV *revcharmap, U32 next_alloc,
959 U32 depth)
3dab1dad
YO
960{
961 U32 state;
962 U16 charid;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
965 GET_RE_DEBUG_FLAGS_DECL;
966
967 /*
968 print out the table precompression so that we can do a visual check
969 that they are identical.
970 */
971
972 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
973
974 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 975 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 976 if ( tmp ) {
ab3bbdeb
YO
977 PerlIO_printf( Perl_debug_log, "%*s",
978 colwidth,
ddc5bc0f 979 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
980 PL_colors[0], PL_colors[1],
981 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
982 PERL_PV_ESCAPE_FIRSTCHAR
983 )
984 );
3dab1dad
YO
985 }
986 }
987
988 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
989
990 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 991 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
992 }
993
994 PerlIO_printf( Perl_debug_log, "\n" );
995
996 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
997
998 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
999 (int)depth * 2 + 2,"",
1000 (UV)TRIE_NODENUM( state ) );
1001
1002 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1003 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1004 if (v)
1005 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1006 else
1007 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1008 }
1009 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1011 } else {
1012 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1013 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1014 }
1015 }
07be1b83 1016}
3dab1dad
YO
1017
1018#endif
1019
786e8c11
YO
1020/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1021 startbranch: the first branch in the whole branch sequence
1022 first : start branch of sequence of branch-exact nodes.
1023 May be the same as startbranch
1024 last : Thing following the last branch.
1025 May be the same as tail.
1026 tail : item following the branch sequence
1027 count : words in the sequence
1028 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1029 depth : indent depth
3dab1dad 1030
786e8c11 1031Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1032
786e8c11
YO
1033A trie is an N'ary tree where the branches are determined by digital
1034decomposition of the key. IE, at the root node you look up the 1st character and
1035follow that branch repeat until you find the end of the branches. Nodes can be
1036marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1037
786e8c11 1038 /he|she|his|hers/
72f13be8 1039
786e8c11
YO
1040would convert into the following structure. Numbers represent states, letters
1041following numbers represent valid transitions on the letter from that state, if
1042the number is in square brackets it represents an accepting state, otherwise it
1043will be in parenthesis.
07be1b83 1044
786e8c11
YO
1045 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1046 | |
1047 | (2)
1048 | |
1049 (1) +-i->(6)-+-s->[7]
1050 |
1051 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1052
786e8c11
YO
1053 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1054
1055This shows that when matching against the string 'hers' we will begin at state 1
1056read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1057then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1058is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1059single traverse. We store a mapping from accepting to state to which word was
1060matched, and then when we have multiple possibilities we try to complete the
1061rest of the regex in the order in which they occured in the alternation.
1062
1063The only prior NFA like behaviour that would be changed by the TRIE support is
1064the silent ignoring of duplicate alternations which are of the form:
1065
1066 / (DUPE|DUPE) X? (?{ ... }) Y /x
1067
1068Thus EVAL blocks follwing a trie may be called a different number of times with
1069and without the optimisation. With the optimisations dupes will be silently
1070ignored. This inconsistant behaviour of EVAL type nodes is well established as
1071the following demonstrates:
1072
1073 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1074
1075which prints out 'word' three times, but
1076
1077 'words'=~/(word|word|word)(?{ print $1 })S/
1078
1079which doesnt print it out at all. This is due to other optimisations kicking in.
1080
1081Example of what happens on a structural level:
1082
1083The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1084
1085 1: CURLYM[1] {1,32767}(18)
1086 5: BRANCH(8)
1087 6: EXACT <ac>(16)
1088 8: BRANCH(11)
1089 9: EXACT <ad>(16)
1090 11: BRANCH(14)
1091 12: EXACT <ab>(16)
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096This would be optimizable with startbranch=5, first=5, last=16, tail=16
1097and should turn into:
1098
1099 1: CURLYM[1] {1,32767}(18)
1100 5: TRIE(16)
1101 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1102 <ac>
1103 <ad>
1104 <ab>
1105 16: SUCCEED(0)
1106 17: NOTHING(18)
1107 18: END(0)
1108
1109Cases where tail != last would be like /(?foo|bar)baz/:
1110
1111 1: BRANCH(4)
1112 2: EXACT <foo>(8)
1113 4: BRANCH(7)
1114 5: EXACT <bar>(8)
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119which would be optimizable with startbranch=1, first=1, last=7, tail=8
1120and would end up looking like:
1121
1122 1: TRIE(8)
1123 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1124 <foo>
1125 <bar>
1126 7: TAIL(8)
1127 8: EXACT <baz>(10)
1128 10: END(0)
1129
1130 d = uvuni_to_utf8_flags(d, uv, 0);
1131
1132is the recommended Unicode-aware way of saying
1133
1134 *(d++) = uv;
1135*/
1136
1e2e3d02 1137#define TRIE_STORE_REVCHAR \
786e8c11 1138 STMT_START { \
1e2e3d02 1139 SV *tmp = newSVpvs(""); \
786e8c11 1140 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1141 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1142 av_push( revcharmap, tmp ); \
786e8c11
YO
1143 } STMT_END
1144
1145#define TRIE_READ_CHAR STMT_START { \
1146 wordlen++; \
1147 if ( UTF ) { \
1148 if ( folder ) { \
1149 if ( foldlen > 0 ) { \
1150 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1151 foldlen -= len; \
1152 scan += len; \
1153 len = 0; \
1154 } else { \
1155 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1156 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1157 foldlen -= UNISKIP( uvc ); \
1158 scan = foldbuf + UNISKIP( uvc ); \
1159 } \
1160 } else { \
1161 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1162 } \
1163 } else { \
1164 uvc = (U32)*uc; \
1165 len = 1; \
1166 } \
1167} STMT_END
1168
1169
1170
1171#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1172 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1173 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1174 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1175 } \
1176 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1177 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1178 TRIE_LIST_CUR( state )++; \
1179} STMT_END
07be1b83 1180
786e8c11
YO
1181#define TRIE_LIST_NEW(state) STMT_START { \
1182 Newxz( trie->states[ state ].trans.list, \
1183 4, reg_trie_trans_le ); \
1184 TRIE_LIST_CUR( state ) = 1; \
1185 TRIE_LIST_LEN( state ) = 4; \
1186} STMT_END
07be1b83 1187
786e8c11
YO
1188#define TRIE_HANDLE_WORD(state) STMT_START { \
1189 U16 dupe= trie->states[ state ].wordnum; \
1190 regnode * const noper_next = regnext( noper ); \
1191 \
1192 if (trie->wordlen) \
1193 trie->wordlen[ curword ] = wordlen; \
1194 DEBUG_r({ \
1195 /* store the word for dumping */ \
1196 SV* tmp; \
1197 if (OP(noper) != NOTHING) \
1198 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1199 else \
1200 tmp = newSVpvn( "", 0 ); \
1201 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1202 av_push( trie_words, tmp ); \
786e8c11
YO
1203 }); \
1204 \
1205 curword++; \
1206 \
1207 if ( noper_next < tail ) { \
1208 if (!trie->jump) \
446bd890 1209 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1210 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1211 if (!jumper) \
1212 jumper = noper_next; \
1213 if (!nextbranch) \
1214 nextbranch= regnext(cur); \
1215 } \
1216 \
1217 if ( dupe ) { \
1218 /* So it's a dupe. This means we need to maintain a */\
1219 /* linked-list from the first to the next. */\
1220 /* we only allocate the nextword buffer when there */\
1221 /* a dupe, so first time we have to do the allocation */\
1222 if (!trie->nextword) \
446bd890
NC
1223 trie->nextword = \
1224 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1225 while ( trie->nextword[dupe] ) \
1226 dupe= trie->nextword[dupe]; \
1227 trie->nextword[dupe]= curword; \
1228 } else { \
1229 /* we haven't inserted this word yet. */ \
1230 trie->states[ state ].wordnum = curword; \
1231 } \
1232} STMT_END
07be1b83 1233
3dab1dad 1234
786e8c11
YO
1235#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1236 ( ( base + charid >= ucharcount \
1237 && base + charid < ubound \
1238 && state == trie->trans[ base - ucharcount + charid ].check \
1239 && trie->trans[ base - ucharcount + charid ].next ) \
1240 ? trie->trans[ base - ucharcount + charid ].next \
1241 : ( state==1 ? special : 0 ) \
1242 )
3dab1dad 1243
786e8c11
YO
1244#define MADE_TRIE 1
1245#define MADE_JUMP_TRIE 2
1246#define MADE_EXACT_TRIE 4
3dab1dad 1247
a3621e74 1248STATIC I32
786e8c11 1249S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1250{
27da23d5 1251 dVAR;
a3621e74
YO
1252 /* first pass, loop through and scan words */
1253 reg_trie_data *trie;
55eed653 1254 HV *widecharmap = NULL;
2b8b4781 1255 AV *revcharmap = newAV();
a3621e74 1256 regnode *cur;
9f7f3913 1257 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1258 STRLEN len = 0;
1259 UV uvc = 0;
1260 U16 curword = 0;
1261 U32 next_alloc = 0;
786e8c11
YO
1262 regnode *jumper = NULL;
1263 regnode *nextbranch = NULL;
7f69552c 1264 regnode *convert = NULL;
a3621e74 1265 /* we just use folder as a flag in utf8 */
e1ec3a88 1266 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1267 ? PL_fold
1268 : ( flags == EXACTFL
1269 ? PL_fold_locale
1270 : NULL
1271 )
1272 );
1273
2b8b4781
NC
1274#ifdef DEBUGGING
1275 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1276 AV *trie_words = NULL;
1277 /* along with revcharmap, this only used during construction but both are
1278 * useful during debugging so we store them in the struct when debugging.
8e11feef 1279 */
2b8b4781
NC
1280#else
1281 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1282 STRLEN trie_charcount=0;
3dab1dad 1283#endif
2b8b4781 1284 SV *re_trie_maxbuff;
a3621e74 1285 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1286#ifndef DEBUGGING
1287 PERL_UNUSED_ARG(depth);
1288#endif
a3621e74 1289
446bd890 1290 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1291 trie->refcount = 1;
3dab1dad 1292 trie->startstate = 1;
786e8c11 1293 trie->wordcount = word_count;
f8fc2ecf 1294 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1295 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1296 if (!(UTF && folder))
446bd890 1297 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1298 DEBUG_r({
2b8b4781 1299 trie_words = newAV();
a3621e74 1300 });
a3621e74 1301
0111c4fd 1302 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1303 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1304 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1305 }
3dab1dad
YO
1306 DEBUG_OPTIMISE_r({
1307 PerlIO_printf( Perl_debug_log,
786e8c11 1308 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1309 (int)depth * 2 + 2, "",
1310 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1311 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1312 (int)depth);
3dab1dad 1313 });
7f69552c
YO
1314
1315 /* Find the node we are going to overwrite */
1316 if ( first == startbranch && OP( last ) != BRANCH ) {
1317 /* whole branch chain */
1318 convert = first;
1319 } else {
1320 /* branch sub-chain */
1321 convert = NEXTOPER( first );
1322 }
1323
a3621e74
YO
1324 /* -- First loop and Setup --
1325
1326 We first traverse the branches and scan each word to determine if it
1327 contains widechars, and how many unique chars there are, this is
1328 important as we have to build a table with at least as many columns as we
1329 have unique chars.
1330
1331 We use an array of integers to represent the character codes 0..255
1332 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1333 native representation of the character value as the key and IV's for the
1334 coded index.
1335
1336 *TODO* If we keep track of how many times each character is used we can
1337 remap the columns so that the table compression later on is more
1338 efficient in terms of memory by ensuring most common value is in the
1339 middle and the least common are on the outside. IMO this would be better
1340 than a most to least common mapping as theres a decent chance the most
1341 common letter will share a node with the least common, meaning the node
1342 will not be compressable. With a middle is most common approach the worst
1343 case is when we have the least common nodes twice.
1344
1345 */
1346
a3621e74 1347 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1348 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1349 const U8 *uc = (U8*)STRING( noper );
a28509cc 1350 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1351 STRLEN foldlen = 0;
1352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1353 const U8 *scan = (U8*)NULL;
07be1b83 1354 U32 wordlen = 0; /* required init */
3dab1dad 1355 STRLEN chars=0;
a3621e74 1356
3dab1dad
YO
1357 if (OP(noper) == NOTHING) {
1358 trie->minlen= 0;
1359 continue;
1360 }
1361 if (trie->bitmap) {
1362 TRIE_BITMAP_SET(trie,*uc);
1363 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1364 }
a3621e74 1365 for ( ; uc < e ; uc += len ) {
3dab1dad 1366 TRIE_CHARCOUNT(trie)++;
a3621e74 1367 TRIE_READ_CHAR;
3dab1dad 1368 chars++;
a3621e74
YO
1369 if ( uvc < 256 ) {
1370 if ( !trie->charmap[ uvc ] ) {
1371 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1372 if ( folder )
1373 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1374 TRIE_STORE_REVCHAR;
a3621e74
YO
1375 }
1376 } else {
1377 SV** svpp;
55eed653
NC
1378 if ( !widecharmap )
1379 widecharmap = newHV();
a3621e74 1380
55eed653 1381 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1382
1383 if ( !svpp )
e4584336 1384 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1385
1386 if ( !SvTRUE( *svpp ) ) {
1387 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1388 TRIE_STORE_REVCHAR;
a3621e74
YO
1389 }
1390 }
1391 }
3dab1dad
YO
1392 if( cur == first ) {
1393 trie->minlen=chars;
1394 trie->maxlen=chars;
1395 } else if (chars < trie->minlen) {
1396 trie->minlen=chars;
1397 } else if (chars > trie->maxlen) {
1398 trie->maxlen=chars;
1399 }
1400
a3621e74
YO
1401 } /* end first pass */
1402 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1403 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1404 (int)depth * 2 + 2,"",
55eed653 1405 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1406 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1407 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1408 );
446bd890 1409 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1410
1411 /*
1412 We now know what we are dealing with in terms of unique chars and
1413 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1414 representation using a flat table will take. If it's over a reasonable
1415 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1416 conservative but potentially much slower representation using an array
1417 of lists.
1418
1419 At the end we convert both representations into the same compressed
1420 form that will be used in regexec.c for matching with. The latter
1421 is a form that cannot be used to construct with but has memory
1422 properties similar to the list form and access properties similar
1423 to the table form making it both suitable for fast searches and
1424 small enough that its feasable to store for the duration of a program.
1425
1426 See the comment in the code where the compressed table is produced
1427 inplace from the flat tabe representation for an explanation of how
1428 the compression works.
1429
1430 */
1431
1432
3dab1dad 1433 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1434 /*
1435 Second Pass -- Array Of Lists Representation
1436
1437 Each state will be represented by a list of charid:state records
1438 (reg_trie_trans_le) the first such element holds the CUR and LEN
1439 points of the allocated array. (See defines above).
1440
1441 We build the initial structure using the lists, and then convert
1442 it into the compressed table form which allows faster lookups
1443 (but cant be modified once converted).
a3621e74
YO
1444 */
1445
a3621e74
YO
1446 STRLEN transcount = 1;
1447
1e2e3d02
YO
1448 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1449 "%*sCompiling trie using list compiler\n",
1450 (int)depth * 2 + 2, ""));
446bd890
NC
1451
1452 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1453 sizeof(reg_trie_state) );
a3621e74
YO
1454 TRIE_LIST_NEW(1);
1455 next_alloc = 2;
1456
1457 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1458
c445ea15
AL
1459 regnode * const noper = NEXTOPER( cur );
1460 U8 *uc = (U8*)STRING( noper );
1461 const U8 * const e = uc + STR_LEN( noper );
1462 U32 state = 1; /* required init */
1463 U16 charid = 0; /* sanity init */
1464 U8 *scan = (U8*)NULL; /* sanity init */
1465 STRLEN foldlen = 0; /* required init */
07be1b83 1466 U32 wordlen = 0; /* required init */
c445ea15
AL
1467 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1468
3dab1dad 1469 if (OP(noper) != NOTHING) {
786e8c11 1470 for ( ; uc < e ; uc += len ) {
c445ea15 1471
786e8c11 1472 TRIE_READ_CHAR;
c445ea15 1473
786e8c11
YO
1474 if ( uvc < 256 ) {
1475 charid = trie->charmap[ uvc ];
c445ea15 1476 } else {
55eed653 1477 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1478 if ( !svpp ) {
1479 charid = 0;
1480 } else {
1481 charid=(U16)SvIV( *svpp );
1482 }
c445ea15 1483 }
786e8c11
YO
1484 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1485 if ( charid ) {
a3621e74 1486
786e8c11
YO
1487 U16 check;
1488 U32 newstate = 0;
a3621e74 1489
786e8c11
YO
1490 charid--;
1491 if ( !trie->states[ state ].trans.list ) {
1492 TRIE_LIST_NEW( state );
c445ea15 1493 }
786e8c11
YO
1494 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1495 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1496 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1497 break;
1498 }
1499 }
1500 if ( ! newstate ) {
1501 newstate = next_alloc++;
1502 TRIE_LIST_PUSH( state, charid, newstate );
1503 transcount++;
1504 }
1505 state = newstate;
1506 } else {
1507 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1508 }
a28509cc 1509 }
c445ea15 1510 }
3dab1dad 1511 TRIE_HANDLE_WORD(state);
a3621e74
YO
1512
1513 } /* end second pass */
1514
1e2e3d02
YO
1515 /* next alloc is the NEXT state to be allocated */
1516 trie->statecount = next_alloc;
446bd890
NC
1517 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1518 * sizeof(reg_trie_state) );
a3621e74 1519
3dab1dad 1520 /* and now dump it out before we compress it */
2b8b4781
NC
1521 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1522 revcharmap, next_alloc,
1523 depth+1)
1e2e3d02 1524 );
a3621e74 1525
446bd890
NC
1526 trie->trans
1527 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1528 {
1529 U32 state;
a3621e74
YO
1530 U32 tp = 0;
1531 U32 zp = 0;
1532
1533
1534 for( state=1 ; state < next_alloc ; state ++ ) {
1535 U32 base=0;
1536
1537 /*
1538 DEBUG_TRIE_COMPILE_MORE_r(
1539 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1540 );
1541 */
1542
1543 if (trie->states[state].trans.list) {
1544 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1545 U16 maxid=minid;
a28509cc 1546 U16 idx;
a3621e74
YO
1547
1548 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1549 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1550 if ( forid < minid ) {
1551 minid=forid;
1552 } else if ( forid > maxid ) {
1553 maxid=forid;
1554 }
a3621e74
YO
1555 }
1556 if ( transcount < tp + maxid - minid + 1) {
1557 transcount *= 2;
446bd890
NC
1558 trie->trans
1559 = PerlMemShared_realloc( trie->trans,
1560 transcount
1561 * sizeof(reg_trie_trans) );
a3621e74
YO
1562 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1563 }
1564 base = trie->uniquecharcount + tp - minid;
1565 if ( maxid == minid ) {
1566 U32 set = 0;
1567 for ( ; zp < tp ; zp++ ) {
1568 if ( ! trie->trans[ zp ].next ) {
1569 base = trie->uniquecharcount + zp - minid;
1570 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1571 trie->trans[ zp ].check = state;
1572 set = 1;
1573 break;
1574 }
1575 }
1576 if ( !set ) {
1577 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ tp ].check = state;
1579 tp++;
1580 zp = tp;
1581 }
1582 } else {
1583 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1584 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1585 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1586 trie->trans[ tid ].check = state;
1587 }
1588 tp += ( maxid - minid + 1 );
1589 }
1590 Safefree(trie->states[ state ].trans.list);
1591 }
1592 /*
1593 DEBUG_TRIE_COMPILE_MORE_r(
1594 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1595 );
1596 */
1597 trie->states[ state ].trans.base=base;
1598 }
cc601c31 1599 trie->lasttrans = tp + 1;
a3621e74
YO
1600 }
1601 } else {
1602 /*
1603 Second Pass -- Flat Table Representation.
1604
1605 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1606 We know that we will need Charcount+1 trans at most to store the data
1607 (one row per char at worst case) So we preallocate both structures
1608 assuming worst case.
1609
1610 We then construct the trie using only the .next slots of the entry
1611 structs.
1612
1613 We use the .check field of the first entry of the node temporarily to
1614 make compression both faster and easier by keeping track of how many non
1615 zero fields are in the node.
1616
1617 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1618 transition.
1619
1620 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1621 number representing the first entry of the node, and state as a
1622 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1623 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1624 are 2 entrys per node. eg:
1625
1626 A B A B
1627 1. 2 4 1. 3 7
1628 2. 0 3 3. 0 5
1629 3. 0 0 5. 0 0
1630 4. 0 0 7. 0 0
1631
1632 The table is internally in the right hand, idx form. However as we also
1633 have to deal with the states array which is indexed by nodenum we have to
1634 use TRIE_NODENUM() to convert.
1635
1636 */
1e2e3d02
YO
1637 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1638 "%*sCompiling trie using table compiler\n",
1639 (int)depth * 2 + 2, ""));
3dab1dad 1640
446bd890
NC
1641 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1642 * trie->uniquecharcount + 1,
1643 sizeof(reg_trie_trans) );
1644 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1645 sizeof(reg_trie_state) );
a3621e74
YO
1646 next_alloc = trie->uniquecharcount + 1;
1647
3dab1dad 1648
a3621e74
YO
1649 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1650
c445ea15 1651 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1652 const U8 *uc = (U8*)STRING( noper );
1653 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1654
1655 U32 state = 1; /* required init */
1656
1657 U16 charid = 0; /* sanity init */
1658 U32 accept_state = 0; /* sanity init */
1659 U8 *scan = (U8*)NULL; /* sanity init */
1660
1661 STRLEN foldlen = 0; /* required init */
07be1b83 1662 U32 wordlen = 0; /* required init */
a3621e74
YO
1663 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1664
3dab1dad 1665 if ( OP(noper) != NOTHING ) {
786e8c11 1666 for ( ; uc < e ; uc += len ) {
a3621e74 1667
786e8c11 1668 TRIE_READ_CHAR;
a3621e74 1669
786e8c11
YO
1670 if ( uvc < 256 ) {
1671 charid = trie->charmap[ uvc ];
1672 } else {
55eed653 1673 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1674 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1675 }
786e8c11
YO
1676 if ( charid ) {
1677 charid--;
1678 if ( !trie->trans[ state + charid ].next ) {
1679 trie->trans[ state + charid ].next = next_alloc;
1680 trie->trans[ state ].check++;
1681 next_alloc += trie->uniquecharcount;
1682 }
1683 state = trie->trans[ state + charid ].next;
1684 } else {
1685 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1686 }
1687 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1688 }
a3621e74 1689 }
3dab1dad
YO
1690 accept_state = TRIE_NODENUM( state );
1691 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1692
1693 } /* end second pass */
1694
3dab1dad 1695 /* and now dump it out before we compress it */
2b8b4781
NC
1696 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1697 revcharmap,
1698 next_alloc, depth+1));
a3621e74 1699
a3621e74
YO
1700 {
1701 /*
1702 * Inplace compress the table.*
1703
1704 For sparse data sets the table constructed by the trie algorithm will
1705 be mostly 0/FAIL transitions or to put it another way mostly empty.
1706 (Note that leaf nodes will not contain any transitions.)
1707
1708 This algorithm compresses the tables by eliminating most such
1709 transitions, at the cost of a modest bit of extra work during lookup:
1710
1711 - Each states[] entry contains a .base field which indicates the
1712 index in the state[] array wheres its transition data is stored.
1713
1714 - If .base is 0 there are no valid transitions from that node.
1715
1716 - If .base is nonzero then charid is added to it to find an entry in
1717 the trans array.
1718
1719 -If trans[states[state].base+charid].check!=state then the
1720 transition is taken to be a 0/Fail transition. Thus if there are fail
1721 transitions at the front of the node then the .base offset will point
1722 somewhere inside the previous nodes data (or maybe even into a node
1723 even earlier), but the .check field determines if the transition is
1724 valid.
1725
786e8c11 1726 XXX - wrong maybe?
a3621e74
YO
1727 The following process inplace converts the table to the compressed
1728 table: We first do not compress the root node 1,and mark its all its
1729 .check pointers as 1 and set its .base pointer as 1 as well. This
1730 allows to do a DFA construction from the compressed table later, and
1731 ensures that any .base pointers we calculate later are greater than
1732 0.
1733
1734 - We set 'pos' to indicate the first entry of the second node.
1735
1736 - We then iterate over the columns of the node, finding the first and
1737 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1738 and set the .check pointers accordingly, and advance pos
1739 appropriately and repreat for the next node. Note that when we copy
1740 the next pointers we have to convert them from the original
1741 NODEIDX form to NODENUM form as the former is not valid post
1742 compression.
1743
1744 - If a node has no transitions used we mark its base as 0 and do not
1745 advance the pos pointer.
1746
1747 - If a node only has one transition we use a second pointer into the
1748 structure to fill in allocated fail transitions from other states.
1749 This pointer is independent of the main pointer and scans forward
1750 looking for null transitions that are allocated to a state. When it
1751 finds one it writes the single transition into the "hole". If the
786e8c11 1752 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1753
1754 - Once compressed we can Renew/realloc the structures to release the
1755 excess space.
1756
1757 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1758 specifically Fig 3.47 and the associated pseudocode.
1759
1760 demq
1761 */
a3b680e6 1762 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1763 U32 state, charid;
a3621e74 1764 U32 pos = 0, zp=0;
1e2e3d02 1765 trie->statecount = laststate;
a3621e74
YO
1766
1767 for ( state = 1 ; state < laststate ; state++ ) {
1768 U8 flag = 0;
a28509cc
AL
1769 const U32 stateidx = TRIE_NODEIDX( state );
1770 const U32 o_used = trie->trans[ stateidx ].check;
1771 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1772 trie->trans[ stateidx ].check = 0;
1773
1774 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1775 if ( flag || trie->trans[ stateidx + charid ].next ) {
1776 if ( trie->trans[ stateidx + charid ].next ) {
1777 if (o_used == 1) {
1778 for ( ; zp < pos ; zp++ ) {
1779 if ( ! trie->trans[ zp ].next ) {
1780 break;
1781 }
1782 }
1783 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1784 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1785 trie->trans[ zp ].check = state;
1786 if ( ++zp > pos ) pos = zp;
1787 break;
1788 }
1789 used--;
1790 }
1791 if ( !flag ) {
1792 flag = 1;
1793 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1794 }
1795 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1796 trie->trans[ pos ].check = state;
1797 pos++;
1798 }
1799 }
1800 }
cc601c31 1801 trie->lasttrans = pos + 1;
446bd890
NC
1802 trie->states = PerlMemShared_realloc( trie->states, laststate
1803 * sizeof(reg_trie_state) );
a3621e74 1804 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1805 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1806 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1807 (int)depth * 2 + 2,"",
1808 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1809 (IV)next_alloc,
1810 (IV)pos,
a3621e74
YO
1811 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1812 );
1813
1814 } /* end table compress */
1815 }
1e2e3d02
YO
1816 DEBUG_TRIE_COMPILE_MORE_r(
1817 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1818 (int)depth * 2 + 2, "",
1819 (UV)trie->statecount,
1820 (UV)trie->lasttrans)
1821 );
cc601c31 1822 /* resize the trans array to remove unused space */
446bd890
NC
1823 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1824 * sizeof(reg_trie_trans) );
a3621e74 1825
3dab1dad 1826 /* and now dump out the compressed format */
2b8b4781 1827 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1828
3dab1dad 1829 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1830 U8 nodetype =(U8)(flags & 0xFF);
1831 char *str=NULL;
786e8c11 1832
07be1b83 1833#ifdef DEBUGGING
e62cc96a 1834 regnode *optimize = NULL;
b57a0404
JH
1835 U32 mjd_offset = 0;
1836 U32 mjd_nodelen = 0;
07be1b83 1837#endif
a3621e74 1838 /*
3dab1dad
YO
1839 This means we convert either the first branch or the first Exact,
1840 depending on whether the thing following (in 'last') is a branch
1841 or not and whther first is the startbranch (ie is it a sub part of
1842 the alternation or is it the whole thing.)
1843 Assuming its a sub part we conver the EXACT otherwise we convert
1844 the whole branch sequence, including the first.
a3621e74 1845 */
3dab1dad 1846 /* Find the node we are going to overwrite */
7f69552c 1847 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1848 /* branch sub-chain */
3dab1dad 1849 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1850 DEBUG_r({
1851 mjd_offset= Node_Offset((convert));
1852 mjd_nodelen= Node_Length((convert));
1853 });
7f69552c
YO
1854 /* whole branch chain */
1855 } else {
1856 DEBUG_r({
1857 const regnode *nop = NEXTOPER( convert );
1858 mjd_offset= Node_Offset((nop));
1859 mjd_nodelen= Node_Length((nop));
1860 });
07be1b83 1861 }
7f69552c 1862
07be1b83
YO
1863 DEBUG_OPTIMISE_r(
1864 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1865 (int)depth * 2 + 2, "",
786e8c11 1866 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1867 );
a3621e74 1868
3dab1dad
YO
1869 /* But first we check to see if there is a common prefix we can
1870 split out as an EXACT and put in front of the TRIE node. */
1871 trie->startstate= 1;
55eed653 1872 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1873 U32 state;
1e2e3d02 1874 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1875 U32 ofs = 0;
8e11feef
RGS
1876 I32 idx = -1;
1877 U32 count = 0;
1878 const U32 base = trie->states[ state ].trans.base;
a3621e74 1879
3dab1dad 1880 if ( trie->states[state].wordnum )
8e11feef 1881 count = 1;
a3621e74 1882
8e11feef 1883 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1884 if ( ( base + ofs >= trie->uniquecharcount ) &&
1885 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1886 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1887 {
3dab1dad 1888 if ( ++count > 1 ) {
2b8b4781 1889 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1890 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1891 if ( state == 1 ) break;
3dab1dad
YO
1892 if ( count == 2 ) {
1893 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf(Perl_debug_log,
1896 "%*sNew Start State=%"UVuf" Class: [",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state));
be8e71aa 1899 if (idx >= 0) {
2b8b4781 1900 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1901 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1902
3dab1dad 1903 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1904 if ( folder )
1905 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1906 DEBUG_OPTIMISE_r(
07be1b83 1907 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1908 );
8e11feef
RGS
1909 }
1910 }
1911 TRIE_BITMAP_SET(trie,*ch);
1912 if ( folder )
1913 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1914 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1915 }
1916 idx = ofs;
1917 }
3dab1dad
YO
1918 }
1919 if ( count == 1 ) {
2b8b4781 1920 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1921 char *ch = SvPV_nolen( *tmp );
1922 DEBUG_OPTIMISE_r({
1923 SV *sv=sv_newmortal();
8e11feef
RGS
1924 PerlIO_printf( Perl_debug_log,
1925 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1926 (int)depth * 2 + 2, "",
de734bd5
A
1927 (UV)state, (UV)idx,
1928 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1929 PL_colors[0], PL_colors[1],
1930 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1931 PERL_PV_ESCAPE_FIRSTCHAR
1932 )
1933 );
1934 });
3dab1dad
YO
1935 if ( state==1 ) {
1936 OP( convert ) = nodetype;
1937 str=STRING(convert);
1938 STR_LEN(convert)=0;
1939 }
de734bd5
A
1940 while (*ch) {
1941 *str++ = *ch++;
1942 STR_LEN(convert)++;
1943 }
1944
8e11feef 1945 } else {
f9049ba1 1946#ifdef DEBUGGING
8e11feef
RGS
1947 if (state>1)
1948 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1949#endif
8e11feef
RGS
1950 break;
1951 }
1952 }
3dab1dad 1953 if (str) {
8e11feef 1954 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1955 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1956 trie->startstate = state;
07be1b83
YO
1957 trie->minlen -= (state - 1);
1958 trie->maxlen -= (state - 1);
1959 DEBUG_r({
1960 regnode *fix = convert;
de734bd5 1961 U32 word = trie->wordcount;
07be1b83
YO
1962 mjd_nodelen++;
1963 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1964 while( ++fix < n ) {
1965 Set_Node_Offset_Length(fix, 0, 0);
1966 }
de734bd5 1967 while (word--) {
2b8b4781 1968 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1969 if (tmp) {
1970 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1971 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1972 else
1973 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1974 }
1975 }
07be1b83 1976 });
8e11feef
RGS
1977 if (trie->maxlen) {
1978 convert = n;
1979 } else {
3dab1dad 1980 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1981 DEBUG_r(optimize= n);
3dab1dad
YO
1982 }
1983 }
1984 }
a5ca303d
YO
1985 if (!jumper)
1986 jumper = last;
3dab1dad 1987 if ( trie->maxlen ) {
8e11feef
RGS
1988 NEXT_OFF( convert ) = (U16)(tail - convert);
1989 ARG_SET( convert, data_slot );
786e8c11
YO
1990 /* Store the offset to the first unabsorbed branch in
1991 jump[0], which is otherwise unused by the jump logic.
1992 We use this when dumping a trie and during optimisation. */
1993 if (trie->jump)
7f69552c 1994 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1995
786e8c11
YO
1996 /* XXXX */
1997 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1998 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1999 {
2000 OP( convert ) = TRIEC;
2001 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2002 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2003 trie->bitmap= NULL;
2004 } else
2005 OP( convert ) = TRIE;
a3621e74 2006
3dab1dad
YO
2007 /* store the type in the flags */
2008 convert->flags = nodetype;
a5ca303d
YO
2009 DEBUG_r({
2010 optimize = convert
2011 + NODE_STEP_REGNODE
2012 + regarglen[ OP( convert ) ];
2013 });
2014 /* XXX We really should free up the resource in trie now,
2015 as we won't use them - (which resources?) dmq */
3dab1dad 2016 }
a3621e74 2017 /* needed for dumping*/
e62cc96a 2018 DEBUG_r(if (optimize) {
07be1b83 2019 regnode *opt = convert;
e62cc96a 2020 while ( ++opt < optimize) {
07be1b83
YO
2021 Set_Node_Offset_Length(opt,0,0);
2022 }
786e8c11
YO
2023 /*
2024 Try to clean up some of the debris left after the
2025 optimisation.
a3621e74 2026 */
786e8c11 2027 while( optimize < jumper ) {
07be1b83 2028 mjd_nodelen += Node_Length((optimize));
a3621e74 2029 OP( optimize ) = OPTIMIZED;
07be1b83 2030 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2031 optimize++;
2032 }
07be1b83 2033 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2034 });
2035 } /* end node insert */
55eed653 2036 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2037#ifdef DEBUGGING
2038 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2039 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2040#else
2041 SvREFCNT_dec(revcharmap);
07be1b83 2042#endif
786e8c11
YO
2043 return trie->jump
2044 ? MADE_JUMP_TRIE
2045 : trie->startstate>1
2046 ? MADE_EXACT_TRIE
2047 : MADE_TRIE;
2048}
2049
2050STATIC void
2051S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2052{
2053/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2054
2055 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2056 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2057 ISBN 0-201-10088-6
2058
2059 We find the fail state for each state in the trie, this state is the longest proper
2060 suffix of the current states 'word' that is also a proper prefix of another word in our
2061 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2062 the DFA not to have to restart after its tried and failed a word at a given point, it
2063 simply continues as though it had been matching the other word in the first place.
2064 Consider
2065 'abcdgu'=~/abcdefg|cdgu/
2066 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2067 fail, which would bring use to the state representing 'd' in the second word where we would
2068 try 'g' and succeed, prodceding to match 'cdgu'.
2069 */
2070 /* add a fail transition */
3251b653
NC
2071 const U32 trie_offset = ARG(source);
2072 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2073 U32 *q;
2074 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2075 const U32 numstates = trie->statecount;
786e8c11
YO
2076 const U32 ubound = trie->lasttrans + ucharcount;
2077 U32 q_read = 0;
2078 U32 q_write = 0;
2079 U32 charid;
2080 U32 base = trie->states[ 1 ].trans.base;
2081 U32 *fail;
2082 reg_ac_data *aho;
2083 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2084 GET_RE_DEBUG_FLAGS_DECL;
2085#ifndef DEBUGGING
2086 PERL_UNUSED_ARG(depth);
2087#endif
2088
2089
2090 ARG_SET( stclass, data_slot );
446bd890 2091 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2092 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2093 aho->trie=trie_offset;
446bd890
NC
2094 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2095 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2096 Newxz( q, numstates, U32);
446bd890 2097 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2098 aho->refcount = 1;
2099 fail = aho->fail;
2100 /* initialize fail[0..1] to be 1 so that we always have
2101 a valid final fail state */
2102 fail[ 0 ] = fail[ 1 ] = 1;
2103
2104 for ( charid = 0; charid < ucharcount ; charid++ ) {
2105 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2106 if ( newstate ) {
2107 q[ q_write ] = newstate;
2108 /* set to point at the root */
2109 fail[ q[ q_write++ ] ]=1;
2110 }
2111 }
2112 while ( q_read < q_write) {
2113 const U32 cur = q[ q_read++ % numstates ];
2114 base = trie->states[ cur ].trans.base;
2115
2116 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2117 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2118 if (ch_state) {
2119 U32 fail_state = cur;
2120 U32 fail_base;
2121 do {
2122 fail_state = fail[ fail_state ];
2123 fail_base = aho->states[ fail_state ].trans.base;
2124 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2125
2126 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2127 fail[ ch_state ] = fail_state;
2128 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2129 {
2130 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2131 }
2132 q[ q_write++ % numstates] = ch_state;
2133 }
2134 }
2135 }
2136 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2137 when we fail in state 1, this allows us to use the
2138 charclass scan to find a valid start char. This is based on the principle
2139 that theres a good chance the string being searched contains lots of stuff
2140 that cant be a start char.
2141 */
2142 fail[ 0 ] = fail[ 1 ] = 0;
2143 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2144 PerlIO_printf(Perl_debug_log,
2145 "%*sStclass Failtable (%"UVuf" states): 0",
2146 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2147 );
786e8c11
YO
2148 for( q_read=1; q_read<numstates; q_read++ ) {
2149 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2150 }
2151 PerlIO_printf(Perl_debug_log, "\n");
2152 });
2153 Safefree(q);
2154 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2155}
2156
786e8c11 2157
a3621e74 2158/*
5d1c421c
JH
2159 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2160 * These need to be revisited when a newer toolchain becomes available.
2161 */
2162#if defined(__sparc64__) && defined(__GNUC__)
2163# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2164# undef SPARC64_GCC_WORKAROUND
2165# define SPARC64_GCC_WORKAROUND 1
2166# endif
2167#endif
2168
07be1b83 2169#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2170 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2171 SV * const mysv=sv_newmortal(); \
2172 regnode *Next = regnext(scan); \
2173 regprop(RExC_rx, mysv, scan); \
7f69552c 2174 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2175 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2176 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2177 }});
07be1b83 2178
1de06328
YO
2179
2180
2181
2182
07be1b83
YO
2183#define JOIN_EXACT(scan,min,flags) \
2184 if (PL_regkind[OP(scan)] == EXACT) \
2185 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2186
be8e71aa 2187STATIC U32
07be1b83
YO
2188S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2189 /* Merge several consecutive EXACTish nodes into one. */
2190 regnode *n = regnext(scan);
2191 U32 stringok = 1;
2192 regnode *next = scan + NODE_SZ_STR(scan);
2193 U32 merged = 0;
2194 U32 stopnow = 0;
2195#ifdef DEBUGGING
2196 regnode *stop = scan;
72f13be8 2197 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2198#else
d47053eb
RGS
2199 PERL_UNUSED_ARG(depth);
2200#endif
2201#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2202 PERL_UNUSED_ARG(flags);
2203 PERL_UNUSED_ARG(val);
07be1b83 2204#endif
07be1b83
YO
2205 DEBUG_PEEP("join",scan,depth);
2206
2207 /* Skip NOTHING, merge EXACT*. */
2208 while (n &&
2209 ( PL_regkind[OP(n)] == NOTHING ||
2210 (stringok && (OP(n) == OP(scan))))
2211 && NEXT_OFF(n)
2212 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2213
2214 if (OP(n) == TAIL || n > next)
2215 stringok = 0;
2216 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2217 DEBUG_PEEP("skip:",n,depth);
2218 NEXT_OFF(scan) += NEXT_OFF(n);
2219 next = n + NODE_STEP_REGNODE;
2220#ifdef DEBUGGING
2221 if (stringok)
2222 stop = n;
2223#endif
2224 n = regnext(n);
2225 }
2226 else if (stringok) {
786e8c11 2227 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2228 regnode * const nnext = regnext(n);
2229
2230 DEBUG_PEEP("merg",n,depth);
2231
2232 merged++;
2233 if (oldl + STR_LEN(n) > U8_MAX)
2234 break;
2235 NEXT_OFF(scan) += NEXT_OFF(n);
2236 STR_LEN(scan) += STR_LEN(n);
2237 next = n + NODE_SZ_STR(n);
2238 /* Now we can overwrite *n : */
2239 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2240#ifdef DEBUGGING
2241 stop = next - 1;
2242#endif
2243 n = nnext;
2244 if (stopnow) break;
2245 }
2246
d47053eb
RGS
2247#ifdef EXPERIMENTAL_INPLACESCAN
2248 if (flags && !NEXT_OFF(n)) {
2249 DEBUG_PEEP("atch", val, depth);
2250 if (reg_off_by_arg[OP(n)]) {
2251 ARG_SET(n, val - n);
2252 }
2253 else {
2254 NEXT_OFF(n) = val - n;
2255 }
2256 stopnow = 1;
2257 }
07be1b83
YO
2258#endif
2259 }
2260
2261 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2262 /*
2263 Two problematic code points in Unicode casefolding of EXACT nodes:
2264
2265 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2266 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2267
2268 which casefold to
2269
2270 Unicode UTF-8
2271
2272 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2273 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2274
2275 This means that in case-insensitive matching (or "loose matching",
2276 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2277 length of the above casefolded versions) can match a target string
2278 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2279 This would rather mess up the minimum length computation.
2280
2281 What we'll do is to look for the tail four bytes, and then peek
2282 at the preceding two bytes to see whether we need to decrease
2283 the minimum length by four (six minus two).
2284
2285 Thanks to the design of UTF-8, there cannot be false matches:
2286 A sequence of valid UTF-8 bytes cannot be a subsequence of
2287 another valid sequence of UTF-8 bytes.
2288
2289 */
2290 char * const s0 = STRING(scan), *s, *t;
2291 char * const s1 = s0 + STR_LEN(scan) - 1;
2292 char * const s2 = s1 - 4;
e294cc5d
JH
2293#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2294 const char t0[] = "\xaf\x49\xaf\x42";
2295#else
07be1b83 2296 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2297#endif
07be1b83
YO
2298 const char * const t1 = t0 + 3;
2299
2300 for (s = s0 + 2;
2301 s < s2 && (t = ninstr(s, s1, t0, t1));
2302 s = t + 4) {
e294cc5d
JH
2303#ifdef EBCDIC
2304 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2305 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2306#else
07be1b83
YO
2307 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2308 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2309#endif
07be1b83
YO
2310 *min -= 4;
2311 }
2312 }
2313
2314#ifdef DEBUGGING
2315 /* Allow dumping */
2316 n = scan + NODE_SZ_STR(scan);
2317 while (n <= stop) {
2318 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2319 OP(n) = OPTIMIZED;
2320 NEXT_OFF(n) = 0;
2321 }
2322 n++;
2323 }
2324#endif
2325 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2326 return stopnow;
2327}
2328
653099ff
GS
2329/* REx optimizer. Converts nodes into quickier variants "in place".
2330 Finds fixed substrings. */
2331
a0288114 2332/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2333 to the position after last scanned or to NULL. */
2334
40d049e4
YO
2335#define INIT_AND_WITHP \
2336 assert(!and_withp); \
2337 Newx(and_withp,1,struct regnode_charclass_class); \
2338 SAVEFREEPV(and_withp)
07be1b83 2339
b515a41d
YO
2340/* this is a chain of data about sub patterns we are processing that
2341 need to be handled seperately/specially in study_chunk. Its so
2342 we can simulate recursion without losing state. */
2343struct scan_frame;
2344typedef struct scan_frame {
2345 regnode *last; /* last node to process in this frame */
2346 regnode *next; /* next node to process when last is reached */
2347 struct scan_frame *prev; /*previous frame*/
2348 I32 stop; /* what stopparen do we use */
2349} scan_frame;
2350
304ee84b
YO
2351
2352#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2353
76e3520e 2354STATIC I32
40d049e4 2355S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2356 I32 *minlenp, I32 *deltap,
40d049e4
YO
2357 regnode *last,
2358 scan_data_t *data,
2359 I32 stopparen,
2360 U8* recursed,
2361 struct regnode_charclass_class *and_withp,
2362 U32 flags, U32 depth)
c277df42
IZ
2363 /* scanp: Start here (read-write). */
2364 /* deltap: Write maxlen-minlen here. */
2365 /* last: Stop before this one. */
40d049e4
YO
2366 /* data: string data about the pattern */
2367 /* stopparen: treat close N as END */
2368 /* recursed: which subroutines have we recursed into */
2369 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2370{
97aff369 2371 dVAR;
c277df42
IZ
2372 I32 min = 0, pars = 0, code;
2373 regnode *scan = *scanp, *next;
2374 I32 delta = 0;
2375 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2376 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2377 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2378 scan_data_t data_fake;
a3621e74 2379 SV *re_trie_maxbuff = NULL;
786e8c11 2380 regnode *first_non_open = scan;
e2e6a0f1 2381 I32 stopmin = I32_MAX;
8aa23a47
YO
2382 scan_frame *frame = NULL;
2383
a3621e74 2384 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2385
13a24bad 2386#ifdef DEBUGGING
40d049e4 2387 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2388#endif
40d049e4 2389
786e8c11 2390 if ( depth == 0 ) {
40d049e4 2391 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2392 first_non_open=regnext(first_non_open);
2393 }
2394
b81d288d 2395
8aa23a47
YO
2396 fake_study_recurse:
2397 while ( scan && OP(scan) != END && scan < last ){
2398 /* Peephole optimizer: */
304ee84b 2399 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2400 DEBUG_PEEP("Peep",scan,depth);
2401 JOIN_EXACT(scan,&min,0);
2402
2403 /* Follow the next-chain of the current node and optimize
2404 away all the NOTHINGs from it. */
2405 if (OP(scan) != CURLYX) {
2406 const int max = (reg_off_by_arg[OP(scan)]
2407 ? I32_MAX
2408 /* I32 may be smaller than U16 on CRAYs! */
2409 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2410 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2411 int noff;
2412 regnode *n = scan;
2413
2414 /* Skip NOTHING and LONGJMP. */
2415 while ((n = regnext(n))
2416 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2417 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2418 && off + noff < max)
2419 off += noff;
2420 if (reg_off_by_arg[OP(scan)])
2421 ARG(scan) = off;
2422 else
2423 NEXT_OFF(scan) = off;
2424 }
a3621e74 2425
c277df42 2426
8aa23a47
YO
2427
2428 /* The principal pseudo-switch. Cannot be a switch, since we
2429 look into several different things. */
2430 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2431 || OP(scan) == IFTHEN) {
2432 next = regnext(scan);
2433 code = OP(scan);
2434 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2435
2436 if (OP(next) == code || code == IFTHEN) {
2437 /* NOTE - There is similar code to this block below for handling
2438 TRIE nodes on a re-study. If you change stuff here check there
2439 too. */
2440 I32 max1 = 0, min1 = I32_MAX, num = 0;
2441 struct regnode_charclass_class accum;
2442 regnode * const startbranch=scan;
2443
2444 if (flags & SCF_DO_SUBSTR)
304ee84b 2445 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2446 if (flags & SCF_DO_STCLASS)
2447 cl_init_zero(pRExC_state, &accum);
2448
2449 while (OP(scan) == code) {
2450 I32 deltanext, minnext, f = 0, fake;
2451 struct regnode_charclass_class this_class;
2452
2453 num++;
2454 data_fake.flags = 0;
2455 if (data) {
2456 data_fake.whilem_c = data->whilem_c;
2457 data_fake.last_closep = data->last_closep;
2458 }
2459 else
2460 data_fake.last_closep = &fake;
58e23c8d
YO
2461
2462 data_fake.pos_delta = delta;
8aa23a47
YO
2463 next = regnext(scan);
2464 scan = NEXTOPER(scan);
2465 if (code != BRANCH)
c277df42 2466 scan = NEXTOPER(scan);
8aa23a47
YO
2467 if (flags & SCF_DO_STCLASS) {
2468 cl_init(pRExC_state, &this_class);
2469 data_fake.start_class = &this_class;
2470 f = SCF_DO_STCLASS_AND;
58e23c8d 2471 }
8aa23a47
YO
2472 if (flags & SCF_WHILEM_VISITED_POS)
2473 f |= SCF_WHILEM_VISITED_POS;
2474
2475 /* we suppose the run is continuous, last=next...*/
2476 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2477 next, &data_fake,
2478 stopparen, recursed, NULL, f,depth+1);
2479 if (min1 > minnext)
2480 min1 = minnext;
2481 if (max1 < minnext + deltanext)
2482 max1 = minnext + deltanext;
2483 if (deltanext == I32_MAX)
2484 is_inf = is_inf_internal = 1;
2485 scan = next;
2486 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2487 pars++;
2488 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2489 if ( stopmin > minnext)
2490 stopmin = min + min1;
2491 flags &= ~SCF_DO_SUBSTR;
2492 if (data)
2493 data->flags |= SCF_SEEN_ACCEPT;
2494 }
2495 if (data) {
2496 if (data_fake.flags & SF_HAS_EVAL)
2497 data->flags |= SF_HAS_EVAL;
2498 data->whilem_c = data_fake.whilem_c;
3dab1dad 2499 }
8aa23a47
YO
2500 if (flags & SCF_DO_STCLASS)
2501 cl_or(pRExC_state, &accum, &this_class);
2502 }
2503 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2504 min1 = 0;
2505 if (flags & SCF_DO_SUBSTR) {
2506 data->pos_min += min1;
2507 data->pos_delta += max1 - min1;
2508 if (max1 != min1 || is_inf)
2509 data->longest = &(data->longest_float);
2510 }
2511 min += min1;
2512 delta += max1 - min1;
2513 if (flags & SCF_DO_STCLASS_OR) {
2514 cl_or(pRExC_state, data->start_class, &accum);
2515 if (min1) {
2516 cl_and(data->start_class, and_withp);
2517 flags &= ~SCF_DO_STCLASS;
653099ff 2518 }
8aa23a47
YO
2519 }
2520 else if (flags & SCF_DO_STCLASS_AND) {
2521 if (min1) {
2522 cl_and(data->start_class, &accum);
2523 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2524 }
8aa23a47
YO
2525 else {
2526 /* Switch to OR mode: cache the old value of
2527 * data->start_class */
2528 INIT_AND_WITHP;
2529 StructCopy(data->start_class, and_withp,
2530 struct regnode_charclass_class);
2531 flags &= ~SCF_DO_STCLASS_AND;
2532 StructCopy(&accum, data->start_class,
2533 struct regnode_charclass_class);
2534 flags |= SCF_DO_STCLASS_OR;
2535 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2536 }
8aa23a47 2537 }
a3621e74 2538
8aa23a47
YO
2539 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2540 /* demq.
a3621e74 2541
8aa23a47
YO
2542 Assuming this was/is a branch we are dealing with: 'scan' now
2543 points at the item that follows the branch sequence, whatever
2544 it is. We now start at the beginning of the sequence and look
2545 for subsequences of
a3621e74 2546
8aa23a47
YO
2547 BRANCH->EXACT=>x1
2548 BRANCH->EXACT=>x2
2549 tail
a3621e74 2550
8aa23a47 2551 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2552
8aa23a47
YO
2553 If we can find such a subseqence we need to turn the first
2554 element into a trie and then add the subsequent branch exact
2555 strings to the trie.
a3621e74 2556
8aa23a47 2557 We have two cases
a3621e74 2558
8aa23a47 2559 1. patterns where the whole set of branch can be converted.
a3621e74 2560
8aa23a47 2561 2. patterns where only a subset can be converted.
a3621e74 2562
8aa23a47
YO
2563 In case 1 we can replace the whole set with a single regop
2564 for the trie. In case 2 we need to keep the start and end
2565 branchs so
a3621e74 2566
8aa23a47
YO
2567 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2568 becomes BRANCH TRIE; BRANCH X;
786e8c11 2569
8aa23a47
YO
2570 There is an additional case, that being where there is a
2571 common prefix, which gets split out into an EXACT like node
2572 preceding the TRIE node.
a3621e74 2573
8aa23a47
YO
2574 If x(1..n)==tail then we can do a simple trie, if not we make
2575 a "jump" trie, such that when we match the appropriate word
2576 we "jump" to the appopriate tail node. Essentailly we turn
2577 a nested if into a case structure of sorts.
b515a41d 2578
8aa23a47
YO
2579 */
2580
2581 int made=0;
2582 if (!re_trie_maxbuff) {
2583 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2584 if (!SvIOK(re_trie_maxbuff))
2585 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2586 }
2587 if ( SvIV(re_trie_maxbuff)>=0 ) {
2588 regnode *cur;
2589 regnode *first = (regnode *)NULL;
2590 regnode *last = (regnode *)NULL;
2591 regnode *tail = scan;
2592 U8 optype = 0;
2593 U32 count=0;
a3621e74
YO
2594
2595#ifdef DEBUGGING
8aa23a47 2596 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2597#endif
8aa23a47
YO
2598 /* var tail is used because there may be a TAIL
2599 regop in the way. Ie, the exacts will point to the
2600 thing following the TAIL, but the last branch will
2601 point at the TAIL. So we advance tail. If we
2602 have nested (?:) we may have to move through several
2603 tails.
2604 */
2605
2606 while ( OP( tail ) == TAIL ) {
2607 /* this is the TAIL generated by (?:) */
2608 tail = regnext( tail );
2609 }
a3621e74 2610
8aa23a47
YO
2611
2612 DEBUG_OPTIMISE_r({
2613 regprop(RExC_rx, mysv, tail );
2614 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2615 (int)depth * 2 + 2, "",
2616 "Looking for TRIE'able sequences. Tail node is: ",
2617 SvPV_nolen_const( mysv )
2618 );
2619 });
2620
2621 /*
2622
2623 step through the branches, cur represents each
2624 branch, noper is the first thing to be matched
2625 as part of that branch and noper_next is the
2626 regnext() of that node. if noper is an EXACT
2627 and noper_next is the same as scan (our current
2628 position in the regex) then the EXACT branch is
2629 a possible optimization target. Once we have
2630 two or more consequetive such branches we can
2631 create a trie of the EXACT's contents and stich
2632 it in place. If the sequence represents all of
2633 the branches we eliminate the whole thing and
2634 replace it with a single TRIE. If it is a
2635 subsequence then we need to stitch it in. This
2636 means the first branch has to remain, and needs
2637 to be repointed at the item on the branch chain
2638 following the last branch optimized. This could
2639 be either a BRANCH, in which case the
2640 subsequence is internal, or it could be the
2641 item following the branch sequence in which
2642 case the subsequence is at the end.
2643
2644 */
2645
2646 /* dont use tail as the end marker for this traverse */
2647 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2648 regnode * const noper = NEXTOPER( cur );
b515a41d 2649#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2650 regnode * const noper_next = regnext( noper );
b515a41d
YO
2651#endif
2652
8aa23a47
YO
2653 DEBUG_OPTIMISE_r({
2654 regprop(RExC_rx, mysv, cur);
2655 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2656 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2657
2658 regprop(RExC_rx, mysv, noper);
2659 PerlIO_printf( Perl_debug_log, " -> %s",
2660 SvPV_nolen_const(mysv));
2661
2662 if ( noper_next ) {
2663 regprop(RExC_rx, mysv, noper_next );
2664 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2665 SvPV_nolen_const(mysv));
2666 }
2667 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2668 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2669 });
2670 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2671 : PL_regkind[ OP( noper ) ] == EXACT )
2672 || OP(noper) == NOTHING )
786e8c11 2673#ifdef NOJUMPTRIE
8aa23a47 2674 && noper_next == tail
786e8c11 2675#endif
8aa23a47
YO
2676 && count < U16_MAX)
2677 {
2678 count++;
2679 if ( !first || optype == NOTHING ) {
2680 if (!first) first = cur;
2681 optype = OP( noper );
2682 } else {
2683 last = cur;
2684 }
2685 } else {
2686 if ( last ) {
2687 make_trie( pRExC_state,
2688 startbranch, first, cur, tail, count,
2689 optype, depth+1 );
2690 }
2691 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2692#ifdef NOJUMPTRIE
8aa23a47 2693 && noper_next == tail
786e8c11 2694#endif
8aa23a47
YO
2695 ){
2696 count = 1;
2697 first = cur;
2698 optype = OP( noper );
2699 } else {
2700 count = 0;
2701 first = NULL;
2702 optype = 0;
2703 }
2704 last = NULL;
2705 }
2706 }
2707 DEBUG_OPTIMISE_r({
2708 regprop(RExC_rx, mysv, cur);
2709 PerlIO_printf( Perl_debug_log,
2710 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2711 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2712
2713 });
2714 if ( last ) {
2715 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2716#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2717 if ( ((made == MADE_EXACT_TRIE &&
2718 startbranch == first)
2719 || ( first_non_open == first )) &&
2720 depth==0 ) {
2721 flags |= SCF_TRIE_RESTUDY;
2722 if ( startbranch == first
2723 && scan == tail )
2724 {
2725 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2726 }
2727 }
3dab1dad 2728#endif
8aa23a47
YO
2729 }
2730 }
2731
2732 } /* do trie */
2733
653099ff 2734 }
8aa23a47
YO
2735 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2736 scan = NEXTOPER(NEXTOPER(scan));
2737 } else /* single branch is optimized. */
2738 scan = NEXTOPER(scan);
2739 continue;
2740 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2741 scan_frame *newframe = NULL;
2742 I32 paren;
2743 regnode *start;
2744 regnode *end;
2745
2746 if (OP(scan) != SUSPEND) {
2747 /* set the pointer */
2748 if (OP(scan) == GOSUB) {
2749 paren = ARG(scan);
2750 RExC_recurse[ARG2L(scan)] = scan;
2751 start = RExC_open_parens[paren-1];
2752 end = RExC_close_parens[paren-1];
2753 } else {
2754 paren = 0;
f8fc2ecf 2755 start = RExC_rxi->program + 1;
8aa23a47
YO
2756 end = RExC_opend;
2757 }
2758 if (!recursed) {
2759 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2760 SAVEFREEPV(recursed);
2761 }
2762 if (!PAREN_TEST(recursed,paren+1)) {
2763 PAREN_SET(recursed,paren+1);
2764 Newx(newframe,1,scan_frame);
2765 } else {
2766 if (flags & SCF_DO_SUBSTR) {
304ee84b 2767 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2768 data->longest = &(data->longest_float);
2769 }
2770 is_inf = is_inf_internal = 1;
2771 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2772 cl_anything(pRExC_state, data->start_class);
2773 flags &= ~SCF_DO_STCLASS;
2774 }
2775 } else {
2776 Newx(newframe,1,scan_frame);
2777 paren = stopparen;
2778 start = scan+2;
2779 end = regnext(scan);
2780 }
2781 if (newframe) {
2782 assert(start);
2783 assert(end);
2784 SAVEFREEPV(newframe);
2785 newframe->next = regnext(scan);
2786 newframe->last = last;
2787 newframe->stop = stopparen;
2788 newframe->prev = frame;
2789
2790 frame = newframe;
2791 scan = start;
2792 stopparen = paren;
2793 last = end;
2794
2795 continue;
2796 }
2797 }
2798 else if (OP(scan) == EXACT) {
2799 I32 l = STR_LEN(scan);
2800 UV uc;
2801 if (UTF) {
2802 const U8 * const s = (U8*)STRING(scan);
2803 l = utf8_length(s, s + l);
2804 uc = utf8_to_uvchr(s, NULL);
2805 } else {
2806 uc = *((U8*)STRING(scan));
2807 }
2808 min += l;
2809 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2810 /* The code below prefers earlier match for fixed
2811 offset, later match for variable offset. */
2812 if (data->last_end == -1) { /* Update the start info. */
2813 data->last_start_min = data->pos_min;
2814 data->last_start_max = is_inf
2815 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2816 }
8aa23a47
YO
2817 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2818 if (UTF)
2819 SvUTF8_on(data->last_found);
2820 {
2821 SV * const sv = data->last_found;
2822 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2823 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2824 if (mg && mg->mg_len >= 0)
2825 mg->mg_len += utf8_length((U8*)STRING(scan),
2826 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2827 }
8aa23a47
YO
2828 data->last_end = data->pos_min + l;
2829 data->pos_min += l; /* As in the first entry. */
2830 data->flags &= ~SF_BEFORE_EOL;
2831 }
2832 if (flags & SCF_DO_STCLASS_AND) {
2833 /* Check whether it is compatible with what we know already! */
2834 int compat = 1;
2835
2836 if (uc >= 0x100 ||
2837 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2838 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2839 && (!(data->start_class->flags & ANYOF_FOLD)
2840 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2841 )
2842 compat = 0;
2843 ANYOF_CLASS_ZERO(data->start_class);
2844 ANYOF_BITMAP_ZERO(data->start_class);
2845 if (compat)
2846 ANYOF_BITMAP_SET(data->start_class, uc);
2847 data->start_class->flags &= ~ANYOF_EOS;
2848 if (uc < 0x100)
2849 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2850 }
2851 else if (flags & SCF_DO_STCLASS_OR) {
2852 /* false positive possible if the class is case-folded */
2853 if (uc < 0x100)
2854 ANYOF_BITMAP_SET(data->start_class, uc);
2855 else
2856 data->start_class->flags |= ANYOF_UNICODE_ALL;
2857 data->start_class->flags &= ~ANYOF_EOS;
2858 cl_and(data->start_class, and_withp);
2859 }
2860 flags &= ~SCF_DO_STCLASS;
2861 }
2862 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2863 I32 l = STR_LEN(scan);
2864 UV uc = *((U8*)STRING(scan));
2865
2866 /* Search for fixed substrings supports EXACT only. */
2867 if (flags & SCF_DO_SUBSTR) {
2868 assert(data);
304ee84b 2869 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2870 }
2871 if (UTF) {
2872 const U8 * const s = (U8 *)STRING(scan);
2873 l = utf8_length(s, s + l);
2874 uc = utf8_to_uvchr(s, NULL);
2875 }
2876 min += l;
2877 if (flags & SCF_DO_SUBSTR)
2878 data->pos_min += l;
2879 if (flags & SCF_DO_STCLASS_AND) {
2880 /* Check whether it is compatible with what we know already! */
2881 int compat = 1;
2882
2883 if (uc >= 0x100 ||
2884 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2885 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2886 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2887 compat = 0;
2888 ANYOF_CLASS_ZERO(data->start_class);
2889 ANYOF_BITMAP_ZERO(data->start_class);
2890 if (compat) {
2891 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2892 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2893 data->start_class->flags |= ANYOF_FOLD;
2894 if (OP(scan) == EXACTFL)
2895 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2896 }
8aa23a47
YO
2897 }
2898 else if (flags & SCF_DO_STCLASS_OR) {
2899 if (data->start_class->flags & ANYOF_FOLD) {
2900 /* false positive possible if the class is case-folded.
2901 Assume that the locale settings are the same... */
1aa99e6b
IH
2902 if (uc < 0x100)
2903 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2904 data->start_class->flags &= ~ANYOF_EOS;
2905 }
8aa23a47 2906 cl_and(data->start_class, and_withp);
653099ff 2907 }
8aa23a47
YO
2908 flags &= ~SCF_DO_STCLASS;
2909 }
2910 else if (strchr((const char*)PL_varies,OP(scan))) {
2911 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2912 I32 f = flags, pos_before = 0;
2913 regnode * const oscan = scan;
2914 struct regnode_charclass_class this_class;
2915 struct regnode_charclass_class *oclass = NULL;
2916 I32 next_is_eval = 0;
2917
2918 switch (PL_regkind[OP(scan)]) {
2919 case WHILEM: /* End of (?:...)* . */
2920 scan = NEXTOPER(scan);
2921 goto finish;
2922 case PLUS:
2923 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2924 next = NEXTOPER(scan);
2925 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2926 mincount = 1;
2927 maxcount = REG_INFTY;
2928 next = regnext(scan);
2929 scan = NEXTOPER(scan);
2930 goto do_curly;
2931 }
2932 }
2933 if (flags & SCF_DO_SUBSTR)
2934 data->pos_min++;
2935 min++;
2936 /* Fall through. */
2937 case STAR:
2938 if (flags & SCF_DO_STCLASS) {
2939 mincount = 0;
2940 maxcount = REG_INFTY;
2941 next = regnext(scan);
2942 scan = NEXTOPER(scan);
2943 goto do_curly;
2944 }
2945 is_inf = is_inf_internal = 1;
2946 scan = regnext(scan);
c277df42 2947 if (flags & SCF_DO_SUBSTR) {
304ee84b 2948 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2949 data->longest = &(data->longest_float);
c277df42 2950 }
8aa23a47
YO
2951 goto optimize_curly_tail;
2952 case CURLY:
2953 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2954 && (scan->flags == stopparen))
2955 {
2956 mincount = 1;
2957 maxcount = 1;
2958 } else {
2959 mincount = ARG1(scan);
2960 maxcount = ARG2(scan);
653099ff 2961 }
8aa23a47
YO
2962 next = regnext(scan);
2963 if (OP(scan) == CURLYX) {
2964 I32 lp = (data ? *(data->last_closep) : 0);
2965 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2966 }
8aa23a47
YO
2967 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2968 next_is_eval = (OP(scan) == EVAL);
2969 do_curly:
2970 if (flags & SCF_DO_SUBSTR) {
304ee84b 2971 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2972 pos_before = data->pos_min;
b45f050a 2973 }
8aa23a47
YO
2974 if (data) {
2975 fl = data->flags;
2976 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2977 if (is_inf)
2978 data->flags |= SF_IS_INF;
2979 }
2980 if (flags & SCF_DO_STCLASS) {
2981 cl_init(pRExC_state, &this_class);
2982 oclass = data->start_class;
2983 data->start_class = &this_class;
2984 f |= SCF_DO_STCLASS_AND;
2985 f &= ~SCF_DO_STCLASS_OR;
2986 }
2987 /* These are the cases when once a subexpression
2988 fails at a particular position, it cannot succeed
2989 even after backtracking at the enclosing scope.
2990
2991 XXXX what if minimal match and we are at the
2992 initial run of {n,m}? */
2993 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2994 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2995
8aa23a47
YO
2996 /* This will finish on WHILEM, setting scan, or on NULL: */
2997 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2998 last, data, stopparen, recursed, NULL,
2999 (mincount == 0
3000 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3001
8aa23a47
YO
3002 if (flags & SCF_DO_STCLASS)
3003 data->start_class = oclass;
3004 if (mincount == 0 || minnext == 0) {
3005 if (flags & SCF_DO_STCLASS_OR) {
3006 cl_or(pRExC_state, data->start_class, &this_class);
3007 }
3008 else if (flags & SCF_DO_STCLASS_AND) {
3009 /* Switch to OR mode: cache the old value of
3010 * data->start_class */
3011 INIT_AND_WITHP;
3012 StructCopy(data->start_class, and_withp,
3013 struct regnode_charclass_class);
3014 flags &= ~SCF_DO_STCLASS_AND;
3015 StructCopy(&this_class, data->start_class,
3016 struct regnode_charclass_class);
3017 flags |= SCF_DO_STCLASS_OR;
3018 data->start_class->flags |= ANYOF_EOS;
3019 }
3020 } else { /* Non-zero len */
3021 if (flags & SCF_DO_STCLASS_OR) {
3022 cl_or(pRExC_state, data->start_class, &this_class);
3023 cl_and(data->start_class, and_withp);
3024 }
3025 else if (flags & SCF_DO_STCLASS_AND)
3026 cl_and(data->start_class, &this_class);
3027 flags &= ~SCF_DO_STCLASS;
3028 }
3029 if (!scan) /* It was not CURLYX, but CURLY. */
3030 scan = next;
3031 if ( /* ? quantifier ok, except for (?{ ... }) */
3032 (next_is_eval || !(mincount == 0 && maxcount == 1))
3033 && (minnext == 0) && (deltanext == 0)
3034 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3035 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3036 && ckWARN(WARN_REGEXP))
3037 {
3038 vWARN(RExC_parse,
3039 "Quantifier unexpected on zero-length expression");
3040 }
3041
3042 min += minnext * mincount;
3043 is_inf_internal |= ((maxcount == REG_INFTY
3044 && (minnext + deltanext) > 0)
3045 || deltanext == I32_MAX);
3046 is_inf |= is_inf_internal;
3047 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3048
3049 /* Try powerful optimization CURLYX => CURLYN. */
3050 if ( OP(oscan) == CURLYX && data
3051 && data->flags & SF_IN_PAR
3052 && !(data->flags & SF_HAS_EVAL)
3053 && !deltanext && minnext == 1 ) {
3054 /* Try to optimize to CURLYN. */
3055 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3056 regnode * const nxt1 = nxt;
497b47a8 3057#ifdef DEBUGGING
8aa23a47 3058 regnode *nxt2;
497b47a8 3059#endif
c277df42 3060
8aa23a47
YO
3061 /* Skip open. */
3062 nxt = regnext(nxt);
3063 if (!strchr((const char*)PL_simple,OP(nxt))
3064 && !(PL_regkind[OP(nxt)] == EXACT
3065 && STR_LEN(nxt) == 1))
3066 goto nogo;
497b47a8 3067#ifdef DEBUGGING
8aa23a47 3068 nxt2 = nxt;
497b47a8 3069#endif
8aa23a47
YO
3070 nxt = regnext(nxt);
3071 if (OP(nxt) != CLOSE)
3072 goto nogo;
3073 if (RExC_open_parens) {
3074 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3075 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3076 }
3077 /* Now we know that nxt2 is the only contents: */
3078 oscan->flags = (U8)ARG(nxt);
3079 OP(oscan) = CURLYN;
3080 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3081
c277df42 3082#ifdef DEBUGGING
8aa23a47
YO
3083 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3084 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3085 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3086 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3087 OP(nxt + 1) = OPTIMIZED; /* was count. */
3088 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3089#endif
8aa23a47
YO
3090 }
3091 nogo:
3092
3093 /* Try optimization CURLYX => CURLYM. */
3094 if ( OP(oscan) == CURLYX && data
3095 && !(data->flags & SF_HAS_PAR)
3096 && !(data->flags & SF_HAS_EVAL)
3097 && !deltanext /* atom is fixed width */
3098 && minnext != 0 /* CURLYM can't handle zero width */
3099 ) {
3100 /* XXXX How to optimize if data == 0? */
3101 /* Optimize to a simpler form. */
3102 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3103 regnode *nxt2;
3104
3105 OP(oscan) = CURLYM;
3106 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3107 && (OP(nxt2) != WHILEM))
3108 nxt = nxt2;
3109 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3110 /* Need to optimize away parenths. */
3111 if (data->flags & SF_IN_PAR) {
3112 /* Set the parenth number. */
3113 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3114
3115 if (OP(nxt) != CLOSE)
3116 FAIL("Panic opt close");
3117 oscan->flags = (U8)ARG(nxt);
3118 if (RExC_open_parens) {
3119 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3120 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3121 }
8aa23a47
YO
3122 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3123 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3124
c277df42 3125#ifdef DEBUGGING
8aa23a47
YO
3126 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3127 OP(nxt + 1) = OPTIMIZED; /* was count. */
3128 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3129 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3130#endif
c277df42 3131#if 0
8aa23a47
YO
3132 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3133 regnode *nnxt = regnext(nxt1);
3134
3135 if (nnxt == nxt) {
3136 if (reg_off_by_arg[OP(nxt1)])
3137 ARG_SET(nxt1, nxt2 - nxt1);
3138 else if (nxt2 - nxt1 < U16_MAX)
3139 NEXT_OFF(nxt1) = nxt2 - nxt1;
3140 else
3141 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3142 }
8aa23a47 3143 nxt1 = nnxt;
c277df42 3144 }
5d1c421c 3145#endif
8aa23a47
YO
3146 /* Optimize again: */
3147 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3148 NULL, stopparen, recursed, NULL, 0,depth+1);
3149 }
3150 else
3151 oscan->flags = 0;
3152 }
3153 else if ((OP(oscan) == CURLYX)
3154 && (flags & SCF_WHILEM_VISITED_POS)
3155 /* See the comment on a similar expression above.
3156 However, this time it not a subexpression
3157 we care about, but the expression itself. */
3158 && (maxcount == REG_INFTY)
3159 && data && ++data->whilem_c < 16) {
3160 /* This stays as CURLYX, we can put the count/of pair. */
3161 /* Find WHILEM (as in regexec.c) */
3162 regnode *nxt = oscan + NEXT_OFF(oscan);
3163
3164 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3165 nxt += ARG(nxt);
3166 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3167 | (RExC_whilem_seen << 4)); /* On WHILEM */
3168 }
3169 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3170 pars++;
3171 if (flags & SCF_DO_SUBSTR) {
3172 SV *last_str = NULL;
3173 int counted = mincount != 0;
a0ed51b3 3174
8aa23a47
YO
3175 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3176#if defined(SPARC64_GCC_WORKAROUND)
3177 I32 b = 0;
3178 STRLEN l = 0;
3179 const char *s = NULL;
3180 I32 old = 0;
b515a41d 3181
8aa23a47
YO
3182 if (pos_before >= data->last_start_min)
3183 b = pos_before;
3184 else
3185 b = data->last_start_min;
b515a41d 3186
8aa23a47
YO
3187 l = 0;
3188 s = SvPV_const(data->last_found, l);
3189 old = b - data->last_start_min;
3190
3191#else
3192 I32 b = pos_before >= data->last_start_min
3193 ? pos_before : data->last_start_min;
3194 STRLEN l;
3195 const char * const s = SvPV_const(data->last_found, l);
3196 I32 old = b - data->last_start_min;
3197#endif
3198
3199 if (UTF)
3200 old = utf8_hop((U8*)s, old) - (U8*)s;
3201
3202 l -= old;
3203 /* Get the added string: */
3204 last_str = newSVpvn(s + old, l);
3205 if (UTF)
3206 SvUTF8_on(last_str);
3207 if (deltanext == 0 && pos_before == b) {
3208 /* What was added is a constant string */
3209 if (mincount > 1) {
3210 SvGROW(last_str, (mincount * l) + 1);
3211 repeatcpy(SvPVX(last_str) + l,
3212 SvPVX_const(last_str), l, mincount - 1);
3213 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3214 /* Add additional parts. */
3215 SvCUR_set(data->last_found,
3216 SvCUR(data->last_found) - l);
3217 sv_catsv(data->last_found, last_str);
3218 {
3219 SV * sv = data->last_found;
3220 MAGIC *mg =
3221 SvUTF8(sv) && SvMAGICAL(sv) ?
3222 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3223 if (mg && mg->mg_len >= 0)
3224 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3225 }
8aa23a47 3226 data->last_end += l * (mincount - 1);
b515a41d 3227 }
8aa23a47
YO
3228 } else {
3229 /* start offset must point into the last copy */
3230 data->last_start_min += minnext * (mincount - 1);
3231 data->last_start_max += is_inf ? I32_MAX
3232 : (maxcount - 1) * (minnext + data->pos_delta);
3233 }
c277df42 3234 }
8aa23a47
YO
3235 /* It is counted once already... */
3236 data->pos_min += minnext * (mincount - counted);
3237 data->pos_delta += - counted * deltanext +
3238 (minnext + deltanext) * maxcount - minnext * mincount;
3239 if (mincount != maxcount) {
3240 /* Cannot extend fixed substrings found inside
3241 the group. */
304ee84b 3242 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3243 if (mincount && last_str) {
3244 SV * const sv = data->last_found;
3245 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3246 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3247
3248 if (mg)
3249 mg->mg_len = -1;
3250 sv_setsv(sv, last_str);
3251 data->last_end = data->pos_min;
3252 data->last_start_min =
3253 data->pos_min - CHR_SVLEN(last_str);
3254 data->last_start_max = is_inf
3255 ? I32_MAX
3256 : data->pos_min + data->pos_delta
3257 - CHR_SVLEN(last_str);
3258 }
3259 data->longest = &(data->longest_float);
3260 }
3261 SvREFCNT_dec(last_str);
c277df42 3262 }
8aa23a47
YO
3263 if (data && (fl & SF_HAS_EVAL))
3264 data->flags |= SF_HAS_EVAL;
3265 optimize_curly_tail:
3266 if (OP(oscan) != CURLYX) {
3267 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3268 && NEXT_OFF(next))
3269 NEXT_OFF(oscan) += NEXT_OFF(next);
3270 }
3271 continue;
3272 default: /* REF and CLUMP only? */
3273 if (flags & SCF_DO_SUBSTR) {
304ee84b 3274 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3275 data->longest = &(data->longest_float);
3276 }
3277 is_inf = is_inf_internal = 1;
3278 if (flags & SCF_DO_STCLASS_OR)
3279 cl_anything(pRExC_state, data->start_class);
3280 flags &= ~SCF_DO_STCLASS;
3281 break;
c277df42 3282 }
8aa23a47
YO
3283 }
3284 else if (strchr((const char*)PL_simple,OP(scan))) {
3285 int value = 0;
653099ff 3286
8aa23a47 3287 if (flags & SCF_DO_SUBSTR) {
304ee84b 3288 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3289 data->pos_min++;
3290 }
3291 min++;
3292 if (flags & SCF_DO_STCLASS) {
3293 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3294
8aa23a47
YO
3295 /* Some of the logic below assumes that switching
3296 locale on will only add false positives. */
3297 switch (PL_regkind[OP(scan)]) {
3298 case SANY:
3299 default:
3300 do_default:
3301 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3302 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3303 cl_anything(pRExC_state, data->start_class);
3304 break;
3305 case REG_ANY:
3306 if (OP(scan) == SANY)
3307 goto do_default;
3308 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3309 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3310 || (data->start_class->flags & ANYOF_CLASS));
3311 cl_anything(pRExC_state, data->start_class);
653099ff 3312 }
8aa23a47
YO
3313 if (flags & SCF_DO_STCLASS_AND || !value)
3314 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3315 break;
3316 case ANYOF:
3317 if (flags & SCF_DO_STCLASS_AND)
3318 cl_and(data->start_class,
3319 (struct regnode_charclass_class*)scan);
653099ff 3320 else
8aa23a47
YO
3321 cl_or(pRExC_state, data->start_class,
3322 (struct regnode_charclass_class*)scan);
3323 break;
3324 case ALNUM:
3325 if (flags & SCF_DO_STCLASS_AND) {
3326 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3327 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3328 for (value = 0; value < 256; value++)
3329 if (!isALNUM(value))
3330 ANYOF_BITMAP_CLEAR(data->start_class, value);
3331 }
653099ff 3332 }
8aa23a47
YO
3333 else {
3334 if (data->start_class->flags & ANYOF_LOCALE)
3335 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3336 else {
3337 for (value = 0; value < 256; value++)
3338 if (isALNUM(value))
3339 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3340 }
8aa23a47
YO
3341 }
3342 break;
3343 case ALNUML:
3344 if (flags & SCF_DO_STCLASS_AND) {
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3347 }
3348 else {
3349 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3350 data->start_class->flags |= ANYOF_LOCALE;
3351 }
3352 break;
3353 case NALNUM:
3354 if (flags & SCF_DO_STCLASS_AND) {
3355 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3356 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3357 for (value = 0; value < 256; value++)
3358 if (isALNUM(value))
3359 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3360 }
3361 }
8aa23a47
YO
3362 else {
3363 if (data->start_class->flags & ANYOF_LOCALE)
3364 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3365 else {
3366 for (value = 0; value < 256; value++)
3367 if (!isALNUM(value))
3368 ANYOF_BITMAP_SET(data->start_class, value);
3369 }
653099ff 3370 }
8aa23a47
YO
3371 break;
3372 case NALNUML:
3373 if (flags & SCF_DO_STCLASS_AND) {
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3376 }
8aa23a47
YO
3377 else {
3378 data->start_class->flags |= ANYOF_LOCALE;
3379 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3380 }
3381 break;
3382 case SPACE:
3383 if (flags & SCF_DO_STCLASS_AND) {
3384 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3385 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3386 for (value = 0; value < 256; value++)
3387 if (!isSPACE(value))
3388 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3389 }
3390 }
8aa23a47
YO
3391 else {
3392 if (data->start_class->flags & ANYOF_LOCALE)
3393 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3394 else {
3395 for (value = 0; value < 256; value++)
3396 if (isSPACE(value))
3397 ANYOF_BITMAP_SET(data->start_class, value);
3398 }
653099ff 3399 }
8aa23a47
YO
3400 break;
3401 case SPACEL:
3402 if (flags & SCF_DO_STCLASS_AND) {
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3405 }
3406 else {
3407 data->start_class->flags |= ANYOF_LOCALE;
3408 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3409 }
3410 break;
3411 case NSPACE:
3412 if (flags & SCF_DO_STCLASS_AND) {
3413 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3415 for (value = 0; value < 256; value++)
3416 if (isSPACE(value))
3417 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3418 }
8aa23a47
YO
3419 }
3420 else {
3421 if (data->start_class->flags & ANYOF_LOCALE)
3422 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3423 else {
3424 for (value = 0; value < 256; value++)
3425 if (!isSPACE(value))
3426 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3427 }
3428 }
8aa23a47
YO
3429 break;
3430 case NSPACEL:
3431 if (flags & SCF_DO_STCLASS_AND) {
3432 if (data->start_class->flags & ANYOF_LOCALE) {
3433 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3434 for (value = 0; value < 256; value++)
3435 if (!isSPACE(value))
3436 ANYOF_BITMAP_CLEAR(data->start_class, value);
3437 }
653099ff 3438 }
8aa23a47
YO
3439 else {
3440 data->start_class->flags |= ANYOF_LOCALE;
3441 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3442 }
3443 break;
3444 case DIGIT:
3445 if (flags & SCF_DO_STCLASS_AND) {
3446 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3447 for (value = 0; value < 256; value++)
3448 if (!isDIGIT(value))
3449 ANYOF_BITMAP_CLEAR(data->start_class, value);
3450 }
3451 else {
3452 if (data->start_class->flags & ANYOF_LOCALE)
3453 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3454 else {
3455 for (value = 0; value < 256; value++)
3456 if (isDIGIT(value))
3457 ANYOF_BITMAP_SET(data->start_class, value);
3458 }
3459 }
3460 break;
3461 case NDIGIT:
3462 if (flags & SCF_DO_STCLASS_AND) {
3463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3464 for (value = 0; value < 256; value++)
3465 if (isDIGIT(value))
3466 ANYOF_BITMAP_CLEAR(data->start_class, value);
3467 }
3468 else {
3469 if (data->start_class->flags & ANYOF_LOCALE)
3470 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3471 else {
3472 for (value = 0; value < 256; value++)
3473 if (!isDIGIT(value))
3474 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3475 }
3476 }
8aa23a47
YO
3477 break;
3478 }
3479 if (flags & SCF_DO_STCLASS_OR)
3480 cl_and(data->start_class, and_withp);
3481 flags &= ~SCF_DO_STCLASS;
3482 }
3483 }
3484 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3485 data->flags |= (OP(scan) == MEOL
3486 ? SF_BEFORE_MEOL
3487 : SF_BEFORE_SEOL);
3488 }
3489 else if ( PL_regkind[OP(scan)] == BRANCHJ
3490 /* Lookbehind, or need to calculate parens/evals/stclass: */
3491 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3492 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3493 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3494 || OP(scan) == UNLESSM )
3495 {
3496 /* Negative Lookahead/lookbehind
3497 In this case we can't do fixed string optimisation.
3498 */
1de06328 3499
8aa23a47
YO
3500 I32 deltanext, minnext, fake = 0;
3501 regnode *nscan;
3502 struct regnode_charclass_class intrnl;
3503 int f = 0;
1de06328 3504
8aa23a47
YO
3505 data_fake.flags = 0;
3506 if (data) {
3507 data_fake.whilem_c = data->whilem_c;
3508 data_fake.last_closep = data->last_closep;
c277df42 3509 }
8aa23a47
YO
3510 else
3511 data_fake.last_closep = &fake;
58e23c8d 3512 data_fake.pos_delta = delta;
8aa23a47
YO
3513 if ( flags & SCF_DO_STCLASS && !scan->flags
3514 && OP(scan) == IFMATCH ) { /* Lookahead */
3515 cl_init(pRExC_state, &intrnl);
3516 data_fake.start_class = &intrnl;
3517 f |= SCF_DO_STCLASS_AND;
3518 }
3519 if (flags & SCF_WHILEM_VISITED_POS)
3520 f |= SCF_WHILEM_VISITED_POS;
3521 next = regnext(scan);
3522 nscan = NEXTOPER(NEXTOPER(scan));
3523 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3524 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3525 if (scan->flags) {
3526 if (deltanext) {
58e23c8d 3527 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3528 }
3529 else if (minnext > (I32)U8_MAX) {
58e23c8d 3530 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3531 }
3532 scan->flags = (U8)minnext;
3533 }
3534 if (data) {
3535 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3536 pars++;
3537 if (data_fake.flags & SF_HAS_EVAL)
3538 data->flags |= SF_HAS_EVAL;
3539 data->whilem_c = data_fake.whilem_c;
3540 }
3541 if (f & SCF_DO_STCLASS_AND) {
3542 const int was = (data->start_class->flags & ANYOF_EOS);
3543
3544 cl_and(data->start_class, &intrnl);
3545 if (was)
3546 data->start_class->flags |= ANYOF_EOS;
3547 }
cb434fcc 3548 }
8aa23a47
YO
3549#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3550 else {
3551 /* Positive Lookahead/lookbehind
3552 In this case we can do fixed string optimisation,
3553 but we must be careful about it. Note in the case of
3554 lookbehind the positions will be offset by the minimum
3555 length of the pattern, something we won't know about
3556 until after the recurse.
3557 */
3558 I32 deltanext, fake = 0;
3559 regnode *nscan;
3560 struct regnode_charclass_class intrnl;
3561 int f = 0;
3562 /* We use SAVEFREEPV so that when the full compile
3563 is finished perl will clean up the allocated
3564 minlens when its all done. This was we don't
3565 have to worry about freeing them when we know
3566 they wont be used, which would be a pain.
3567 */
3568 I32 *minnextp;
3569 Newx( minnextp, 1, I32 );
3570 SAVEFREEPV(minnextp);
3571
3572 if (data) {
3573 StructCopy(data, &data_fake, scan_data_t);
3574 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3575 f |= SCF_DO_SUBSTR;
3576 if (scan->flags)
304ee84b 3577 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3578 data_fake.last_found=newSVsv(data->last_found);
3579 }
3580 }
3581 else
3582 data_fake.last_closep = &fake;
3583 data_fake.flags = 0;
58e23c8d 3584 data_fake.pos_delta = delta;
8aa23a47
YO
3585 if (is_inf)
3586 data_fake.flags |= SF_IS_INF;
3587 if ( flags & SCF_DO_STCLASS && !scan->flags
3588 && OP(scan) == IFMATCH ) { /* Lookahead */
3589 cl_init(pRExC_state, &intrnl);
3590 data_fake.start_class = &intrnl;
3591 f |= SCF_DO_STCLASS_AND;
3592 }
3593 if (flags & SCF_WHILEM_VISITED_POS)
3594 f |= SCF_WHILEM_VISITED_POS;
3595 next = regnext(scan);
3596 nscan = NEXTOPER(NEXTOPER(scan));
3597
3598 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3599 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3600 if (scan->flags) {
3601 if (deltanext) {
58e23c8d 3602 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3603 }
3604 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3605 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3606 }
3607 scan->flags = (U8)*minnextp;
3608 }
3609
3610 *minnextp += min;
3611
3612 if (f & SCF_DO_STCLASS_AND) {
3613 const int was = (data->start_class->flags & ANYOF_EOS);
3614
3615 cl_and(data->start_class, &intrnl);
3616 if (was)
3617 data->start_class->flags |= ANYOF_EOS;
3618 }
3619 if (data) {
3620 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3621 pars++;
3622 if (data_fake.flags & SF_HAS_EVAL)
3623 data->flags |= SF_HAS_EVAL;
3624 data->whilem_c = data_fake.whilem_c;
3625 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3626 if (RExC_rx->minlen<*minnextp)
3627 RExC_rx->minlen=*minnextp;
304ee84b 3628 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3629 SvREFCNT_dec(data_fake.last_found);
3630
3631 if ( data_fake.minlen_fixed != minlenp )
3632 {
3633 data->offset_fixed= data_fake.offset_fixed;
3634 data->minlen_fixed= data_fake.minlen_fixed;
3635 data->lookbehind_fixed+= scan->flags;
3636 }
3637 if ( data_fake.minlen_float != minlenp )
3638 {
3639 data->minlen_float= data_fake.minlen_float;
3640 data->offset_float_min=data_fake.offset_float_min;
3641 data->offset_float_max=data_fake.offset_float_max;
3642 data->lookbehind_float+= scan->flags;
3643 }
3644 }
3645 }
3646
3647
40d049e4 3648 }
8aa23a47
YO
3649#endif
3650 }
3651 else if (OP(scan) == OPEN) {
3652 if (stopparen != (I32)ARG(scan))
3653 pars++;
3654 }
3655 else if (OP(scan) == CLOSE) {
3656 if (stopparen == (I32)ARG(scan)) {
3657 break;
3658 }
3659 if ((I32)ARG(scan) == is_par) {
3660 next = regnext(scan);
b515a41d 3661
8aa23a47
YO
3662 if ( next && (OP(next) != WHILEM) && next < last)
3663 is_par = 0; /* Disable optimization */
40d049e4 3664 }
8aa23a47
YO
3665 if (data)
3666 *(data->last_closep) = ARG(scan);
3667 }
3668 else if (OP(scan) == EVAL) {
c277df42
IZ
3669 if (data)
3670 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3671 }
3672 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3673 if (flags & SCF_DO_SUBSTR) {
304ee84b 3674 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3675 flags &= ~SCF_DO_SUBSTR;
40d049e4 3676 }
8aa23a47
YO
3677 if (data && OP(scan)==ACCEPT) {
3678 data->flags |= SCF_SEEN_ACCEPT;
3679 if (stopmin > min)
3680 stopmin = min;
e2e6a0f1 3681 }
8aa23a47
YO
3682 }
3683 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3684 {
0f5d15d6 3685 if (flags & SCF_DO_SUBSTR) {
304ee84b 3686 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3687 data->longest = &(data->longest_float);
3688 }
3689 is_inf = is_inf_internal = 1;
653099ff 3690 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3691 cl_anything(pRExC_state, data->start_class);
96776eda 3692 flags &= ~SCF_DO_STCLASS;
8aa23a47 3693 }
58e23c8d 3694 else if (OP(scan) == GPOS) {
bbe252da 3695 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3696 !(delta || is_inf || (data && data->pos_delta)))
3697 {
bbe252da
YO
3698 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3699 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3700 if (RExC_rx->gofs < (U32)min)
3701 RExC_rx->gofs = min;
3702 } else {
bbe252da 3703 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3704 RExC_rx->gofs = 0;
3705 }
3706 }
786e8c11 3707#ifdef TRIE_STUDY_OPT
40d049e4 3708#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3709 else if (PL_regkind[OP(scan)] == TRIE) {
3710 /* NOTE - There is similar code to this block above for handling
3711 BRANCH nodes on the initial study. If you change stuff here
3712 check there too. */
3713 regnode *trie_node= scan;
3714 regnode *tail= regnext(scan);
f8fc2ecf 3715 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3716 I32 max1 = 0, min1 = I32_MAX;
3717 struct regnode_charclass_class accum;
3718
3719 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3720 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3721 if (flags & SCF_DO_STCLASS)
3722 cl_init_zero(pRExC_state, &accum);
3723
3724 if (!trie->jump) {
3725 min1= trie->minlen;
3726 max1= trie->maxlen;
3727 } else {
3728 const regnode *nextbranch= NULL;
3729 U32 word;
3730
3731 for ( word=1 ; word <= trie->wordcount ; word++)
3732 {
3733 I32 deltanext=0, minnext=0, f = 0, fake;
3734 struct regnode_charclass_class this_class;
3735
3736 data_fake.flags = 0;
3737 if (data) {
3738 data_fake.whilem_c = data->whilem_c;
3739 data_fake.last_closep = data->last_closep;
3740 }
3741 else
3742 data_fake.last_closep = &fake;
58e23c8d 3743 data_fake.pos_delta = delta;
8aa23a47
YO
3744 if (flags & SCF_DO_STCLASS) {
3745 cl_init(pRExC_state, &this_class);
3746 data_fake.start_class = &this_class;
3747 f = SCF_DO_STCLASS_AND;
3748 }
3749 if (flags & SCF_WHILEM_VISITED_POS)
3750 f |= SCF_WHILEM_VISITED_POS;
3751
3752 if (trie->jump[word]) {
3753 if (!nextbranch)
3754 nextbranch = trie_node + trie->jump[0];
3755 scan= trie_node + trie->jump[word];
3756 /* We go from the jump point to the branch that follows
3757 it. Note this means we need the vestigal unused branches
3758 even though they arent otherwise used.
3759 */
3760 minnext = study_chunk(pRExC_state, &scan, minlenp,
3761 &deltanext, (regnode *)nextbranch, &data_fake,
3762 stopparen, recursed, NULL, f,depth+1);
3763 }
3764 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3765 nextbranch= regnext((regnode*)nextbranch);
3766
3767 if (min1 > (I32)(minnext + trie->minlen))
3768 min1 = minnext + trie->minlen;
3769 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3770 max1 = minnext + deltanext + trie->maxlen;
3771 if (deltanext == I32_MAX)
3772 is_inf = is_inf_internal = 1;
3773
3774 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3775 pars++;
3776 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3777 if ( stopmin > min + min1)
3778 stopmin = min + min1;
3779 flags &= ~SCF_DO_SUBSTR;
3780 if (data)
3781 data->flags |= SCF_SEEN_ACCEPT;
3782 }
3783 if (data) {
3784 if (data_fake.flags & SF_HAS_EVAL)
3785 data->flags |= SF_HAS_EVAL;
3786 data->whilem_c = data_fake.whilem_c;
3787 }
3788 if (flags & SCF_DO_STCLASS)
3789 cl_or(pRExC_state, &accum, &this_class);
3790 }
3791 }
3792 if (flags & SCF_DO_SUBSTR) {
3793 data->pos_min += min1;
3794 data->pos_delta += max1 - min1;
3795 if (max1 != min1 || is_inf)
3796 data->longest = &(data->longest_float);
3797 }
3798 min += min1;
3799 delta += max1 - min1;
3800 if (flags & SCF_DO_STCLASS_OR) {
3801 cl_or(pRExC_state, data->start_class, &accum);
3802 if (min1) {
3803 cl_and(data->start_class, and_withp);
3804 flags &= ~SCF_DO_STCLASS;
3805 }
3806 }
3807 else if (flags & SCF_DO_STCLASS_AND) {
3808 if (min1) {
3809 cl_and(data->start_class, &accum);
3810 flags &= ~SCF_DO_STCLASS;
3811 }
3812 else {
3813 /* Switch to OR mode: cache the old value of
3814 * data->start_class */
3815 INIT_AND_WITHP;
3816 StructCopy(data->start_class, and_withp,
3817 struct regnode_charclass_class);
3818 flags &= ~SCF_DO_STCLASS_AND;
3819 StructCopy(&accum, data->start_class,
3820 struct regnode_charclass_class);
3821 flags |= SCF_DO_STCLASS_OR;
3822 data->start_class->flags |= ANYOF_EOS;
3823 }
3824 }
3825 scan= tail;
3826 continue;
3827 }
786e8c11 3828#else
8aa23a47 3829 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3830 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3831 U8*bang=NULL;
3832
3833 min += trie->minlen;
3834 delta += (trie->maxlen - trie->minlen);
3835 flags &= ~SCF_DO_STCLASS; /* xxx */
3836 if (flags & SCF_DO_SUBSTR) {
304ee84b 3837 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3838 data->pos_min += trie->minlen;
3839 data->pos_delta += (trie->maxlen - trie->minlen);
3840 if (trie->maxlen != trie->minlen)
3841 data->longest = &(data->longest_float);
3842 }
3843 if (trie->jump) /* no more substrings -- for now /grr*/
3844 flags &= ~SCF_DO_SUBSTR;
b515a41d 3845 }
8aa23a47
YO
3846#endif /* old or new */
3847#endif /* TRIE_STUDY_OPT */
3848 /* Else: zero-length, ignore. */
3849 scan = regnext(scan);
3850 }
3851 if (frame) {
3852 last = frame->last;
3853 scan = frame->next;
3854 stopparen = frame->stop;
3855 frame = frame->prev;
3856 goto fake_study_recurse;
c277df42
IZ
3857 }
3858
3859 finish:
8aa23a47 3860 assert(!frame);
304ee84b 3861 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3862
c277df42 3863 *scanp = scan;
aca2d497 3864 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3865 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3866 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3867 if (is_par > (I32)U8_MAX)
c277df42
IZ
3868 is_par = 0;
3869 if (is_par && pars==1 && data) {
3870 data->flags |= SF_IN_PAR;
3871 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3872 }
3873 else if (pars && data) {
c277df42
IZ
3874 data->flags |= SF_HAS_PAR;
3875 data->flags &= ~SF_IN_PAR;
3876 }
653099ff 3877 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3878 cl_and(data->start_class, and_withp);
786e8c11
YO
3879 if (flags & SCF_TRIE_RESTUDY)
3880 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3881
304ee84b 3882 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3883
e2e6a0f1 3884 return min < stopmin ? min : stopmin;
c277df42
IZ
3885}
3886
2eccd3b2
NC
3887STATIC U32
3888S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3889{
4a4e7719
NC
3890 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3891
3892 Renewc(RExC_rxi->data,
3893 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3894 char, struct reg_data);
3895 if(count)
f8fc2ecf 3896 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3897 else
f8fc2ecf 3898 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3899 RExC_rxi->data->count = count + n;
3900 Copy(s, RExC_rxi->data->what + count, n, U8);
3901 return count;
c277df42
IZ
3902}
3903
f8149455 3904/*XXX: todo make this not included in a non debugging perl */
76234dfb 3905#ifndef PERL_IN_XSUB_RE
d88dccdf 3906void
864dbfa3 3907Perl_reginitcolors(pTHX)
d88dccdf 3908{
97aff369 3909 dVAR;
1df70142 3910 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3911 if (s) {
1df70142
AL
3912 char *t = savepv(s);
3913 int i = 0;
3914 PL_colors[0] = t;
d88dccdf 3915 while (++i < 6) {
1df70142
AL
3916 t = strchr(t, '\t');
3917 if (t) {
3918 *t = '\0';
3919 PL_colors[i] = ++t;
d88dccdf
IZ
3920 }
3921 else
1df70142 3922 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3923 }
3924 } else {
1df70142 3925 int i = 0;
b81d288d 3926 while (i < 6)
06b5626a 3927 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3928 }
3929 PL_colorset = 1;
3930}
76234dfb 3931#endif
8615cb43 3932
07be1b83 3933
786e8c11
YO
3934#ifdef TRIE_STUDY_OPT
3935#define CHECK_RESTUDY_GOTO \
3936 if ( \
3937 (data.flags & SCF_TRIE_RESTUDY) \
3938 && ! restudied++ \
3939 ) goto reStudy
3940#else
3941#define CHECK_RESTUDY_GOTO
3942#endif
f9f4320a 3943
a687059c 3944/*
e50aee73 3945 - pregcomp - compile a regular expression into internal code
a687059c
LW
3946 *
3947 * We can't allocate space until we know how big the compiled form will be,
3948 * but we can't compile it (and thus know how big it is) until we've got a
3949 * place to put the code. So we cheat: we compile it twice, once with code
3950 * generation turned off and size counting turned on, and once "for real".
3951 * This also means that we don't allocate space until we are sure that the
3952 * thing really will compile successfully, and we never have to move the
3953 * code and thus invalidate pointers into it. (Note that it has to be in
3954 * one piece because free() must be able to free it all.) [NB: not true in perl]
3955 *
3956 * Beware that the optimization-preparation code in here knows about some
3957 * of the structure of the compiled regexp. [I'll say.]
3958 */
b9b4dddf
YO
3959
3960
3961
f9f4320a 3962#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3963#define RE_ENGINE_PTR &PL_core_reg_engine
3964#else
f9f4320a
YO
3965extern const struct regexp_engine my_reg_engine;
3966#define RE_ENGINE_PTR &my_reg_engine
3967#endif
6d5c990f
RGS
3968
3969#ifndef PERL_IN_XSUB_RE
a687059c 3970regexp *
864dbfa3 3971Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3972{
97aff369 3973 dVAR;
6d5c990f 3974 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
3975 /* Dispatch a request to compile a regexp to correct
3976 regexp engine. */
f9f4320a
YO
3977 if (table) {
3978 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 3979 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 3980 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3981 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3982 DEBUG_COMPILE_r({
8d8756e7 3983 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3984 SvIV(*ptr));
3985 });
f2f78491 3986 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3987 }
b9b4dddf 3988 }
2a5d9b1d
RGS
3989 return Perl_re_compile(aTHX_ exp, xend, pm);
3990}
6d5c990f 3991#endif
2a5d9b1d
RGS
3992
3993regexp *
3994Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
3995{
3996 dVAR;
a0d0e21e 3997 register regexp *r;
f8fc2ecf 3998 register regexp_internal *ri;
c277df42 3999 regnode *scan;
c277df42 4000 regnode *first;
a0d0e21e 4001 I32 flags;
a0d0e21e
LW
4002 I32 minlen = 0;
4003 I32 sawplus = 0;
4004 I32 sawopen = 0;
2c2d71f5 4005 scan_data_t data;
830247a4 4006 RExC_state_t RExC_state;
be8e71aa 4007 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4008#ifdef TRIE_STUDY_OPT
4009 int restudied= 0;
4010 RExC_state_t copyRExC_state;
4011#endif
2a5d9b1d 4012 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4013 DEBUG_r(if (!PL_colorset) reginitcolors());
4014
a0d0e21e 4015 if (exp == NULL)
c277df42 4016 FAIL("NULL regexp argument");
a0d0e21e 4017
a5961de5 4018 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4019
5cfc7842 4020 RExC_precomp = exp;
a3621e74 4021 DEBUG_COMPILE_r({
ab3bbdeb
YO
4022 SV *dsv= sv_newmortal();
4023 RE_PV_QUOTED_DECL(s, RExC_utf8,
4024 dsv, RExC_precomp, (xend - exp), 60);
4025 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4026 PL_colors[4],PL_colors[5],s);
a5961de5 4027 });
e2509266 4028 RExC_flags = pm->op_pmflags;
830247a4 4029 RExC_sawback = 0;
bbce6d69 4030
830247a4
IZ
4031 RExC_seen = 0;
4032 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4033 RExC_seen_evals = 0;
4034 RExC_extralen = 0;
c277df42 4035
bbce6d69 4036 /* First pass: determine size, legality. */
830247a4 4037 RExC_parse = exp;
fac92740 4038 RExC_start = exp;
830247a4
IZ
4039 RExC_end = xend;
4040 RExC_naughty = 0;
4041 RExC_npar = 1;
e2e6a0f1 4042 RExC_nestroot = 0;
830247a4
IZ
4043 RExC_size = 0L;
4044 RExC_emit = &PL_regdummy;
4045 RExC_whilem_seen = 0;
fc8cd66c 4046 RExC_charnames = NULL;
40d049e4
YO
4047 RExC_open_parens = NULL;
4048 RExC_close_parens = NULL;
4049 RExC_opend = NULL;
81714fb9 4050 RExC_paren_names = NULL;
40d049e4
YO
4051 RExC_recurse = NULL;
4052 RExC_recurse_count = 0;
81714fb9 4053
85ddcde9
JH
4054#if 0 /* REGC() is (currently) a NOP at the first pass.
4055 * Clever compilers notice this and complain. --jhi */
830247a4 4056 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4057#endif
3dab1dad
YO
4058 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4059 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4060 RExC_precomp = NULL;
a0d0e21e
LW
4061 return(NULL);
4062 }
07be1b83 4063 DEBUG_PARSE_r({
81714fb9
YO
4064 PerlIO_printf(Perl_debug_log,
4065 "Required size %"IVdf" nodes\n"
4066 "Starting second pass (creation)\n",
4067 (IV)RExC_size);
07be1b83
YO
4068 RExC_lastnum=0;
4069 RExC_lastparse=NULL;
4070 });
c277df42
IZ
4071 /* Small enough for pointer-storage convention?
4072 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4073 if (RExC_size >= 0x10000L && RExC_extralen)
4074 RExC_size += RExC_extralen;
c277df42 4075 else
830247a4
IZ
4076 RExC_extralen = 0;
4077 if (RExC_whilem_seen > 15)
4078 RExC_whilem_seen = 15;
a0d0e21e 4079
e2e6a0f1
YO
4080#ifdef DEBUGGING
4081 /* Make room for a sentinel value at the end of the program */
4082 RExC_size++;
4083#endif
4084
f9f4320a
YO
4085 /* Allocate space and zero-initialize. Note, the two step process
4086 of zeroing when in debug mode, thus anything assigned has to
4087 happen after that */
f8fc2ecf
YO
4088 Newxz(r, 1, regexp);
4089 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4090 char, regexp_internal);
4091 if ( r == NULL || ri == NULL )
b45f050a 4092 FAIL("Regexp out of space");
0f79a09d
GS
4093#ifdef DEBUGGING
4094 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4095 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4096#else
f8fc2ecf
YO
4097 /* bulk initialize base fields with 0. */
4098 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4099#endif
58e23c8d
YO
4100
4101 /* non-zero initialization begins here */
f8fc2ecf 4102 RXi_SET( r, ri );
f9f4320a 4103 r->engine= RE_ENGINE_PTR;
c277df42 4104 r->refcnt = 1;
bbce6d69 4105 r->prelen = xend - exp;
5cfc7842 4106 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4107 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4108 r->intflags = 0;
830247a4 4109 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4110
6bda09f9 4111 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4112 Newxz(RExC_open_parens, RExC_npar,regnode *);
4113 SAVEFREEPV(RExC_open_parens);
4114 Newxz(RExC_close_parens,RExC_npar,regnode *);
4115 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4116 }
4117
4118 /* Useful during FAIL. */
f8fc2ecf
YO
4119 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4120 if (ri->offsets) {
4121 ri->offsets[0] = RExC_size;
fac92740 4122 }
a3621e74 4123 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4124 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4125 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4126 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4127
830247a4 4128 RExC_rx = r;
f8fc2ecf 4129 RExC_rxi = ri;
bbce6d69 4130
4131 /* Second pass: emit code. */
e2509266 4132 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4133 RExC_parse = exp;
4134 RExC_end = xend;
4135 RExC_naughty = 0;
4136 RExC_npar = 1;
f8fc2ecf
YO
4137 RExC_emit_start = ri->program;
4138 RExC_emit = ri->program;
e2e6a0f1
YO
4139#ifdef DEBUGGING
4140 /* put a sentinal on the end of the program so we can check for
4141 overwrites */
f8fc2ecf 4142 ri->program[RExC_size].type = 255;
e2e6a0f1 4143#endif
2cd61cdb 4144 /* Store the count of eval-groups for security checks: */
f8149455 4145 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4146 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4147 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4148 return(NULL);
6bda09f9 4149
07be1b83
YO
4150 /* XXXX To minimize changes to RE engine we always allocate
4151 3-units-long substrs field. */
4152 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4153 if (RExC_recurse_count) {
4154 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4155 SAVEFREEPV(RExC_recurse);
4156 }
a0d0e21e 4157
07be1b83 4158reStudy:
1de06328 4159 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4160 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4161
07be1b83
YO
4162#ifdef TRIE_STUDY_OPT
4163 if ( restudied ) {
5d458dd8 4164 U32 seen=RExC_seen;
07be1b83 4165 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4166
4167 RExC_state = copyRExC_state;
4168 if (seen & REG_TOP_LEVEL_BRANCHES)
4169 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4170 else
4171 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4172 if (data.last_found) {
07be1b83 4173 SvREFCNT_dec(data.longest_fixed);
07be1b83 4174 SvREFCNT_dec(data.longest_float);
07be1b83 4175 SvREFCNT_dec(data.last_found);
1de06328 4176 }
40d049e4 4177 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4178 } else {
40d049e4 4179 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4180 copyRExC_state = RExC_state;
07be1b83 4181 }
40d049e4
YO
4182#else
4183 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4184#endif
fc8cd66c 4185
a0d0e21e 4186 /* Dig out information for optimizations. */
bbe252da 4187 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4188 pm->op_pmflags = RExC_flags;
a0ed51b3 4189 if (UTF)
bbe252da 4190 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4191 ri->regstclass = NULL;
830247a4 4192 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4193 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4194 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4195
1de06328
YO
4196 /* testing for BRANCH here tells us whether there is "must appear"
4197 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4198 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4199 I32 fake;
c5254dd6 4200 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4201 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4202 int stclass_flag;
07be1b83 4203 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4204
4205 first = scan;
c277df42 4206 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4207 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4208 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4209 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4210 /* for now we can't handle lookbehind IFMATCH*/
4211 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4212 (OP(first) == PLUS) ||
4213 (OP(first) == MINMOD) ||
653099ff 4214 /* An {n,m} with n>0 */
07be1b83
YO
4215 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4216 {
786e8c11 4217
a0d0e21e
LW
4218 if (OP(first) == PLUS)
4219 sawplus = 1;
4220 else
3dab1dad 4221 first += regarglen[OP(first)];
07be1b83
YO
4222 if (OP(first) == IFMATCH) {
4223 first = NEXTOPER(first);
4224 first += EXTRA_STEP_2ARGS;
7c167cea 4225 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4226 first = NEXTOPER(first);
a687059c
LW
4227 }
4228
a0d0e21e
LW
4229 /* Starting-point info. */
4230 again:
786e8c11 4231 DEBUG_PEEP("first:",first,0);
07be1b83 4232 /* Ignore EXACT as we deal with it later. */
3dab1dad 4233 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4234 if (OP(first) == EXACT)
6f207bd3 4235 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4236 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4237 ri->regstclass = first;
b3c9acc1 4238 }
07be1b83 4239#ifdef TRIE_STCLASS
786e8c11 4240 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4241 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4242 {
786e8c11 4243 regnode *trie_op;
07be1b83 4244 /* this can happen only on restudy */
786e8c11 4245 if ( OP(first) == TRIE ) {
446bd890
NC
4246 struct regnode_1 *trieop =
4247 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4248 StructCopy(first,trieop,struct regnode_1);
4249 trie_op=(regnode *)trieop;
4250 } else {
446bd890
NC
4251 struct regnode_charclass *trieop =
4252 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4253 StructCopy(first,trieop,struct regnode_charclass);
4254 trie_op=(regnode *)trieop;
4255 }
1de06328 4256 OP(trie_op)+=2;
786e8c11 4257 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4258 ri->regstclass = trie_op;
07be1b83
YO
4259 }
4260#endif
bfed75c6 4261 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4262 ri->regstclass = first;
3dab1dad
YO
4263 else if (PL_regkind[OP(first)] == BOUND ||
4264 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4265 ri->regstclass = first;
3dab1dad 4266 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4267 r->extflags |= (OP(first) == MBOL
4268 ? RXf_ANCH_MBOL
cad2e5aa 4269 : (OP(first) == SBOL
bbe252da
YO
4270 ? RXf_ANCH_SBOL
4271 : RXf_ANCH_BOL));
a0d0e21e 4272 first = NEXTOPER(first);
774d564b 4273 goto again;
4274 }
4275 else if (OP(first) == GPOS) {
bbe252da 4276 r->extflags |= RXf_ANCH_GPOS;
774d564b 4277 first = NEXTOPER(first);
4278 goto again;
a0d0e21e 4279 }
cf2a2b69
YO
4280 else if ((!sawopen || !RExC_sawback) &&
4281 (OP(first) == STAR &&
3dab1dad 4282 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4283 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4284 {
4285 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4286 const int type =
4287 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4288 ? RXf_ANCH_MBOL
4289 : RXf_ANCH_SBOL;
4290 r->extflags |= type;
4291 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4292 first = NEXTOPER(first);
774d564b 4293 goto again;
a0d0e21e 4294 }
b81d288d 4295 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4296 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4297 /* x+ must match at the 1st pos of run of x's */
bbe252da 4298 r->intflags |= PREGf_SKIP;
a0d0e21e 4299
c277df42 4300 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4301#ifdef TRIE_STUDY_OPT
81714fb9 4302 DEBUG_PARSE_r(
be8e71aa
YO
4303 if (!restudied)
4304 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4305 (IV)(first - scan + 1))
4306 );
4307#else
81714fb9 4308 DEBUG_PARSE_r(
be8e71aa
YO
4309 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4310 (IV)(first - scan + 1))
4311 );
4312#endif
4313
4314
a0d0e21e
LW
4315 /*
4316 * If there's something expensive in the r.e., find the
4317 * longest literal string that must appear and make it the
4318 * regmust. Resolve ties in favor of later strings, since
4319 * the regstart check works with the beginning of the r.e.
4320 * and avoiding duplication strengthens checking. Not a
4321 * strong reason, but sufficient in the absence of others.
4322 * [Now we resolve ties in favor of the earlier string if
c277df42 4323 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4324 * earlier string may buy us something the later one won't.]
4325 */
de8c5301 4326
396482e1
GA
4327 data.longest_fixed = newSVpvs("");
4328 data.longest_float = newSVpvs("");
4329 data.last_found = newSVpvs("");
c277df42
IZ
4330 data.longest = &(data.longest_fixed);
4331 first = scan;
f8fc2ecf 4332 if (!ri->regstclass) {
830247a4 4333 cl_init(pRExC_state, &ch_class);
653099ff
GS
4334 data.start_class = &ch_class;
4335 stclass_flag = SCF_DO_STCLASS_AND;
4336 } else /* XXXX Check for BOUND? */
4337 stclass_flag = 0;
cb434fcc 4338 data.last_closep = &last_close;
de8c5301 4339
1de06328 4340 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4341 &data, -1, NULL, NULL,
4342 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4343
07be1b83 4344
786e8c11
YO
4345 CHECK_RESTUDY_GOTO;
4346
4347
830247a4 4348 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4349 && data.last_start_min == 0 && data.last_end > 0
830247a4 4350 && !RExC_seen_zerolen
2bf803e2 4351 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4352 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4353 r->extflags |= RXf_CHECK_ALL;
304ee84b 4354 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4355 SvREFCNT_dec(data.last_found);
4356
1de06328
YO
4357 /* Note that code very similar to this but for anchored string
4358 follows immediately below, changes may need to be made to both.
4359 Be careful.
4360 */
a0ed51b3 4361 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4362 if (longest_float_length
c277df42
IZ
4363 || (data.flags & SF_FL_BEFORE_EOL
4364 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4365 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4366 {
1182767e 4367 I32 t,ml;
cf93c79d 4368
1de06328 4369 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4370 && data.offset_fixed == data.offset_float_min
4371 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4372 goto remove_float; /* As in (a)+. */
4373
1de06328
YO
4374 /* copy the information about the longest float from the reg_scan_data
4375 over to the program. */
33b8afdf
JH
4376 if (SvUTF8(data.longest_float)) {
4377 r->float_utf8 = data.longest_float;
c445ea15 4378 r->float_substr = NULL;
33b8afdf
JH
4379 } else {
4380 r->float_substr = data.longest_float;
c445ea15 4381 r->float_utf8 = NULL;
33b8afdf 4382 }
1de06328
YO
4383 /* float_end_shift is how many chars that must be matched that
4384 follow this item. We calculate it ahead of time as once the
4385 lookbehind offset is added in we lose the ability to correctly
4386 calculate it.*/
4387 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4388 : (I32)longest_float_length;
1de06328
YO
4389 r->float_end_shift = ml - data.offset_float_min
4390 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4391 + data.lookbehind_float;
4392 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4393 r->float_max_offset = data.offset_float_max;
1182767e 4394 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4395 r->float_max_offset -= data.lookbehind_float;
4396
cf93c79d
IZ
4397 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4398 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4399 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4400 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4401 }
4402 else {
aca2d497 4403 remove_float:
c445ea15 4404 r->float_substr = r->float_utf8 = NULL;
c277df42 4405 SvREFCNT_dec(data.longest_float);
c5254dd6 4406 longest_float_length = 0;
a0d0e21e 4407 }
c277df42 4408
1de06328
YO
4409 /* Note that code very similar to this but for floating string
4410 is immediately above, changes may need to be made to both.
4411 Be careful.
4412 */
a0ed51b3 4413 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4414 if (longest_fixed_length
c277df42
IZ
4415 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4416 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4417 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4418 {
1182767e 4419 I32 t,ml;
cf93c79d 4420
1de06328
YO
4421 /* copy the information about the longest fixed
4422 from the reg_scan_data over to the program. */
33b8afdf
JH
4423 if (SvUTF8(data.longest_fixed)) {
4424 r->anchored_utf8 = data.longest_fixed;
c445ea15 4425 r->anchored_substr = NULL;
33b8afdf
JH
4426 } else {
4427 r->anchored_substr = data.longest_fixed;
c445ea15 4428 r->anchored_utf8 = NULL;
33b8afdf 4429 }
1de06328
YO
4430 /* fixed_end_shift is how many chars that must be matched that
4431 follow this item. We calculate it ahead of time as once the
4432 lookbehind offset is added in we lose the ability to correctly
4433 calculate it.*/
4434 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4435 : (I32)longest_fixed_length;
1de06328
YO
4436 r->anchored_end_shift = ml - data.offset_fixed
4437 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4438 + data.lookbehind_fixed;
4439 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4440
cf93c79d
IZ
4441 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4442 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4443 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4444 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4445 }
4446 else {
c445ea15 4447 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4448 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4449 longest_fixed_length = 0;
a0d0e21e 4450 }
f8fc2ecf
YO
4451 if (ri->regstclass
4452 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4453 ri->regstclass = NULL;
33b8afdf
JH
4454 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4455 && stclass_flag
653099ff 4456 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4457 && !cl_is_anything(data.start_class))
4458 {
2eccd3b2 4459 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4460
f8fc2ecf 4461 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4462 struct regnode_charclass_class);
4463 StructCopy(data.start_class,
f8fc2ecf 4464 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4465 struct regnode_charclass_class);
f8fc2ecf 4466 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4467 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4468 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4469 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4470 PerlIO_printf(Perl_debug_log,
a0288114 4471 "synthetic stclass \"%s\".\n",
3f7c398e 4472 SvPVX_const(sv));});
653099ff 4473 }
c277df42
IZ
4474
4475 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4476 if (longest_fixed_length > longest_float_length) {
1de06328 4477 r->check_end_shift = r->anchored_end_shift;
c277df42 4478 r->check_substr = r->anchored_substr;
33b8afdf 4479 r->check_utf8 = r->anchored_utf8;
c277df42 4480 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4481 if (r->extflags & RXf_ANCH_SINGLE)
4482 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4483 }
4484 else {
1de06328 4485 r->check_end_shift = r->float_end_shift;
c277df42 4486 r->check_substr = r->float_substr;
33b8afdf 4487 r->check_utf8 = r->float_utf8;
1de06328
YO
4488 r->check_offset_min = r->float_min_offset;
4489 r->check_offset_max = r->float_max_offset;
a0d0e21e 4490 }
30382c73
IZ
4491 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4492 This should be changed ASAP! */
bbe252da
YO
4493 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4494 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4495 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4496 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4497 }
1de06328
YO
4498 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4499 if ( (STRLEN)minlen < longest_float_length )
4500 minlen= longest_float_length;
4501 if ( (STRLEN)minlen < longest_fixed_length )
4502 minlen= longest_fixed_length;
4503 */
a0ed51b3
LW
4504 }
4505 else {
c277df42
IZ
4506 /* Several toplevels. Best we can is to set minlen. */
4507 I32 fake;
653099ff 4508 struct regnode_charclass_class ch_class;
cb434fcc 4509 I32 last_close = 0;
c277df42 4510
5d458dd8 4511 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4512
f8fc2ecf 4513 scan = ri->program + 1;
830247a4 4514 cl_init(pRExC_state, &ch_class);
653099ff 4515 data.start_class = &ch_class;
cb434fcc 4516 data.last_closep = &last_close;
07be1b83 4517
de8c5301 4518
1de06328 4519 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4520 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4521
786e8c11 4522 CHECK_RESTUDY_GOTO;
07be1b83 4523
33b8afdf 4524 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4525 = r->float_substr = r->float_utf8 = NULL;
653099ff 4526 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4527 && !cl_is_anything(data.start_class))
4528 {
2eccd3b2 4529 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4530
f8fc2ecf 4531 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4532 struct regnode_charclass_class);
4533 StructCopy(data.start_class,
f8fc2ecf 4534 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4535 struct regnode_charclass_class);
f8fc2ecf 4536 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4537 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4538 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4539 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4540 PerlIO_printf(Perl_debug_log,
a0288114 4541 "synthetic stclass \"%s\".\n",
3f7c398e 4542 SvPVX_const(sv));});
653099ff 4543 }
a0d0e21e
LW
4544 }
4545
1de06328
YO
4546 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4547 the "real" pattern. */
cf9788e3
RGS
4548 DEBUG_OPTIMISE_r({
4549 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4550 (IV)minlen, (IV)r->minlen);
cf9788e3 4551 });
de8c5301 4552 r->minlenret = minlen;
1de06328
YO
4553 if (r->minlen < minlen)
4554 r->minlen = minlen;
4555
b81d288d 4556 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4557 r->extflags |= RXf_GPOS_SEEN;
830247a4 4558 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4559 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4560 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4561 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4562 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4563 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4564 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4565 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4566 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4567 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4568 if (RExC_paren_names)
4569 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4570 else
4571 r->paren_names = NULL;
4572
40d049e4
YO
4573 if (RExC_recurse_count) {
4574 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4575 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4576 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4577 }
4578 }
a02a5408
JC
4579 Newxz(r->startp, RExC_npar, I32);
4580 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4581 /* assume we don't need to swap parens around before we match */
4582
be8e71aa
YO
4583 DEBUG_DUMP_r({
4584 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4585 regdump(r);
4586 });
f8fc2ecf
YO
4587 DEBUG_OFFSETS_r(if (ri->offsets) {
4588 const U32 len = ri->offsets[0];
8e9a8a48
YO
4589 U32 i;
4590 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4591 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4592 for (i = 1; i <= len; i++) {
f8fc2ecf 4593 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4594 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4595 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4596 }
4597 PerlIO_printf(Perl_debug_log, "\n");
4598 });
a0d0e21e 4599 return(r);
a687059c
LW
4600}
4601
f9f4320a 4602#undef CORE_ONLY_BLOCK
f9f4320a 4603#undef RE_ENGINE_PTR
3dab1dad 4604
9af228c6 4605#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4606SV*
4607Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4608{
4609 I32 parno = 0; /* no match */
4610 if (PL_curpm) {
4611 const REGEXP * const rx = PM_GETRE(PL_curpm);
4612 if (rx && rx->paren_names) {
4613 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4614 if (he_str) {
4615 IV i;
4616 SV* sv_dat=HeVAL(he_str);
4617 I32 *nums=(I32*)SvPVX(sv_dat);
4618 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4619 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4620 rx->endp[nums[i]] != -1)
4621 {
4622 parno = nums[i];
4623 break;
4624 }
4625 }
4626 }
4627 }
4628 }
4629 if ( !parno ) {
4630 return 0;
4631 } else {
4632 GV *gv_paren;
4633 SV *sv= sv_newmortal();
4634 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4635 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4636 return GvSVn(gv_paren);
4637 }
4638}
9af228c6 4639#endif
0a4db386 4640
894be9b7 4641/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4642 * If flags is REG_RSN_RETURN_NULL returns null.
4643 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4644 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4645 * to the parsed name as looked up in the RExC_paren_names hash.
4646 * If there is an error throws a vFAIL().. type exception.
894be9b7 4647 */
0a4db386
YO
4648
4649#define REG_RSN_RETURN_NULL 0
4650#define REG_RSN_RETURN_NAME 1
4651#define REG_RSN_RETURN_DATA 2
4652
894be9b7
YO
4653STATIC SV*
4654S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4655 char *name_start = RExC_parse;
0a4db386 4656 if ( UTF ) {
894be9b7 4657 STRLEN numlen;
0a4db386
YO
4658 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4659 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4660 {
4661 RExC_parse += numlen;
4662 }
4663 } else {
4664 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4665 RExC_parse++;
4666 }
0a4db386
YO
4667 if ( flags ) {
4668 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4669 (int)(RExC_parse - name_start)));
894be9b7 4670 if (UTF)
0a4db386
YO
4671 SvUTF8_on(sv_name);
4672 if ( flags == REG_RSN_RETURN_NAME)
4673 return sv_name;
4674 else if (flags==REG_RSN_RETURN_DATA) {
4675 HE *he_str = NULL;
4676 SV *sv_dat = NULL;
4677 if ( ! sv_name ) /* should not happen*/
4678 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4679 if (RExC_paren_names)
4680 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4681 if ( he_str )
4682 sv_dat = HeVAL(he_str);
4683 if ( ! sv_dat )
4684 vFAIL("Reference to nonexistent named group");
4685 return sv_dat;
4686 }
4687 else {
4688 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4689 }
4690 /* NOT REACHED */
894be9b7 4691 }
0a4db386 4692 return NULL;
894be9b7
YO
4693}
4694
3dab1dad
YO
4695#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4696 int rem=(int)(RExC_end - RExC_parse); \
4697 int cut; \
4698 int num; \
4699 int iscut=0; \
4700 if (rem>10) { \
4701 rem=10; \
4702 iscut=1; \
4703 } \
4704 cut=10-rem; \
4705 if (RExC_lastparse!=RExC_parse) \
4706 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4707 rem, RExC_parse, \
4708 cut + 4, \
4709 iscut ? "..." : "<" \
4710 ); \
4711 else \
4712 PerlIO_printf(Perl_debug_log,"%16s",""); \
4713 \
4714 if (SIZE_ONLY) \
4715 num=RExC_size; \
4716 else \
4717 num=REG_NODE_NUM(RExC_emit); \
4718 if (RExC_lastnum!=num) \
0a4db386 4719 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4720 else \
0a4db386 4721 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4722 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4723 (int)((depth*2)), "", \
3dab1dad
YO
4724 (funcname) \
4725 ); \
4726 RExC_lastnum=num; \
4727 RExC_lastparse=RExC_parse; \
4728})
4729
07be1b83
YO
4730
4731
3dab1dad
YO
4732#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4733 DEBUG_PARSE_MSG((funcname)); \
4734 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4735})
6bda09f9
YO
4736#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4737 DEBUG_PARSE_MSG((funcname)); \
4738 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4739})
a687059c
LW
4740/*
4741 - reg - regular expression, i.e. main body or parenthesized thing
4742 *
4743 * Caller must absorb opening parenthesis.
4744 *
4745 * Combining parenthesis handling with the base level of regular expression
4746 * is a trifle forced, but the need to tie the tails of the branches to what
4747 * follows makes it hard to avoid.
4748 */
07be1b83
YO
4749#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4750#ifdef DEBUGGING
4751#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4752#else
4753#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4754#endif
3dab1dad 4755
e2e6a0f1
YO
4756/* this idea is borrowed from STR_WITH_LEN in handy.h */
4757#define CHECK_WORD(s,v,l) \
4758 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4759
76e3520e 4760STATIC regnode *
3dab1dad 4761S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4762 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4763{
27da23d5 4764 dVAR;
c277df42
IZ
4765 register regnode *ret; /* Will be the head of the group. */
4766 register regnode *br;
4767 register regnode *lastbr;
cbbf8932 4768 register regnode *ender = NULL;
a0d0e21e 4769 register I32 parno = 0;
cbbf8932
AL
4770 I32 flags;
4771 const I32 oregflags = RExC_flags;
6136c704
AL
4772 bool have_branch = 0;
4773 bool is_open = 0;
9d1d55b5
JP
4774
4775 /* for (?g), (?gc), and (?o) warnings; warning
4776 about (?c) will warn about (?g) -- japhy */
4777
6136c704
AL
4778#define WASTED_O 0x01
4779#define WASTED_G 0x02
4780#define WASTED_C 0x04
4781#define WASTED_GC (0x02|0x04)
cbbf8932 4782 I32 wastedflags = 0x00;
9d1d55b5 4783
fac92740 4784 char * parse_start = RExC_parse; /* MJD */
a28509cc 4785 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4786
3dab1dad
YO
4787 GET_RE_DEBUG_FLAGS_DECL;
4788 DEBUG_PARSE("reg ");
4789
4790
821b33a5 4791 *flagp = 0; /* Tentatively. */
a0d0e21e 4792
9d1d55b5 4793
a0d0e21e
LW
4794 /* Make an OPEN node, if parenthesized. */
4795 if (paren) {
e2e6a0f1
YO
4796 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4797 char *start_verb = RExC_parse;
4798 STRLEN verb_len = 0;
4799 char *start_arg = NULL;
4800 unsigned char op = 0;
4801 int argok = 1;
4802 int internal_argval = 0; /* internal_argval is only useful if !argok */
4803 while ( *RExC_parse && *RExC_parse != ')' ) {
4804 if ( *RExC_parse == ':' ) {
4805 start_arg = RExC_parse + 1;
4806 break;
4807 }
4808 RExC_parse++;
4809 }
4810 ++start_verb;
4811 verb_len = RExC_parse - start_verb;
4812 if ( start_arg ) {
4813 RExC_parse++;
4814 while ( *RExC_parse && *RExC_parse != ')' )
4815 RExC_parse++;
4816 if ( *RExC_parse != ')' )
4817 vFAIL("Unterminated verb pattern argument");
4818 if ( RExC_parse == start_arg )
4819 start_arg = NULL;
4820 } else {
4821 if ( *RExC_parse != ')' )
4822 vFAIL("Unterminated verb pattern");
4823 }
5d458dd8 4824
e2e6a0f1
YO
4825 switch ( *start_verb ) {
4826 case 'A': /* (*ACCEPT) */
4827 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4828 op = ACCEPT;
4829 internal_argval = RExC_nestroot;
4830 }
4831 break;
4832 case 'C': /* (*COMMIT) */
4833 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4834 op = COMMIT;
e2e6a0f1
YO
4835 break;
4836 case 'F': /* (*FAIL) */
4837 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4838 op = OPFAIL;
4839 argok = 0;
4840 }
4841 break;
5d458dd8
YO
4842 case ':': /* (*:NAME) */
4843 case 'M': /* (*MARK:NAME) */
4844 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4845 op = MARKPOINT;
5d458dd8
YO
4846 argok = -1;
4847 }
4848 break;
4849 case 'P': /* (*PRUNE) */
4850 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4851 op = PRUNE;
e2e6a0f1 4852 break;
5d458dd8
YO
4853 case 'S': /* (*SKIP) */
4854 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4855 op = SKIP;
4856 break;
4857 case 'T': /* (*THEN) */
4858 /* [19:06] <TimToady> :: is then */
4859 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4860 op = CUTGROUP;
4861 RExC_seen |= REG_SEEN_CUTGROUP;
4862 }
e2e6a0f1
YO
4863 break;
4864 }
4865 if ( ! op ) {
4866 RExC_parse++;
4867 vFAIL3("Unknown verb pattern '%.*s'",
4868 verb_len, start_verb);
4869 }
4870 if ( argok ) {
4871 if ( start_arg && internal_argval ) {
4872 vFAIL3("Verb pattern '%.*s' may not have an argument",
4873 verb_len, start_verb);
4874 } else if ( argok < 0 && !start_arg ) {
4875 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4876 verb_len, start_verb);
4877 } else {
4878 ret = reganode(pRExC_state, op, internal_argval);
4879 if ( ! internal_argval && ! SIZE_ONLY ) {
4880 if (start_arg) {
4881 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4882 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4883 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4884 ret->flags = 0;
4885 } else {
4886 ret->flags = 1;
4887 }
4888 }
4889 }
4890 if (!internal_argval)
4891 RExC_seen |= REG_SEEN_VERBARG;
4892 } else if ( start_arg ) {
4893 vFAIL3("Verb pattern '%.*s' may not have an argument",
4894 verb_len, start_verb);
4895 } else {
4896 ret = reg_node(pRExC_state, op);
4897 }
4898 nextchar(pRExC_state);
4899 return ret;
4900 } else
fac92740 4901 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4902 U32 posflags = 0, negflags = 0;
4903 U32 *flagsp = &posflags;
6136c704 4904 bool is_logical = 0;
a28509cc 4905 const char * const seqstart = RExC_parse;
ca9dfc88 4906
830247a4
IZ
4907 RExC_parse++;
4908 paren = *RExC_parse++;
c277df42 4909 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4910 switch (paren) {
894be9b7 4911
fac92740 4912 case '<': /* (?<...) */
b81d288d 4913 if (*RExC_parse == '!')
c277df42 4914 paren = ',';
0a4db386
YO
4915 else if (*RExC_parse != '=')
4916 { /* (?<...>) */
81714fb9 4917 char *name_start;
894be9b7 4918 SV *svname;
81714fb9
YO
4919 paren= '>';
4920 case '\'': /* (?'...') */
4921 name_start= RExC_parse;
0a4db386
YO
4922 svname = reg_scan_name(pRExC_state,
4923 SIZE_ONLY ? /* reverse test from the others */
4924 REG_RSN_RETURN_NAME :
4925 REG_RSN_RETURN_NULL);
81714fb9
YO
4926 if (RExC_parse == name_start)
4927 goto unknown;
4928 if (*RExC_parse != paren)
4929 vFAIL2("Sequence (?%c... not terminated",
4930 paren=='>' ? '<' : paren);
4931 if (SIZE_ONLY) {
e62cc96a
YO
4932 HE *he_str;
4933 SV *sv_dat = NULL;
894be9b7
YO
4934 if (!svname) /* shouldnt happen */
4935 Perl_croak(aTHX_
4936 "panic: reg_scan_name returned NULL");
81714fb9
YO
4937 if (!RExC_paren_names) {
4938 RExC_paren_names= newHV();
4939 sv_2mortal((SV*)RExC_paren_names);
4940 }
4941 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4942 if ( he_str )
81714fb9 4943 sv_dat = HeVAL(he_str);
e62cc96a 4944 if ( ! sv_dat ) {
81714fb9 4945 /* croak baby croak */
e62cc96a
YO
4946 Perl_croak(aTHX_
4947 "panic: paren_name hash element allocation failed");
4948 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4949 IV count=SvIV(sv_dat);
4950 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4951 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4952 pv[count]=RExC_npar;
4953 SvIVX(sv_dat)++;
4954 } else {
4955 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4956 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4957 SvIOK_on(sv_dat);
4958 SvIVX(sv_dat)= 1;
e62cc96a
YO
4959 }
4960
81714fb9
YO
4961 /*sv_dump(sv_dat);*/
4962 }
4963 nextchar(pRExC_state);
4964 paren = 1;
4965 goto capturing_parens;
4966 }
4967 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4968 RExC_parse++;
fac92740
MJD
4969 case '=': /* (?=...) */
4970 case '!': /* (?!...) */
830247a4 4971 RExC_seen_zerolen++;
e2e6a0f1
YO
4972 if (*RExC_parse == ')') {
4973 ret=reg_node(pRExC_state, OPFAIL);
4974 nextchar(pRExC_state);
4975 return ret;
4976 }
fac92740
MJD
4977 case ':': /* (?:...) */
4978 case '>': /* (?>...) */
a0d0e21e 4979 break;
fac92740
MJD
4980 case '$': /* (?$...) */
4981 case '@': /* (?@...) */
8615cb43 4982 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4983 break;
fac92740 4984 case '#': /* (?#...) */
830247a4
IZ
4985 while (*RExC_parse && *RExC_parse != ')')
4986 RExC_parse++;
4987 if (*RExC_parse != ')')
c277df42 4988 FAIL("Sequence (?#... not terminated");
830247a4 4989 nextchar(pRExC_state);
a0d0e21e
LW
4990 *flagp = TRYAGAIN;
4991 return NULL;
894be9b7
YO
4992 case '0' : /* (?0) */
4993 case 'R' : /* (?R) */
4994 if (*RExC_parse != ')')
6bda09f9 4995 FAIL("Sequence (?R) not terminated");
1a147d38 4996 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4997 nextchar(pRExC_state);
4998 return ret;
4999 /*notreached*/
894be9b7
YO
5000 { /* named and numeric backreferences */
5001 I32 num;
5002 char * parse_start;
5003 case '&': /* (?&NAME) */
5004 parse_start = RExC_parse - 1;
5005 {
0a4db386
YO
5006 SV *sv_dat = reg_scan_name(pRExC_state,
5007 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5008 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5009 }
5010 goto gen_recurse_regop;
5011 /* NOT REACHED */
542fa716
YO
5012 case '+':
5013 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5014 RExC_parse++;
5015 vFAIL("Illegal pattern");
5016 }
5017 goto parse_recursion;
5018 /* NOT REACHED*/
5019 case '-': /* (?-1) */
5020 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5021 RExC_parse--; /* rewind to let it be handled later */
5022 goto parse_flags;
5023 }
5024 /*FALLTHROUGH */
6bda09f9
YO
5025 case '1': case '2': case '3': case '4': /* (?1) */
5026 case '5': case '6': case '7': case '8': case '9':
5027 RExC_parse--;
542fa716 5028 parse_recursion:
894be9b7
YO
5029 num = atoi(RExC_parse);
5030 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5031 if (*RExC_parse == '-')
5032 RExC_parse++;
6bda09f9
YO
5033 while (isDIGIT(*RExC_parse))
5034 RExC_parse++;
5035 if (*RExC_parse!=')')
5036 vFAIL("Expecting close bracket");
894be9b7
YO
5037
5038 gen_recurse_regop:
542fa716
YO
5039 if ( paren == '-' ) {
5040 /*
5041 Diagram of capture buffer numbering.
5042 Top line is the normal capture buffer numbers
5043 Botton line is the negative indexing as from
5044 the X (the (?-2))
5045
5046 + 1 2 3 4 5 X 6 7
5047 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5048 - 5 4 3 2 1 X x x
5049
5050 */
5051 num = RExC_npar + num;
5052 if (num < 1) {
5053 RExC_parse++;
5054 vFAIL("Reference to nonexistent group");
5055 }
5056 } else if ( paren == '+' ) {
5057 num = RExC_npar + num - 1;
5058 }
5059
1a147d38 5060 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5061 if (!SIZE_ONLY) {
5062 if (num > (I32)RExC_rx->nparens) {
5063 RExC_parse++;
5064 vFAIL("Reference to nonexistent group");
5065 }
40d049e4 5066 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5067 RExC_emit++;
226de585 5068 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5069 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5070 } else {
6bda09f9 5071 RExC_size++;
6bda09f9 5072 }
0a4db386 5073 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5074 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5075 Set_Node_Offset(ret, parse_start); /* MJD */
5076
6bda09f9
YO
5077 nextchar(pRExC_state);
5078 return ret;
894be9b7
YO
5079 } /* named and numeric backreferences */
5080 /* NOT REACHED */
5081
fac92740 5082 case 'p': /* (?p...) */
9014280d 5083 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5084 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5085 /* FALL THROUGH*/
fac92740 5086 case '?': /* (??...) */
6136c704 5087 is_logical = 1;
438a3801
YST
5088 if (*RExC_parse != '{')
5089 goto unknown;
830247a4 5090 paren = *RExC_parse++;
0f5d15d6 5091 /* FALL THROUGH */
fac92740 5092 case '{': /* (?{...}) */
c277df42 5093 {
2eccd3b2
NC
5094 I32 count = 1;
5095 U32 n = 0;
c277df42 5096 char c;
830247a4 5097 char *s = RExC_parse;
c277df42 5098
830247a4
IZ
5099 RExC_seen_zerolen++;
5100 RExC_seen |= REG_SEEN_EVAL;
5101 while (count && (c = *RExC_parse)) {
6136c704
AL
5102 if (c == '\\') {
5103 if (RExC_parse[1])
5104 RExC_parse++;
5105 }
b81d288d 5106 else if (c == '{')
c277df42 5107 count++;
b81d288d 5108 else if (c == '}')
c277df42 5109 count--;
830247a4 5110 RExC_parse++;
c277df42 5111 }
6136c704 5112 if (*RExC_parse != ')') {
b81d288d 5113 RExC_parse = s;
b45f050a
JF
5114 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5115 }
c277df42 5116 if (!SIZE_ONLY) {
f3548bdc 5117 PAD *pad;
6136c704
AL
5118 OP_4tree *sop, *rop;
5119 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5120
569233ed
SB
5121 ENTER;
5122 Perl_save_re_context(aTHX);
f3548bdc 5123 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5124 sop->op_private |= OPpREFCOUNTED;
5125 /* re_dup will OpREFCNT_inc */
5126 OpREFCNT_set(sop, 1);
569233ed 5127 LEAVE;
c277df42 5128
830247a4 5129 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5130 RExC_rxi->data->data[n] = (void*)rop;
5131 RExC_rxi->data->data[n+1] = (void*)sop;
5132 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5133 SvREFCNT_dec(sv);
a0ed51b3 5134 }
e24b16f9 5135 else { /* First pass */
830247a4 5136 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5137 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5138 /* No compiled RE interpolated, has runtime
5139 components ===> unsafe. */
5140 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5141 if (PL_tainting && PL_tainted)
cc6b7395 5142 FAIL("Eval-group in insecure regular expression");
54df2634 5143#if PERL_VERSION > 8
923e4eb5 5144 if (IN_PERL_COMPILETIME)
b5c19bd7 5145 PL_cv_has_eval = 1;
54df2634 5146#endif
c277df42 5147 }
b5c19bd7 5148
830247a4 5149 nextchar(pRExC_state);
6136c704 5150 if (is_logical) {
830247a4 5151 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5152 if (!SIZE_ONLY)
5153 ret->flags = 2;
3dab1dad 5154 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5155 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5156 return ret;
5157 }
ccb2c380
MP
5158 ret = reganode(pRExC_state, EVAL, n);
5159 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5160 Set_Node_Offset(ret, parse_start);
5161 return ret;
c277df42 5162 }
fac92740 5163 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5164 {
0a4db386 5165 int is_define= 0;
fac92740 5166 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5167 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5168 || RExC_parse[1] == '<'
830247a4 5169 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5170 I32 flag;
5171
830247a4 5172 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5173 if (!SIZE_ONLY)
5174 ret->flags = 1;
3dab1dad 5175 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5176 goto insert_if;
b81d288d 5177 }
a0ed51b3 5178 }
0a4db386
YO
5179 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5180 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5181 {
5182 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5183 char *name_start= RExC_parse++;
2eccd3b2 5184 U32 num = 0;
0a4db386
YO
5185 SV *sv_dat=reg_scan_name(pRExC_state,
5186 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5187 if (RExC_parse == name_start || *RExC_parse != ch)
5188 vFAIL2("Sequence (?(%c... not terminated",
5189 (ch == '>' ? '<' : ch));
5190 RExC_parse++;
5191 if (!SIZE_ONLY) {
5192 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5193 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5194 SvREFCNT_inc(sv_dat);
5195 }
5196 ret = reganode(pRExC_state,NGROUPP,num);
5197 goto insert_if_check_paren;
5198 }
5199 else if (RExC_parse[0] == 'D' &&
5200 RExC_parse[1] == 'E' &&
5201 RExC_parse[2] == 'F' &&
5202 RExC_parse[3] == 'I' &&
5203 RExC_parse[4] == 'N' &&
5204 RExC_parse[5] == 'E')
5205 {
5206 ret = reganode(pRExC_state,DEFINEP,0);
5207 RExC_parse +=6 ;
5208 is_define = 1;
5209 goto insert_if_check_paren;
5210 }
5211 else if (RExC_parse[0] == 'R') {
5212 RExC_parse++;
5213 parno = 0;
5214 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5215 parno = atoi(RExC_parse++);
5216 while (isDIGIT(*RExC_parse))
5217 RExC_parse++;
5218 } else if (RExC_parse[0] == '&') {
5219 SV *sv_dat;
5220 RExC_parse++;
5221 sv_dat = reg_scan_name(pRExC_state,
5222 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5223 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5224 }
1a147d38 5225 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5226 goto insert_if_check_paren;
5227 }
830247a4 5228 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5229 /* (?(1)...) */
6136c704 5230 char c;
830247a4 5231 parno = atoi(RExC_parse++);
c277df42 5232
830247a4
IZ
5233 while (isDIGIT(*RExC_parse))
5234 RExC_parse++;
fac92740 5235 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5236
0a4db386 5237 insert_if_check_paren:
830247a4 5238 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5239 vFAIL("Switch condition not recognized");
c277df42 5240 insert_if:
3dab1dad
YO
5241 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5242 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5243 if (br == NULL)
830247a4 5244 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5245 else
3dab1dad 5246 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5247 c = *nextchar(pRExC_state);
d1b80229
IZ
5248 if (flags&HASWIDTH)
5249 *flagp |= HASWIDTH;
c277df42 5250 if (c == '|') {
0a4db386
YO
5251 if (is_define)
5252 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5253 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5254 regbranch(pRExC_state, &flags, 1,depth+1);
5255 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5256 if (flags&HASWIDTH)
5257 *flagp |= HASWIDTH;
830247a4 5258 c = *nextchar(pRExC_state);
a0ed51b3
LW
5259 }
5260 else
c277df42
IZ
5261 lastbr = NULL;
5262 if (c != ')')
8615cb43 5263 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5264 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5265 REGTAIL(pRExC_state, br, ender);
c277df42 5266 if (lastbr) {
3dab1dad
YO
5267 REGTAIL(pRExC_state, lastbr, ender);
5268 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5269 }
5270 else
3dab1dad 5271 REGTAIL(pRExC_state, ret, ender);
c277df42 5272 return ret;
a0ed51b3
LW
5273 }
5274 else {
830247a4 5275 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5276 }
5277 }
1b1626e4 5278 case 0:
830247a4 5279 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5280 vFAIL("Sequence (? incomplete");
1b1626e4 5281 break;
a0d0e21e 5282 default:
830247a4 5283 --RExC_parse;
fac92740 5284 parse_flags: /* (?i) */
830247a4 5285 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5286 /* (?g), (?gc) and (?o) are useless here
5287 and must be globally applied -- japhy */
5288
5289 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5290 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5291 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5292 if (! (wastedflags & wflagbit) ) {
5293 wastedflags |= wflagbit;
5294 vWARN5(
5295 RExC_parse + 1,
5296 "Useless (%s%c) - %suse /%c modifier",
5297 flagsp == &negflags ? "?-" : "?",
5298 *RExC_parse,
5299 flagsp == &negflags ? "don't " : "",
5300 *RExC_parse
5301 );
5302 }
5303 }
5304 }
5305 else if (*RExC_parse == 'c') {
5306 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5307 if (! (wastedflags & WASTED_C) ) {
5308 wastedflags |= WASTED_GC;
9d1d55b5
JP
5309 vWARN3(
5310 RExC_parse + 1,
5311 "Useless (%sc) - %suse /gc modifier",
5312 flagsp == &negflags ? "?-" : "?",
5313 flagsp == &negflags ? "don't " : ""
5314 );
5315 }
5316 }
5317 }
5318 else { pmflag(flagsp, *RExC_parse); }
5319
830247a4 5320 ++RExC_parse;
ca9dfc88 5321 }
830247a4 5322 if (*RExC_parse == '-') {
ca9dfc88 5323 flagsp = &negflags;
9d1d55b5 5324 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5325 ++RExC_parse;
ca9dfc88 5326 goto parse_flags;
48c036b1 5327 }
e2509266
JH
5328 RExC_flags |= posflags;
5329 RExC_flags &= ~negflags;
830247a4
IZ
5330 if (*RExC_parse == ':') {
5331 RExC_parse++;
ca9dfc88
IZ
5332 paren = ':';
5333 break;
5334 }
c277df42 5335 unknown:
830247a4
IZ
5336 if (*RExC_parse != ')') {
5337 RExC_parse++;
5338 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5339 }
830247a4 5340 nextchar(pRExC_state);
a0d0e21e
LW
5341 *flagp = TRYAGAIN;
5342 return NULL;
5343 }
5344 }
fac92740 5345 else { /* (...) */
81714fb9 5346 capturing_parens:
830247a4
IZ
5347 parno = RExC_npar;
5348 RExC_npar++;
e2e6a0f1 5349
830247a4 5350 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5351 if (!SIZE_ONLY ){
5352 if (!RExC_nestroot)
5353 RExC_nestroot = parno;
5354 if (RExC_seen & REG_SEEN_RECURSE) {
5355 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5356 "Setting open paren #%"IVdf" to %d\n",
5357 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5358 RExC_open_parens[parno-1]= ret;
5359 }
6bda09f9 5360 }
fac92740
MJD
5361 Set_Node_Length(ret, 1); /* MJD */
5362 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5363 is_open = 1;
a0d0e21e 5364 }
a0ed51b3 5365 }
fac92740 5366 else /* ! paren */
a0d0e21e
LW
5367 ret = NULL;
5368
5369 /* Pick up the branches, linking them together. */
fac92740 5370 parse_start = RExC_parse; /* MJD */
3dab1dad 5371 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5372 /* branch_len = (paren != 0); */
2af232bd 5373
a0d0e21e
LW
5374 if (br == NULL)
5375 return(NULL);
830247a4
IZ
5376 if (*RExC_parse == '|') {
5377 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5378 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5379 }
fac92740 5380 else { /* MJD */
6bda09f9 5381 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5382 Set_Node_Length(br, paren != 0);
5383 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5384 }
c277df42
IZ
5385 have_branch = 1;
5386 if (SIZE_ONLY)
830247a4 5387 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5388 }
5389 else if (paren == ':') {
c277df42
IZ
5390 *flagp |= flags&SIMPLE;
5391 }
6136c704 5392 if (is_open) { /* Starts with OPEN. */
3dab1dad 5393 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5394 }
5395 else if (paren != '?') /* Not Conditional */
a0d0e21e 5396 ret = br;
32a0ca98 5397 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5398 lastbr = br;
830247a4
IZ
5399 while (*RExC_parse == '|') {
5400 if (!SIZE_ONLY && RExC_extralen) {
5401 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5402 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5403 }
5404 if (SIZE_ONLY)
830247a4
IZ
5405 RExC_extralen += 2; /* Account for LONGJMP. */
5406 nextchar(pRExC_state);
3dab1dad 5407 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5408
a687059c 5409 if (br == NULL)
a0d0e21e 5410 return(NULL);
3dab1dad 5411 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5412 lastbr = br;
821b33a5
IZ
5413 if (flags&HASWIDTH)
5414 *flagp |= HASWIDTH;
a687059c 5415 *flagp |= flags&SPSTART;
a0d0e21e
LW
5416 }
5417
c277df42
IZ
5418 if (have_branch || paren != ':') {
5419 /* Make a closing node, and hook it on the end. */
5420 switch (paren) {
5421 case ':':
830247a4 5422 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5423 break;
5424 case 1:
830247a4 5425 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5426 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5427 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5428 "Setting close paren #%"IVdf" to %d\n",
5429 (IV)parno, REG_NODE_NUM(ender)));
5430 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5431 if (RExC_nestroot == parno)
5432 RExC_nestroot = 0;
40d049e4 5433 }
fac92740
MJD
5434 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5435 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5436 break;
5437 case '<':
c277df42
IZ
5438 case ',':
5439 case '=':
5440 case '!':
c277df42 5441 *flagp &= ~HASWIDTH;
821b33a5
IZ
5442 /* FALL THROUGH */
5443 case '>':
830247a4 5444 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5445 break;
5446 case 0:
830247a4 5447 ender = reg_node(pRExC_state, END);
40d049e4
YO
5448 if (!SIZE_ONLY) {
5449 assert(!RExC_opend); /* there can only be one! */
5450 RExC_opend = ender;
5451 }
c277df42
IZ
5452 break;
5453 }
eaf3ca90 5454 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5455
9674d46a 5456 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5457 if (depth==1)
5458 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5459
c277df42 5460 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5461 for (br = ret; br; br = regnext(br)) {
5462 const U8 op = PL_regkind[OP(br)];
5463 if (op == BRANCH) {
07be1b83 5464 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5465 }
5466 else if (op == BRANCHJ) {
07be1b83 5467 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5468 }
c277df42
IZ
5469 }
5470 }
a0d0e21e 5471 }
c277df42
IZ
5472
5473 {
e1ec3a88
AL
5474 const char *p;
5475 static const char parens[] = "=!<,>";
c277df42
IZ
5476
5477 if (paren && (p = strchr(parens, paren))) {
eb160463 5478 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5479 int flag = (p - parens) > 1;
5480
5481 if (paren == '>')
5482 node = SUSPEND, flag = 0;
6bda09f9 5483 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5484 Set_Node_Cur_Length(ret);
5485 Set_Node_Offset(ret, parse_start + 1);
c277df42 5486 ret->flags = flag;
07be1b83 5487 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5488 }
a0d0e21e
LW
5489 }
5490
5491 /* Check for proper termination. */
ce3e6498 5492 if (paren) {
e2509266 5493 RExC_flags = oregflags;
830247a4
IZ
5494 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5495 RExC_parse = oregcomp_parse;
380a0633 5496 vFAIL("Unmatched (");
ce3e6498 5497 }
a0ed51b3 5498 }
830247a4
IZ
5499 else if (!paren && RExC_parse < RExC_end) {
5500 if (*RExC_parse == ')') {
5501 RExC_parse++;
380a0633 5502 vFAIL("Unmatched )");
a0ed51b3
LW
5503 }
5504 else
b45f050a 5505 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5506 /* NOTREACHED */
5507 }
a687059c 5508
a0d0e21e 5509 return(ret);
a687059c
LW
5510}
5511
5512/*
5513 - regbranch - one alternative of an | operator
5514 *
5515 * Implements the concatenation operator.
5516 */
76e3520e 5517STATIC regnode *
3dab1dad 5518S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5519{
97aff369 5520 dVAR;
c277df42
IZ
5521 register regnode *ret;
5522 register regnode *chain = NULL;
5523 register regnode *latest;
5524 I32 flags = 0, c = 0;
3dab1dad
YO
5525 GET_RE_DEBUG_FLAGS_DECL;
5526 DEBUG_PARSE("brnc");
b81d288d 5527 if (first)
c277df42
IZ
5528 ret = NULL;
5529 else {
b81d288d 5530 if (!SIZE_ONLY && RExC_extralen)
830247a4 5531 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5532 else {
830247a4 5533 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5534 Set_Node_Length(ret, 1);
5535 }
c277df42
IZ
5536 }
5537
b81d288d 5538 if (!first && SIZE_ONLY)
830247a4 5539 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5540
c277df42 5541 *flagp = WORST; /* Tentatively. */
a0d0e21e 5542
830247a4
IZ
5543 RExC_parse--;
5544 nextchar(pRExC_state);
5545 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5546 flags &= ~TRYAGAIN;
3dab1dad 5547 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5548 if (latest == NULL) {
5549 if (flags & TRYAGAIN)
5550 continue;
5551 return(NULL);
a0ed51b3
LW
5552 }
5553 else if (ret == NULL)
c277df42 5554 ret = latest;
a0d0e21e 5555 *flagp |= flags&HASWIDTH;
c277df42 5556 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5557 *flagp |= flags&SPSTART;
5558 else {
830247a4 5559 RExC_naughty++;
3dab1dad 5560 REGTAIL(pRExC_state, chain, latest);
a687059c 5561 }
a0d0e21e 5562 chain = latest;
c277df42
IZ
5563 c++;
5564 }
5565 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5566 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5567 if (ret == NULL)
5568 ret = chain;
5569 }
5570 if (c == 1) {
5571 *flagp |= flags&SIMPLE;
a0d0e21e 5572 }
a687059c 5573
d4c19fe8 5574 return ret;
a687059c
LW
5575}
5576
5577/*
5578 - regpiece - something followed by possible [*+?]
5579 *
5580 * Note that the branching code sequences used for ? and the general cases
5581 * of * and + are somewhat optimized: they use the same NOTHING node as
5582 * both the endmarker for their branch list and the body of the last branch.
5583 * It might seem that this node could be dispensed with entirely, but the
5584 * endmarker role is not redundant.
5585 */
76e3520e 5586STATIC regnode *
3dab1dad 5587S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5588{
97aff369 5589 dVAR;
c277df42 5590 register regnode *ret;
a0d0e21e
LW
5591 register char op;
5592 register char *next;
5593 I32 flags;
1df70142 5594 const char * const origparse = RExC_parse;
a0d0e21e 5595 I32 min;
c277df42 5596 I32 max = REG_INFTY;
fac92740 5597 char *parse_start;
10edeb5d 5598 const char *maxpos = NULL;
3dab1dad
YO
5599 GET_RE_DEBUG_FLAGS_DECL;
5600 DEBUG_PARSE("piec");
a0d0e21e 5601
3dab1dad 5602 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5603 if (ret == NULL) {
5604 if (flags & TRYAGAIN)
5605 *flagp |= TRYAGAIN;
5606 return(NULL);
5607 }
5608
830247a4 5609 op = *RExC_parse;
a0d0e21e 5610
830247a4 5611 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5612 maxpos = NULL;
fac92740 5613 parse_start = RExC_parse; /* MJD */
830247a4 5614 next = RExC_parse + 1;
a0d0e21e
LW
5615 while (isDIGIT(*next) || *next == ',') {
5616 if (*next == ',') {
5617 if (maxpos)
5618 break;
5619 else
5620 maxpos = next;
a687059c 5621 }
a0d0e21e
LW
5622 next++;
5623 }
5624 if (*next == '}') { /* got one */
5625 if (!maxpos)
5626 maxpos = next;
830247a4
IZ
5627 RExC_parse++;
5628 min = atoi(RExC_parse);
a0d0e21e
LW
5629 if (*maxpos == ',')
5630 maxpos++;
5631 else
830247a4 5632 maxpos = RExC_parse;
a0d0e21e
LW
5633 max = atoi(maxpos);
5634 if (!max && *maxpos != '0')
c277df42
IZ
5635 max = REG_INFTY; /* meaning "infinity" */
5636 else if (max >= REG_INFTY)
8615cb43 5637 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5638 RExC_parse = next;
5639 nextchar(pRExC_state);
a0d0e21e
LW
5640
5641 do_curly:
5642 if ((flags&SIMPLE)) {
830247a4 5643 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5644 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5645 Set_Node_Offset(ret, parse_start+1); /* MJD */
5646 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5647 }
5648 else {
3dab1dad 5649 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5650
5651 w->flags = 0;
3dab1dad 5652 REGTAIL(pRExC_state, ret, w);
830247a4 5653 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5654 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5655 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5656 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5657 }
6bda09f9 5658 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5659 /* MJD hk */
5660 Set_Node_Offset(ret, parse_start+1);
2af232bd 5661 Set_Node_Length(ret,
fac92740 5662 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5663
830247a4 5664 if (!SIZE_ONLY && RExC_extralen)
c277df42 5665 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5666 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5667 if (SIZE_ONLY)
830247a4
IZ
5668 RExC_whilem_seen++, RExC_extralen += 3;
5669 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5670 }
c277df42 5671 ret->flags = 0;
a0d0e21e
LW
5672
5673 if (min > 0)
821b33a5
IZ
5674 *flagp = WORST;
5675 if (max > 0)
5676 *flagp |= HASWIDTH;
a0d0e21e 5677 if (max && max < min)
8615cb43 5678 vFAIL("Can't do {n,m} with n > m");
c277df42 5679 if (!SIZE_ONLY) {
eb160463
GS
5680 ARG1_SET(ret, (U16)min);
5681 ARG2_SET(ret, (U16)max);
a687059c 5682 }
a687059c 5683
a0d0e21e 5684 goto nest_check;
a687059c 5685 }
a0d0e21e 5686 }
a687059c 5687
a0d0e21e
LW
5688 if (!ISMULT1(op)) {
5689 *flagp = flags;
a687059c 5690 return(ret);
a0d0e21e 5691 }
bb20fd44 5692
c277df42 5693#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5694
5695 /* if this is reinstated, don't forget to put this back into perldiag:
5696
5697 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5698
5699 (F) The part of the regexp subject to either the * or + quantifier
5700 could match an empty string. The {#} shows in the regular
5701 expression about where the problem was discovered.
5702
5703 */
5704
bb20fd44 5705 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5706 vFAIL("Regexp *+ operand could be empty");
b81d288d 5707#endif
bb20fd44 5708
fac92740 5709 parse_start = RExC_parse;
830247a4 5710 nextchar(pRExC_state);
a0d0e21e 5711
821b33a5 5712 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5713
5714 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5715 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5716 ret->flags = 0;
830247a4 5717 RExC_naughty += 4;
a0d0e21e
LW
5718 }
5719 else if (op == '*') {
5720 min = 0;
5721 goto do_curly;
a0ed51b3
LW
5722 }
5723 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5724 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5725 ret->flags = 0;
830247a4 5726 RExC_naughty += 3;
a0d0e21e
LW
5727 }
5728 else if (op == '+') {
5729 min = 1;
5730 goto do_curly;
a0ed51b3
LW
5731 }
5732 else if (op == '?') {
a0d0e21e
LW
5733 min = 0; max = 1;
5734 goto do_curly;
5735 }
5736 nest_check:
041457d9 5737 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5738 vWARN3(RExC_parse,
b45f050a 5739 "%.*s matches null string many times",
afd78fd5 5740 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5741 origparse);
a0d0e21e
LW
5742 }
5743
b9b4dddf 5744 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5745 nextchar(pRExC_state);
6bda09f9 5746 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5747 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5748 }
b9b4dddf
YO
5749#ifndef REG_ALLOW_MINMOD_SUSPEND
5750 else
5751#endif
5752 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5753 regnode *ender;
5754 nextchar(pRExC_state);
5755 ender = reg_node(pRExC_state, SUCCEED);
5756 REGTAIL(pRExC_state, ret, ender);
5757 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5758 ret->flags = 0;
5759 ender = reg_node(pRExC_state, TAIL);
5760 REGTAIL(pRExC_state, ret, ender);
5761 /*ret= ender;*/
5762 }
5763
5764 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5765 RExC_parse++;
b45f050a
JF
5766 vFAIL("Nested quantifiers");
5767 }
a0d0e21e
LW
5768
5769 return(ret);
a687059c
LW
5770}
5771
fc8cd66c
YO
5772
5773/* reg_namedseq(pRExC_state,UVp)
5774
5775 This is expected to be called by a parser routine that has
5776 recognized'\N' and needs to handle the rest. RExC_parse is
5777 expected to point at the first char following the N at the time
5778 of the call.
5779
5780 If valuep is non-null then it is assumed that we are parsing inside
5781 of a charclass definition and the first codepoint in the resolved
5782 string is returned via *valuep and the routine will return NULL.
5783 In this mode if a multichar string is returned from the charnames
5784 handler a warning will be issued, and only the first char in the
5785 sequence will be examined. If the string returned is zero length
5786 then the value of *valuep is undefined and NON-NULL will
5787 be returned to indicate failure. (This will NOT be a valid pointer
5788 to a regnode.)
5789
5790 If value is null then it is assumed that we are parsing normal text
5791 and inserts a new EXACT node into the program containing the resolved
5792 string and returns a pointer to the new node. If the string is
5793 zerolength a NOTHING node is emitted.
5794
5795 On success RExC_parse is set to the char following the endbrace.
5796 Parsing failures will generate a fatal errorvia vFAIL(...)
5797
5798 NOTE: We cache all results from the charnames handler locally in
5799 the RExC_charnames hash (created on first use) to prevent a charnames
5800 handler from playing silly-buggers and returning a short string and
5801 then a long string for a given pattern. Since the regexp program
5802 size is calculated during an initial parse this would result
5803 in a buffer overrun so we cache to prevent the charname result from
5804 changing during the course of the parse.
5805
5806 */
5807STATIC regnode *
5808S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5809{
5810 char * name; /* start of the content of the name */
5811 char * endbrace; /* endbrace following the name */
5812 SV *sv_str = NULL;
5813 SV *sv_name = NULL;
5814 STRLEN len; /* this has various purposes throughout the code */
5815 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5816 regnode *ret = NULL;
5817
5818 if (*RExC_parse != '{') {
5819 vFAIL("Missing braces on \\N{}");
5820 }
5821 name = RExC_parse+1;
5822 endbrace = strchr(RExC_parse, '}');
5823 if ( ! endbrace ) {
5824 RExC_parse++;
5825 vFAIL("Missing right brace on \\N{}");
5826 }
5827 RExC_parse = endbrace + 1;
5828
5829
5830 /* RExC_parse points at the beginning brace,
5831 endbrace points at the last */
5832 if ( name[0]=='U' && name[1]=='+' ) {
5833 /* its a "unicode hex" notation {U+89AB} */
5834 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5835 | PERL_SCAN_DISALLOW_PREFIX
5836 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5837 UV cp;
196f1508 5838 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5839 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5840 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5841 cp = 0xFFFD;
5842 }
5843 if (cp > 0xff)
5844 RExC_utf8 = 1;
5845 if ( valuep ) {
5846 *valuep = cp;
5847 return NULL;
5848 }
5849 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5850 } else {
5851 /* fetch the charnames handler for this scope */
5852 HV * const table = GvHV(PL_hintgv);
5853 SV **cvp= table ?
5854 hv_fetchs(table, "charnames", FALSE) :
5855 NULL;
5856 SV *cv= cvp ? *cvp : NULL;
5857 HE *he_str;
5858 int count;
5859 /* create an SV with the name as argument */
5860 sv_name = newSVpvn(name, endbrace - name);
5861
5862 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5863 vFAIL2("Constant(\\N{%s}) unknown: "
5864 "(possibly a missing \"use charnames ...\")",
5865 SvPVX(sv_name));
5866 }
5867 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5868 vFAIL2("Constant(\\N{%s}): "
5869 "$^H{charnames} is not defined",SvPVX(sv_name));
5870 }
5871
5872
5873
5874 if (!RExC_charnames) {
5875 /* make sure our cache is allocated */
5876 RExC_charnames = newHV();
6bda09f9 5877 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5878 }
5879 /* see if we have looked this one up before */
5880 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5881 if ( he_str ) {
5882 sv_str = HeVAL(he_str);
5883 cached = 1;
5884 } else {
5885 dSP ;
5886
5887 ENTER ;
5888 SAVETMPS ;
5889 PUSHMARK(SP) ;
5890
5891 XPUSHs(sv_name);
5892
5893 PUTBACK ;
5894
5895 count= call_sv(cv, G_SCALAR);
5896
5897 if (count == 1) { /* XXXX is this right? dmq */
5898 sv_str = POPs;
5899 SvREFCNT_inc_simple_void(sv_str);
5900 }
5901
5902 SPAGAIN ;
5903 PUTBACK ;
5904 FREETMPS ;
5905 LEAVE ;
5906
5907 if ( !sv_str || !SvOK(sv_str) ) {
5908 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5909 "did not return a defined value",SvPVX(sv_name));
5910 }
5911 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5912 cached = 1;
5913 }
5914 }
5915 if (valuep) {
5916 char *p = SvPV(sv_str, len);
5917 if (len) {
5918 STRLEN numlen = 1;
5919 if ( SvUTF8(sv_str) ) {
196f1508 5920 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5921 if (*valuep > 0x7F)
5922 RExC_utf8 = 1;
5923 /* XXXX
5924 We have to turn on utf8 for high bit chars otherwise
5925 we get failures with
5926
5927 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5928 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5929
5930 This is different from what \x{} would do with the same
5931 codepoint, where the condition is > 0xFF.
5932 - dmq
5933 */
5934
5935
5936 } else {
5937 *valuep = (UV)*p;
5938 /* warn if we havent used the whole string? */
5939 }
5940 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5941 vWARN2(RExC_parse,
5942 "Ignoring excess chars from \\N{%s} in character class",
5943 SvPVX(sv_name)
5944 );
5945 }
5946 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5947 vWARN2(RExC_parse,
5948 "Ignoring zero length \\N{%s} in character class",
5949 SvPVX(sv_name)
5950 );
5951 }
5952 if (sv_name)
5953 SvREFCNT_dec(sv_name);
5954 if (!cached)
5955 SvREFCNT_dec(sv_str);
5956 return len ? NULL : (regnode *)&len;
5957 } else if(SvCUR(sv_str)) {
5958
5959 char *s;
5960 char *p, *pend;
5961 STRLEN charlen = 1;
5962 char * parse_start = name-3; /* needed for the offsets */
5963 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5964
5965 ret = reg_node(pRExC_state,
5966 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5967 s= STRING(ret);
5968
5969 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5970 sv_utf8_upgrade(sv_str);
5971 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5972 RExC_utf8= 1;
5973 }
5974
5975 p = SvPV(sv_str, len);
5976 pend = p + len;
5977 /* len is the length written, charlen is the size the char read */
5978 for ( len = 0; p < pend; p += charlen ) {
5979 if (UTF) {
196f1508 5980 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5981 if (FOLD) {
5982 STRLEN foldlen,numlen;
5983 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5984 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5985 /* Emit all the Unicode characters. */
5986
5987 for (foldbuf = tmpbuf;
5988 foldlen;
5989 foldlen -= numlen)
5990 {
5991 uvc = utf8_to_uvchr(foldbuf, &numlen);
5992 if (numlen > 0) {
5993 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5994 s += unilen;
5995 len += unilen;
5996 /* In EBCDIC the numlen
5997 * and unilen can differ. */
5998 foldbuf += numlen;
5999 if (numlen >= foldlen)
6000 break;
6001 }
6002 else
6003 break; /* "Can't happen." */
6004 }
6005 } else {
6006 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6007 if (unilen > 0) {
6008 s += unilen;
6009 len += unilen;
6010 }
6011 }
6012 } else {
6013 len++;
6014 REGC(*p, s++);
6015 }
6016 }
6017 if (SIZE_ONLY) {
6018 RExC_size += STR_SZ(len);
6019 } else {
6020 STR_LEN(ret) = len;
6021 RExC_emit += STR_SZ(len);
6022 }
6023 Set_Node_Cur_Length(ret); /* MJD */
6024 RExC_parse--;
6025 nextchar(pRExC_state);
6026 } else {
6027 ret = reg_node(pRExC_state,NOTHING);
6028 }
6029 if (!cached) {
6030 SvREFCNT_dec(sv_str);
6031 }
6032 if (sv_name) {
6033 SvREFCNT_dec(sv_name);
6034 }
6035 return ret;
6036
6037}
6038
6039
9e08bc66
TS
6040/*
6041 * reg_recode
6042 *
6043 * It returns the code point in utf8 for the value in *encp.
6044 * value: a code value in the source encoding
6045 * encp: a pointer to an Encode object
6046 *
6047 * If the result from Encode is not a single character,
6048 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6049 */
6050STATIC UV
6051S_reg_recode(pTHX_ const char value, SV **encp)
6052{
6053 STRLEN numlen = 1;
6054 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6055 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6056 : SvPVX(sv);
6057 const STRLEN newlen = SvCUR(sv);
6058 UV uv = UNICODE_REPLACEMENT;
6059
6060 if (newlen)
6061 uv = SvUTF8(sv)
6062 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6063 : *(U8*)s;
6064
6065 if (!newlen || numlen != newlen) {
6066 uv = UNICODE_REPLACEMENT;
6067 if (encp)
6068 *encp = NULL;
6069 }
6070 return uv;
6071}
6072
fc8cd66c 6073
a687059c
LW
6074/*
6075 - regatom - the lowest level
6076 *
6077 * Optimization: gobbles an entire sequence of ordinary characters so that
6078 * it can turn them into a single node, which is smaller to store and
6079 * faster to run. Backslashed characters are exceptions, each becoming a
6080 * separate node; the code is simpler that way and it's not worth fixing.
6081 *
7f6f358c
YO
6082 * [Yes, it is worth fixing, some scripts can run twice the speed.]
6083 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6084 */
76e3520e 6085STATIC regnode *
3dab1dad 6086S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6087{
97aff369 6088 dVAR;
cbbf8932 6089 register regnode *ret = NULL;
a0d0e21e 6090 I32 flags;
45948336 6091 char *parse_start = RExC_parse;
3dab1dad
YO
6092 GET_RE_DEBUG_FLAGS_DECL;
6093 DEBUG_PARSE("atom");
a0d0e21e
LW
6094 *flagp = WORST; /* Tentatively. */
6095
6096tryagain:
830247a4 6097 switch (*RExC_parse) {
a0d0e21e 6098 case '^':
830247a4
IZ
6099 RExC_seen_zerolen++;
6100 nextchar(pRExC_state);
bbe252da 6101 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6102 ret = reg_node(pRExC_state, MBOL);
bbe252da 6103 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6104 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6105 else
830247a4 6106 ret = reg_node(pRExC_state, BOL);
fac92740 6107 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6108 break;
6109 case '$':
830247a4 6110 nextchar(pRExC_state);
b81d288d 6111 if (*RExC_parse)
830247a4 6112 RExC_seen_zerolen++;
bbe252da 6113 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6114 ret = reg_node(pRExC_state, MEOL);
bbe252da 6115 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6116 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6117 else
830247a4 6118 ret = reg_node(pRExC_state, EOL);
fac92740 6119 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6120 break;
6121 case '.':
830247a4 6122 nextchar(pRExC_state);
bbe252da 6123 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6124 ret = reg_node(pRExC_state, SANY);
6125 else
6126 ret = reg_node(pRExC_state, REG_ANY);
6127 *flagp |= HASWIDTH|SIMPLE;
830247a4 6128 RExC_naughty++;
fac92740 6129 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6130 break;
6131 case '[':
b45f050a 6132 {
3dab1dad
YO
6133 char * const oregcomp_parse = ++RExC_parse;
6134 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6135 if (*RExC_parse != ']') {
6136 RExC_parse = oregcomp_parse;
b45f050a
JF
6137 vFAIL("Unmatched [");
6138 }
830247a4 6139 nextchar(pRExC_state);
a0d0e21e 6140 *flagp |= HASWIDTH|SIMPLE;
fac92740 6141 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6142 break;
b45f050a 6143 }
a0d0e21e 6144 case '(':
830247a4 6145 nextchar(pRExC_state);
3dab1dad 6146 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6147 if (ret == NULL) {
bf93d4cc 6148 if (flags & TRYAGAIN) {
830247a4 6149 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6150 /* Make parent create an empty node if needed. */
6151 *flagp |= TRYAGAIN;
6152 return(NULL);
6153 }
a0d0e21e 6154 goto tryagain;
bf93d4cc 6155 }
a0d0e21e
LW
6156 return(NULL);
6157 }
c277df42 6158 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6159 break;
6160 case '|':
6161 case ')':
6162 if (flags & TRYAGAIN) {
6163 *flagp |= TRYAGAIN;
6164 return NULL;
6165 }
b45f050a 6166 vFAIL("Internal urp");
a0d0e21e
LW
6167 /* Supposed to be caught earlier. */
6168 break;
85afd4ae 6169 case '{':
830247a4
IZ
6170 if (!regcurly(RExC_parse)) {
6171 RExC_parse++;
85afd4ae
CS
6172 goto defchar;
6173 }
6174 /* FALL THROUGH */
a0d0e21e
LW
6175 case '?':
6176 case '+':
6177 case '*':
830247a4 6178 RExC_parse++;
b45f050a 6179 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6180 break;
6181 case '\\':
830247a4 6182 switch (*++RExC_parse) {
a0d0e21e 6183 case 'A':
830247a4
IZ
6184 RExC_seen_zerolen++;
6185 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6186 *flagp |= SIMPLE;
830247a4 6187 nextchar(pRExC_state);
fac92740 6188 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6189 break;
6190 case 'G':
830247a4
IZ
6191 ret = reg_node(pRExC_state, GPOS);
6192 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6193 *flagp |= SIMPLE;
830247a4 6194 nextchar(pRExC_state);
fac92740 6195 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6196 break;
6197 case 'Z':
830247a4 6198 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6199 *flagp |= SIMPLE;
a1917ab9 6200 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6201 nextchar(pRExC_state);
a0d0e21e 6202 break;
b85d18e9 6203 case 'z':
830247a4 6204 ret = reg_node(pRExC_state, EOS);
b85d18e9 6205 *flagp |= SIMPLE;
830247a4
IZ
6206 RExC_seen_zerolen++; /* Do not optimize RE away */
6207 nextchar(pRExC_state);
fac92740 6208 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6209 break;
4a2d328f 6210 case 'C':
f33976b4
DB
6211 ret = reg_node(pRExC_state, CANY);
6212 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6213 *flagp |= HASWIDTH|SIMPLE;
830247a4 6214 nextchar(pRExC_state);
fac92740 6215 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6216 break;
6217 case 'X':
830247a4 6218 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6219 *flagp |= HASWIDTH;
830247a4 6220 nextchar(pRExC_state);
fac92740 6221 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6222 break;
a0d0e21e 6223 case 'w':
eb160463 6224 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6225 *flagp |= HASWIDTH|SIMPLE;
830247a4 6226 nextchar(pRExC_state);
fac92740 6227 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6228 break;
6229 case 'W':
eb160463 6230 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6231 *flagp |= HASWIDTH|SIMPLE;
830247a4 6232 nextchar(pRExC_state);
fac92740 6233 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6234 break;
6235 case 'b':
830247a4
IZ
6236 RExC_seen_zerolen++;
6237 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6238 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6239 *flagp |= SIMPLE;
830247a4 6240 nextchar(pRExC_state);
fac92740 6241 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6242 break;
6243 case 'B':
830247a4
IZ
6244 RExC_seen_zerolen++;
6245 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6246 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6247 *flagp |= SIMPLE;
830247a4 6248 nextchar(pRExC_state);
fac92740 6249 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6250 break;
6251 case 's':
eb160463 6252 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6253 *flagp |= HASWIDTH|SIMPLE;
830247a4 6254 nextchar(pRExC_state);
fac92740 6255 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6256 break;
6257 case 'S':
eb160463 6258 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6259 *flagp |= HASWIDTH|SIMPLE;
830247a4 6260 nextchar(pRExC_state);
fac92740 6261 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6262 break;
6263 case 'd':
ffc61ed2 6264 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6265 *flagp |= HASWIDTH|SIMPLE;
830247a4 6266 nextchar(pRExC_state);
fac92740 6267 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6268 break;
6269 case 'D':
ffc61ed2 6270 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6271 *flagp |= HASWIDTH|SIMPLE;
830247a4 6272 nextchar(pRExC_state);
fac92740 6273 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6274 break;
a14b48bc
LW
6275 case 'p':
6276 case 'P':
3568d838 6277 {
3dab1dad 6278 char* const oldregxend = RExC_end;
ccb2c380 6279 char* parse_start = RExC_parse - 2;
a14b48bc 6280
830247a4 6281 if (RExC_parse[1] == '{') {
3568d838 6282 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6283 RExC_end = strchr(RExC_parse, '}');
6284 if (!RExC_end) {
3dab1dad 6285 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6286 RExC_parse += 2;
6287 RExC_end = oldregxend;
0da60cf5 6288 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6289 }
830247a4 6290 RExC_end++;
a14b48bc 6291 }
af6f566e 6292 else {
830247a4 6293 RExC_end = RExC_parse + 2;
af6f566e
HS
6294 if (RExC_end > oldregxend)
6295 RExC_end = oldregxend;
6296 }
830247a4 6297 RExC_parse--;
a14b48bc 6298
3dab1dad 6299 ret = regclass(pRExC_state,depth+1);
a14b48bc 6300
830247a4
IZ
6301 RExC_end = oldregxend;
6302 RExC_parse--;
ccb2c380
MP
6303
6304 Set_Node_Offset(ret, parse_start + 2);
6305 Set_Node_Cur_Length(ret);
830247a4 6306 nextchar(pRExC_state);
a14b48bc
LW
6307 *flagp |= HASWIDTH|SIMPLE;
6308 }
6309 break;
fc8cd66c
YO
6310 case 'N':
6311 /* Handle \N{NAME} here and not below because it can be
6312 multicharacter. join_exact() will join them up later on.
6313 Also this makes sure that things like /\N{BLAH}+/ and
6314 \N{BLAH} being multi char Just Happen. dmq*/
6315 ++RExC_parse;
6316 ret= reg_namedseq(pRExC_state, NULL);
6317 break;
0a4db386 6318 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6319 {
6320 char ch= RExC_parse[1];
6321 if (ch != '<' && ch != '\'') {
6322 if (SIZE_ONLY)
6323 vWARN( RExC_parse + 1,
6324 "Possible broken named back reference treated as literal k");
6325 parse_start--;
6326 goto defchar;
6327 } else {
6328 char* name_start = (RExC_parse += 2);
2eccd3b2 6329 U32 num = 0;
0a4db386
YO
6330 SV *sv_dat = reg_scan_name(pRExC_state,
6331 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6332 ch= (ch == '<') ? '>' : '\'';
6333
81714fb9
YO
6334 if (RExC_parse == name_start || *RExC_parse != ch)
6335 vFAIL2("Sequence \\k%c... not terminated",
6336 (ch == '>' ? '<' : ch));
6337
6338 RExC_sawback = 1;
6339 ret = reganode(pRExC_state,
6340 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6341 num);
6342 *flagp |= HASWIDTH;
6343
6344
6345 if (!SIZE_ONLY) {
81714fb9
YO
6346 num = add_data( pRExC_state, 1, "S" );
6347 ARG_SET(ret,num);
f8fc2ecf 6348 RExC_rxi->data->data[num]=(void*)sv_dat;
81714fb9
YO
6349 SvREFCNT_inc(sv_dat);
6350 }
6351 /* override incorrect value set in reganode MJD */
6352 Set_Node_Offset(ret, parse_start+1);
6353 Set_Node_Cur_Length(ret); /* MJD */
6354 nextchar(pRExC_state);
6355
6356 }
6357 break;
6358 }
a0d0e21e
LW
6359 case 'n':
6360 case 'r':
6361 case 't':
6362 case 'f':
6363 case 'e':
6364 case 'a':
6365 case 'x':
6366 case 'c':
6367 case '0':
6368 goto defchar;
2bf803e2 6369 case 'g':
a0d0e21e
LW
6370 case '1': case '2': case '3': case '4':
6371 case '5': case '6': case '7': case '8': case '9':
6372 {
c74340f9 6373 I32 num;
2bf803e2
YO
6374 bool isg = *RExC_parse == 'g';
6375 bool isrel = 0;
6376 bool hasbrace = 0;
6377 if (isg) {
c74340f9 6378 RExC_parse++;
2bf803e2
YO
6379 if (*RExC_parse == '{') {
6380 RExC_parse++;
6381 hasbrace = 1;
6382 }
6383 if (*RExC_parse == '-') {
6384 RExC_parse++;
6385 isrel = 1;
6386 }
6387 }
c74340f9
YO
6388 num = atoi(RExC_parse);
6389 if (isrel) {
5624f11d 6390 num = RExC_npar - num;
c74340f9
YO
6391 if (num < 1)
6392 vFAIL("Reference to nonexistent or unclosed group");
6393 }
2bf803e2 6394 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6395 goto defchar;
6396 else {
3dab1dad 6397 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6398 while (isDIGIT(*RExC_parse))
6399 RExC_parse++;
2bf803e2
YO
6400 if (hasbrace) {
6401 if (*RExC_parse != '}')
6402 vFAIL("Unterminated \\g{...} pattern");
6403 RExC_parse++;
6404 }
c74340f9
YO
6405 if (!SIZE_ONLY) {
6406 if (num > (I32)RExC_rx->nparens)
6407 vFAIL("Reference to nonexistent group");
c74340f9 6408 }
830247a4 6409 RExC_sawback = 1;
eb160463
GS
6410 ret = reganode(pRExC_state,
6411 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6412 num);
a0d0e21e 6413 *flagp |= HASWIDTH;
2af232bd 6414
fac92740 6415 /* override incorrect value set in reganode MJD */
2af232bd 6416 Set_Node_Offset(ret, parse_start+1);
fac92740 6417 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6418 RExC_parse--;
6419 nextchar(pRExC_state);
a0d0e21e
LW
6420 }
6421 }
6422 break;
6423 case '\0':
830247a4 6424 if (RExC_parse >= RExC_end)
b45f050a 6425 FAIL("Trailing \\");
a0d0e21e
LW
6426 /* FALL THROUGH */
6427 default:
a0288114 6428 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6429 back into the quick-grab loop below */
45948336 6430 parse_start--;
a0d0e21e
LW
6431 goto defchar;
6432 }
6433 break;
4633a7c4
LW
6434
6435 case '#':
bbe252da 6436 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6437 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6438 RExC_parse++;
830247a4 6439 if (RExC_parse < RExC_end)
4633a7c4
LW
6440 goto tryagain;
6441 }
6442 /* FALL THROUGH */
6443
a0d0e21e 6444 default: {
ba210ebe 6445 register STRLEN len;
58ae7d3f 6446 register UV ender;
a0d0e21e 6447 register char *p;
3dab1dad 6448 char *s;
80aecb99 6449 STRLEN foldlen;
89ebb4a3 6450 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6451
6452 parse_start = RExC_parse - 1;
a0d0e21e 6453
830247a4 6454 RExC_parse++;
a0d0e21e
LW
6455
6456 defchar:
58ae7d3f 6457 ender = 0;
eb160463
GS
6458 ret = reg_node(pRExC_state,
6459 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6460 s = STRING(ret);
830247a4
IZ
6461 for (len = 0, p = RExC_parse - 1;
6462 len < 127 && p < RExC_end;
a0d0e21e
LW
6463 len++)
6464 {
3dab1dad 6465 char * const oldp = p;
5b5a24f7 6466
bbe252da 6467 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6468 p = regwhite(p, RExC_end);
a0d0e21e
LW
6469 switch (*p) {
6470 case '^':
6471 case '$':
6472 case '.':
6473 case '[':
6474 case '(':
6475 case ')':
6476 case '|':
6477 goto loopdone;
6478 case '\\':
6479 switch (*++p) {
6480 case 'A':
1ed8eac0
JF
6481 case 'C':
6482 case 'X':
a0d0e21e 6483 case 'G':
2bf803e2 6484 case 'g':
a0d0e21e 6485 case 'Z':
b85d18e9 6486 case 'z':
a0d0e21e
LW
6487 case 'w':
6488 case 'W':
6489 case 'b':
6490 case 'B':
6491 case 's':
6492 case 'S':
6493 case 'd':
6494 case 'D':
a14b48bc
LW
6495 case 'p':
6496 case 'P':
fc8cd66c 6497 case 'N':
c74340f9 6498 case 'R':
99ed40ff 6499 case 'k':
a0d0e21e
LW
6500 --p;
6501 goto loopdone;
6502 case 'n':
6503 ender = '\n';
6504 p++;
a687059c 6505 break;
a0d0e21e
LW
6506 case 'r':
6507 ender = '\r';
6508 p++;
a687059c 6509 break;
a0d0e21e
LW
6510 case 't':
6511 ender = '\t';
6512 p++;
a687059c 6513 break;
a0d0e21e
LW
6514 case 'f':
6515 ender = '\f';
6516 p++;
a687059c 6517 break;
a0d0e21e 6518 case 'e':
c7f1f016 6519 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6520 p++;
a687059c 6521 break;
a0d0e21e 6522 case 'a':
c7f1f016 6523 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6524 p++;
a687059c 6525 break;
a0d0e21e 6526 case 'x':
a0ed51b3 6527 if (*++p == '{') {
1df70142 6528 char* const e = strchr(p, '}');
b81d288d 6529
b45f050a 6530 if (!e) {
830247a4 6531 RExC_parse = p + 1;
b45f050a
JF
6532 vFAIL("Missing right brace on \\x{}");
6533 }
de5f0749 6534 else {
a4c04bdc
NC
6535 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6536 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6537 STRLEN numlen = e - p - 1;
53305cf1 6538 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6539 if (ender > 0xff)
6540 RExC_utf8 = 1;
a0ed51b3
LW
6541 p = e + 1;
6542 }
a0ed51b3
LW
6543 }
6544 else {
a4c04bdc 6545 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6546 STRLEN numlen = 2;
53305cf1 6547 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6548 p += numlen;
6549 }
9e08bc66
TS
6550 if (PL_encoding && ender < 0x100)
6551 goto recode_encoding;
a687059c 6552 break;
a0d0e21e
LW
6553 case 'c':
6554 p++;
bbce6d69 6555 ender = UCHARAT(p++);
6556 ender = toCTRL(ender);
a687059c 6557 break;
a0d0e21e
LW
6558 case '0': case '1': case '2': case '3':case '4':
6559 case '5': case '6': case '7': case '8':case '9':
6560 if (*p == '0' ||
830247a4 6561 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6562 I32 flags = 0;
1df70142 6563 STRLEN numlen = 3;
53305cf1 6564 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6565 p += numlen;
6566 }
6567 else {
6568 --p;
6569 goto loopdone;
a687059c 6570 }
9e08bc66
TS
6571 if (PL_encoding && ender < 0x100)
6572 goto recode_encoding;
6573 break;
6574 recode_encoding:
6575 {
6576 SV* enc = PL_encoding;
6577 ender = reg_recode((const char)(U8)ender, &enc);
6578 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6579 vWARN(p, "Invalid escape in the specified encoding");
6580 RExC_utf8 = 1;
6581 }
a687059c 6582 break;
a0d0e21e 6583 case '\0':
830247a4 6584 if (p >= RExC_end)
b45f050a 6585 FAIL("Trailing \\");
a687059c 6586 /* FALL THROUGH */
a0d0e21e 6587 default:
041457d9 6588 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6589 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6590 goto normal_default;
a0d0e21e
LW
6591 }
6592 break;
a687059c 6593 default:
a0ed51b3 6594 normal_default:
fd400ab9 6595 if (UTF8_IS_START(*p) && UTF) {
1df70142 6596 STRLEN numlen;
5e12f4fb 6597 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6598 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6599 p += numlen;
6600 }
6601 else
6602 ender = *p++;
a0d0e21e 6603 break;
a687059c 6604 }
bbe252da 6605 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6606 p = regwhite(p, RExC_end);
60a8b682
JH
6607 if (UTF && FOLD) {
6608 /* Prime the casefolded buffer. */
ac7e0132 6609 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6610 }
a0d0e21e
LW
6611 if (ISMULT2(p)) { /* Back off on ?+*. */
6612 if (len)
6613 p = oldp;
16ea2a2e 6614 else if (UTF) {
80aecb99 6615 if (FOLD) {
60a8b682 6616 /* Emit all the Unicode characters. */
1df70142 6617 STRLEN numlen;
80aecb99
JH
6618 for (foldbuf = tmpbuf;
6619 foldlen;
6620 foldlen -= numlen) {
6621 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6622 if (numlen > 0) {
71207a34 6623 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6624 s += unilen;
6625 len += unilen;
6626 /* In EBCDIC the numlen
6627 * and unilen can differ. */
9dc45d57 6628 foldbuf += numlen;
47654450
JH
6629 if (numlen >= foldlen)
6630 break;
9dc45d57
JH
6631 }
6632 else
6633 break; /* "Can't happen." */
80aecb99
JH
6634 }
6635 }
6636 else {
71207a34 6637 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6638 if (unilen > 0) {
0ebc6274
JH
6639 s += unilen;
6640 len += unilen;
9dc45d57 6641 }
80aecb99 6642 }
a0ed51b3 6643 }
a0d0e21e
LW
6644 else {
6645 len++;
eb160463 6646 REGC((char)ender, s++);
a0d0e21e
LW
6647 }
6648 break;
a687059c 6649 }
16ea2a2e 6650 if (UTF) {
80aecb99 6651 if (FOLD) {
60a8b682 6652 /* Emit all the Unicode characters. */
1df70142 6653 STRLEN numlen;
80aecb99
JH
6654 for (foldbuf = tmpbuf;
6655 foldlen;
6656 foldlen -= numlen) {
6657 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6658 if (numlen > 0) {
71207a34 6659 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6660 len += unilen;
6661 s += unilen;
6662 /* In EBCDIC the numlen
6663 * and unilen can differ. */
9dc45d57 6664 foldbuf += numlen;
47654450
JH
6665 if (numlen >= foldlen)
6666 break;
9dc45d57
JH
6667 }
6668 else
6669 break;
80aecb99
JH
6670 }
6671 }
6672 else {
71207a34 6673 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6674 if (unilen > 0) {
0ebc6274
JH
6675 s += unilen;
6676 len += unilen;
9dc45d57 6677 }
80aecb99
JH
6678 }
6679 len--;
a0ed51b3
LW
6680 }
6681 else
eb160463 6682 REGC((char)ender, s++);
a0d0e21e
LW
6683 }
6684 loopdone:
830247a4 6685 RExC_parse = p - 1;
fac92740 6686 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6687 nextchar(pRExC_state);
793db0cb
JH
6688 {
6689 /* len is STRLEN which is unsigned, need to copy to signed */
6690 IV iv = len;
6691 if (iv < 0)
6692 vFAIL("Internal disaster");
6693 }
a0d0e21e
LW
6694 if (len > 0)
6695 *flagp |= HASWIDTH;
090f7165 6696 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6697 *flagp |= SIMPLE;
3dab1dad 6698
cd439c50 6699 if (SIZE_ONLY)
830247a4 6700 RExC_size += STR_SZ(len);
3dab1dad
YO
6701 else {
6702 STR_LEN(ret) = len;
830247a4 6703 RExC_emit += STR_SZ(len);
07be1b83 6704 }
3dab1dad 6705 }
a0d0e21e
LW
6706 break;
6707 }
a687059c 6708
a0d0e21e 6709 return(ret);
a687059c
LW
6710}
6711
873ef191 6712STATIC char *
5f66b61c 6713S_regwhite(char *p, const char *e)
5b5a24f7
CS
6714{
6715 while (p < e) {
6716 if (isSPACE(*p))
6717 ++p;
6718 else if (*p == '#') {
6719 do {
6720 p++;
6721 } while (p < e && *p != '\n');
6722 }
6723 else
6724 break;
6725 }
6726 return p;
6727}
6728
b8c5462f
JH
6729/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6730 Character classes ([:foo:]) can also be negated ([:^foo:]).
6731 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6732 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6733 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6734
6735#define POSIXCC_DONE(c) ((c) == ':')
6736#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6737#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6738
b8c5462f 6739STATIC I32
830247a4 6740S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6741{
97aff369 6742 dVAR;
936ed897 6743 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6744
830247a4 6745 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6746 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6747 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6748 const char c = UCHARAT(RExC_parse);
097eb12c 6749 char* const s = RExC_parse++;
b81d288d 6750
9a86a77b 6751 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6752 RExC_parse++;
6753 if (RExC_parse == RExC_end)
620e46c5 6754 /* Grandfather lone [:, [=, [. */
830247a4 6755 RExC_parse = s;
620e46c5 6756 else {
3dab1dad 6757 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6758 assert(*t == c);
6759
9a86a77b 6760 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6761 const char *posixcc = s + 1;
830247a4 6762 RExC_parse++; /* skip over the ending ] */
3dab1dad 6763
b8c5462f 6764 if (*s == ':') {
1df70142
AL
6765 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6766 const I32 skip = t - posixcc;
80916619
NC
6767
6768 /* Initially switch on the length of the name. */
6769 switch (skip) {
6770 case 4:
3dab1dad
YO
6771 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6772 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6773 break;
80916619
NC
6774 case 5:
6775 /* Names all of length 5. */
6776 /* alnum alpha ascii blank cntrl digit graph lower
6777 print punct space upper */
6778 /* Offset 4 gives the best switch position. */
6779 switch (posixcc[4]) {
6780 case 'a':
3dab1dad
YO
6781 if (memEQ(posixcc, "alph", 4)) /* alpha */
6782 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6783 break;
6784 case 'e':
3dab1dad
YO
6785 if (memEQ(posixcc, "spac", 4)) /* space */
6786 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6787 break;
6788 case 'h':
3dab1dad
YO
6789 if (memEQ(posixcc, "grap", 4)) /* graph */
6790 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6791 break;
6792 case 'i':
3dab1dad
YO
6793 if (memEQ(posixcc, "asci", 4)) /* ascii */
6794 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6795 break;
6796 case 'k':
3dab1dad
YO
6797 if (memEQ(posixcc, "blan", 4)) /* blank */
6798 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6799 break;
6800 case 'l':
3dab1dad
YO
6801 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6802 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6803 break;
6804 case 'm':
3dab1dad
YO
6805 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6806 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6807 break;
6808 case 'r':
3dab1dad
YO
6809 if (memEQ(posixcc, "lowe", 4)) /* lower */
6810 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6811 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6812 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6813 break;
6814 case 't':
3dab1dad
YO
6815 if (memEQ(posixcc, "digi", 4)) /* digit */
6816 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6817 else if (memEQ(posixcc, "prin", 4)) /* print */
6818 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6819 else if (memEQ(posixcc, "punc", 4)) /* punct */
6820 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6821 break;
b8c5462f
JH
6822 }
6823 break;
80916619 6824 case 6:
3dab1dad
YO
6825 if (memEQ(posixcc, "xdigit", 6))
6826 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6827 break;
6828 }
80916619
NC
6829
6830 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6831 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6832 t - s - 1, s + 1);
80916619
NC
6833 assert (posixcc[skip] == ':');
6834 assert (posixcc[skip+1] == ']');
b45f050a 6835 } else if (!SIZE_ONLY) {
b8c5462f 6836 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6837
830247a4 6838 /* adjust RExC_parse so the warning shows after
b45f050a 6839 the class closes */
9a86a77b 6840 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6841 RExC_parse++;
b45f050a
JF
6842 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6843 }
b8c5462f
JH
6844 } else {
6845 /* Maternal grandfather:
6846 * "[:" ending in ":" but not in ":]" */
830247a4 6847 RExC_parse = s;
767d463e 6848 }
620e46c5
JH
6849 }
6850 }
6851
b8c5462f
JH
6852 return namedclass;
6853}
6854
6855STATIC void
830247a4 6856S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6857{
97aff369 6858 dVAR;
3dab1dad 6859 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6860 const char *s = RExC_parse;
6861 const char c = *s++;
b8c5462f 6862
3dab1dad 6863 while (isALNUM(*s))
b8c5462f
JH
6864 s++;
6865 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6866 if (ckWARN(WARN_REGEXP))
6867 vWARN3(s+2,
6868 "POSIX syntax [%c %c] belongs inside character classes",
6869 c, c);
b45f050a
JF
6870
6871 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6872 if (POSIXCC_NOTYET(c)) {
830247a4 6873 /* adjust RExC_parse so the error shows after
b45f050a 6874 the class closes */
9a86a77b 6875 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6876 NOOP;
b45f050a
JF
6877 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6878 }
b8c5462f
JH
6879 }
6880 }
620e46c5
JH
6881}
6882
7f6f358c
YO
6883
6884/*
6885 parse a class specification and produce either an ANYOF node that
6886 matches the pattern. If the pattern matches a single char only and
6887 that char is < 256 then we produce an EXACT node instead.
6888*/
76e3520e 6889STATIC regnode *
3dab1dad 6890S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6891{
97aff369 6892 dVAR;
9ef43ace 6893 register UV value = 0;
9a86a77b 6894 register UV nextvalue;
3568d838 6895 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6896 register IV range = 0;
c277df42 6897 register regnode *ret;
ba210ebe 6898 STRLEN numlen;
ffc61ed2 6899 IV namedclass;
cbbf8932 6900 char *rangebegin = NULL;
936ed897 6901 bool need_class = 0;
c445ea15 6902 SV *listsv = NULL;
ffc61ed2 6903 UV n;
9e55ce06 6904 bool optimize_invert = TRUE;
cbbf8932 6905 AV* unicode_alternate = NULL;
1b2d223b
JH
6906#ifdef EBCDIC
6907 UV literal_endpoint = 0;
6908#endif
7f6f358c 6909 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6910
3dab1dad 6911 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6912 case we need to change the emitted regop to an EXACT. */
07be1b83 6913 const char * orig_parse = RExC_parse;
72f13be8 6914 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6915#ifndef DEBUGGING
6916 PERL_UNUSED_ARG(depth);
6917#endif
72f13be8 6918
3dab1dad 6919 DEBUG_PARSE("clas");
7f6f358c
YO
6920
6921 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6922 ret = reganode(pRExC_state, ANYOF, 0);
6923
6924 if (!SIZE_ONLY)
6925 ANYOF_FLAGS(ret) = 0;
6926
9a86a77b 6927 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6928 RExC_naughty++;
6929 RExC_parse++;
6930 if (!SIZE_ONLY)
6931 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6932 }
a0d0e21e 6933
73060fc4 6934 if (SIZE_ONLY) {
830247a4 6935 RExC_size += ANYOF_SKIP;
73060fc4
JH
6936 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6937 }
936ed897 6938 else {
830247a4 6939 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6940 if (FOLD)
6941 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6942 if (LOC)
6943 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6944 ANYOF_BITMAP_ZERO(ret);
396482e1 6945 listsv = newSVpvs("# comment\n");
a0d0e21e 6946 }
b8c5462f 6947
9a86a77b
JH
6948 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6949
b938889d 6950 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6951 checkposixcc(pRExC_state);
b8c5462f 6952
f064b6ad
HS
6953 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6954 if (UCHARAT(RExC_parse) == ']')
6955 goto charclassloop;
ffc61ed2 6956
fc8cd66c 6957parseit:
9a86a77b 6958 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6959
6960 charclassloop:
6961
6962 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6963
73b437c8 6964 if (!range)
830247a4 6965 rangebegin = RExC_parse;
ffc61ed2 6966 if (UTF) {
5e12f4fb 6967 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6968 RExC_end - RExC_parse,
9f7f3913 6969 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6970 RExC_parse += numlen;
6971 }
6972 else
6973 value = UCHARAT(RExC_parse++);
7f6f358c 6974
9a86a77b
JH
6975 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6976 if (value == '[' && POSIXCC(nextvalue))
830247a4 6977 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6978 else if (value == '\\') {
ffc61ed2 6979 if (UTF) {
5e12f4fb 6980 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6981 RExC_end - RExC_parse,
9f7f3913 6982 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6983 RExC_parse += numlen;
6984 }
6985 else
6986 value = UCHARAT(RExC_parse++);
470c3474 6987 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6988 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6989 * be a problem later if we want switch on Unicode.
6990 * A similar issue a little bit later when switching on
6991 * namedclass. --jhi */
ffc61ed2 6992 switch ((I32)value) {
b8c5462f
JH
6993 case 'w': namedclass = ANYOF_ALNUM; break;
6994 case 'W': namedclass = ANYOF_NALNUM; break;
6995 case 's': namedclass = ANYOF_SPACE; break;
6996 case 'S': namedclass = ANYOF_NSPACE; break;
6997 case 'd': namedclass = ANYOF_DIGIT; break;
6998 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6999 case 'N': /* Handle \N{NAME} in class */
7000 {
7001 /* We only pay attention to the first char of
7002 multichar strings being returned. I kinda wonder
7003 if this makes sense as it does change the behaviour
7004 from earlier versions, OTOH that behaviour was broken
7005 as well. */
7006 UV v; /* value is register so we cant & it /grrr */
7007 if (reg_namedseq(pRExC_state, &v)) {
7008 goto parseit;
7009 }
7010 value= v;
7011 }
7012 break;
ffc61ed2
JH
7013 case 'p':
7014 case 'P':
3dab1dad
YO
7015 {
7016 char *e;
af6f566e 7017 if (RExC_parse >= RExC_end)
2a4859cd 7018 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7019 if (*RExC_parse == '{') {
1df70142 7020 const U8 c = (U8)value;
ffc61ed2
JH
7021 e = strchr(RExC_parse++, '}');
7022 if (!e)
0da60cf5 7023 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7024 while (isSPACE(UCHARAT(RExC_parse)))
7025 RExC_parse++;
7026 if (e == RExC_parse)
0da60cf5 7027 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7028 n = e - RExC_parse;
ab13f0c7
JH
7029 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7030 n--;
ffc61ed2
JH
7031 }
7032 else {
7033 e = RExC_parse;
7034 n = 1;
7035 }
7036 if (!SIZE_ONLY) {
ab13f0c7
JH
7037 if (UCHARAT(RExC_parse) == '^') {
7038 RExC_parse++;
7039 n--;
7040 value = value == 'p' ? 'P' : 'p'; /* toggle */
7041 while (isSPACE(UCHARAT(RExC_parse))) {
7042 RExC_parse++;
7043 n--;
7044 }
7045 }
097eb12c
AL
7046 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7047 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7048 }
7049 RExC_parse = e + 1;
7050 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7051 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7052 }
f81125e2 7053 break;
b8c5462f
JH
7054 case 'n': value = '\n'; break;
7055 case 'r': value = '\r'; break;
7056 case 't': value = '\t'; break;
7057 case 'f': value = '\f'; break;
7058 case 'b': value = '\b'; break;
c7f1f016
NIS
7059 case 'e': value = ASCII_TO_NATIVE('\033');break;
7060 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7061 case 'x':
ffc61ed2 7062 if (*RExC_parse == '{') {
a4c04bdc
NC
7063 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7064 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7065 char * const e = strchr(RExC_parse++, '}');
b81d288d 7066 if (!e)
ffc61ed2 7067 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7068
7069 numlen = e - RExC_parse;
7070 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7071 RExC_parse = e + 1;
7072 }
7073 else {
a4c04bdc 7074 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7075 numlen = 2;
7076 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7077 RExC_parse += numlen;
7078 }
9e08bc66
TS
7079 if (PL_encoding && value < 0x100)
7080 goto recode_encoding;
b8c5462f
JH
7081 break;
7082 case 'c':
830247a4 7083 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7084 value = toCTRL(value);
7085 break;
7086 case '0': case '1': case '2': case '3': case '4':
7087 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7088 {
7089 I32 flags = 0;
7090 numlen = 3;
7091 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7092 RExC_parse += numlen;
7093 if (PL_encoding && value < 0x100)
7094 goto recode_encoding;
7095 break;
7096 }
7097 recode_encoding:
7098 {
7099 SV* enc = PL_encoding;
7100 value = reg_recode((const char)(U8)value, &enc);
7101 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7102 vWARN(RExC_parse,
7103 "Invalid escape in the specified encoding");
7104 break;
7105 }
1028017a 7106 default:
041457d9 7107 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7108 vWARN2(RExC_parse,
7109 "Unrecognized escape \\%c in character class passed through",
7110 (int)value);
1028017a 7111 break;
b8c5462f 7112 }
ffc61ed2 7113 } /* end of \blah */
1b2d223b
JH
7114#ifdef EBCDIC
7115 else
7116 literal_endpoint++;
7117#endif
ffc61ed2
JH
7118
7119 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7120
7121 if (!SIZE_ONLY && !need_class)
936ed897 7122 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7123
936ed897 7124 need_class = 1;
ffc61ed2
JH
7125
7126 /* a bad range like a-\d, a-[:digit:] ? */
7127 if (range) {
73b437c8 7128 if (!SIZE_ONLY) {
afd78fd5 7129 if (ckWARN(WARN_REGEXP)) {
097eb12c 7130 const int w =
afd78fd5
JH
7131 RExC_parse >= rangebegin ?
7132 RExC_parse - rangebegin : 0;
830247a4 7133 vWARN4(RExC_parse,
b45f050a 7134 "False [] range \"%*.*s\"",
097eb12c 7135 w, w, rangebegin);
afd78fd5 7136 }
3568d838
JH
7137 if (prevvalue < 256) {
7138 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7139 ANYOF_BITMAP_SET(ret, '-');
7140 }
7141 else {
7142 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7143 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7144 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7145 }
b8c5462f 7146 }
ffc61ed2
JH
7147
7148 range = 0; /* this was not a true range */
73b437c8 7149 }
ffc61ed2 7150
73b437c8 7151 if (!SIZE_ONLY) {
c49a72a9
NC
7152 const char *what = NULL;
7153 char yesno = 0;
7154
3568d838
JH
7155 if (namedclass > OOB_NAMEDCLASS)
7156 optimize_invert = FALSE;
e2962f66
JH
7157 /* Possible truncation here but in some 64-bit environments
7158 * the compiler gets heartburn about switch on 64-bit values.
7159 * A similar issue a little earlier when switching on value.
98f323fa 7160 * --jhi */
e2962f66 7161 switch ((I32)namedclass) {
73b437c8
JH
7162 case ANYOF_ALNUM:
7163 if (LOC)
936ed897 7164 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7165 else {
7166 for (value = 0; value < 256; value++)
7167 if (isALNUM(value))
936ed897 7168 ANYOF_BITMAP_SET(ret, value);
73b437c8 7169 }
c49a72a9
NC
7170 yesno = '+';
7171 what = "Word";
73b437c8
JH
7172 break;
7173 case ANYOF_NALNUM:
7174 if (LOC)
936ed897 7175 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7176 else {
7177 for (value = 0; value < 256; value++)
7178 if (!isALNUM(value))
936ed897 7179 ANYOF_BITMAP_SET(ret, value);
73b437c8 7180 }
c49a72a9
NC
7181 yesno = '!';
7182 what = "Word";
73b437c8 7183 break;
ffc61ed2 7184 case ANYOF_ALNUMC:
73b437c8 7185 if (LOC)
ffc61ed2 7186 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7187 else {
7188 for (value = 0; value < 256; value++)
ffc61ed2 7189 if (isALNUMC(value))
936ed897 7190 ANYOF_BITMAP_SET(ret, value);
73b437c8 7191 }
c49a72a9
NC
7192 yesno = '+';
7193 what = "Alnum";
73b437c8
JH
7194 break;
7195 case ANYOF_NALNUMC:
7196 if (LOC)
936ed897 7197 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7198 else {
7199 for (value = 0; value < 256; value++)
7200 if (!isALNUMC(value))
936ed897 7201 ANYOF_BITMAP_SET(ret, value);
73b437c8 7202 }
c49a72a9
NC
7203 yesno = '!';
7204 what = "Alnum";
73b437c8
JH
7205 break;
7206 case ANYOF_ALPHA:
7207 if (LOC)
936ed897 7208 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7209 else {
7210 for (value = 0; value < 256; value++)
7211 if (isALPHA(value))
936ed897 7212 ANYOF_BITMAP_SET(ret, value);
73b437c8 7213 }
c49a72a9
NC
7214 yesno = '+';
7215 what = "Alpha";
73b437c8
JH
7216 break;
7217 case ANYOF_NALPHA:
7218 if (LOC)
936ed897 7219 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7220 else {
7221 for (value = 0; value < 256; value++)
7222 if (!isALPHA(value))
936ed897 7223 ANYOF_BITMAP_SET(ret, value);
73b437c8 7224 }
c49a72a9
NC
7225 yesno = '!';
7226 what = "Alpha";
73b437c8
JH
7227 break;
7228 case ANYOF_ASCII:
7229 if (LOC)
936ed897 7230 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7231 else {
c7f1f016 7232#ifndef EBCDIC
1ba5c669
JH
7233 for (value = 0; value < 128; value++)
7234 ANYOF_BITMAP_SET(ret, value);
7235#else /* EBCDIC */
ffbc6a93 7236 for (value = 0; value < 256; value++) {
3a3c4447
JH
7237 if (isASCII(value))
7238 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7239 }
1ba5c669 7240#endif /* EBCDIC */
73b437c8 7241 }
c49a72a9
NC
7242 yesno = '+';
7243 what = "ASCII";
73b437c8
JH
7244 break;
7245 case ANYOF_NASCII:
7246 if (LOC)
936ed897 7247 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7248 else {
c7f1f016 7249#ifndef EBCDIC
1ba5c669
JH
7250 for (value = 128; value < 256; value++)
7251 ANYOF_BITMAP_SET(ret, value);
7252#else /* EBCDIC */
ffbc6a93 7253 for (value = 0; value < 256; value++) {
3a3c4447
JH
7254 if (!isASCII(value))
7255 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7256 }
1ba5c669 7257#endif /* EBCDIC */
73b437c8 7258 }
c49a72a9
NC
7259 yesno = '!';
7260 what = "ASCII";
73b437c8 7261 break;
aaa51d5e
JF
7262 case ANYOF_BLANK:
7263 if (LOC)
7264 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7265 else {
7266 for (value = 0; value < 256; value++)
7267 if (isBLANK(value))
7268 ANYOF_BITMAP_SET(ret, value);
7269 }
c49a72a9
NC
7270 yesno = '+';
7271 what = "Blank";
aaa51d5e
JF
7272 break;
7273 case ANYOF_NBLANK:
7274 if (LOC)
7275 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7276 else {
7277 for (value = 0; value < 256; value++)
7278 if (!isBLANK(value))
7279 ANYOF_BITMAP_SET(ret, value);
7280 }
c49a72a9
NC
7281 yesno = '!';
7282 what = "Blank";
aaa51d5e 7283 break;
73b437c8
JH
7284 case ANYOF_CNTRL:
7285 if (LOC)
936ed897 7286 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7287 else {
7288 for (value = 0; value < 256; value++)
7289 if (isCNTRL(value))
936ed897 7290 ANYOF_BITMAP_SET(ret, value);
73b437c8 7291 }
c49a72a9
NC
7292 yesno = '+';
7293 what = "Cntrl";
73b437c8
JH
7294 break;
7295 case ANYOF_NCNTRL:
7296 if (LOC)
936ed897 7297 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7298 else {
7299 for (value = 0; value < 256; value++)
7300 if (!isCNTRL(value))
936ed897 7301 ANYOF_BITMAP_SET(ret, value);
73b437c8 7302 }
c49a72a9
NC
7303 yesno = '!';
7304 what = "Cntrl";
ffc61ed2
JH
7305 break;
7306 case ANYOF_DIGIT:
7307 if (LOC)
7308 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7309 else {
7310 /* consecutive digits assumed */
7311 for (value = '0'; value <= '9'; value++)
7312 ANYOF_BITMAP_SET(ret, value);
7313 }
c49a72a9
NC
7314 yesno = '+';
7315 what = "Digit";
ffc61ed2
JH
7316 break;
7317 case ANYOF_NDIGIT:
7318 if (LOC)
7319 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7320 else {
7321 /* consecutive digits assumed */
7322 for (value = 0; value < '0'; value++)
7323 ANYOF_BITMAP_SET(ret, value);
7324 for (value = '9' + 1; value < 256; value++)
7325 ANYOF_BITMAP_SET(ret, value);
7326 }
c49a72a9
NC
7327 yesno = '!';
7328 what = "Digit";
73b437c8
JH
7329 break;
7330 case ANYOF_GRAPH:
7331 if (LOC)
936ed897 7332 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7333 else {
7334 for (value = 0; value < 256; value++)
7335 if (isGRAPH(value))
936ed897 7336 ANYOF_BITMAP_SET(ret, value);
73b437c8 7337 }
c49a72a9
NC
7338 yesno = '+';
7339 what = "Graph";
73b437c8
JH
7340 break;
7341 case ANYOF_NGRAPH:
7342 if (LOC)
936ed897 7343 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7344 else {
7345 for (value = 0; value < 256; value++)
7346 if (!isGRAPH(value))
936ed897 7347 ANYOF_BITMAP_SET(ret, value);
73b437c8 7348 }
c49a72a9
NC
7349 yesno = '!';
7350 what = "Graph";
73b437c8
JH
7351 break;
7352 case ANYOF_LOWER:
7353 if (LOC)
936ed897 7354 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7355 else {
7356 for (value = 0; value < 256; value++)
7357 if (isLOWER(value))
936ed897 7358 ANYOF_BITMAP_SET(ret, value);
73b437c8 7359 }
c49a72a9
NC
7360 yesno = '+';
7361 what = "Lower";
73b437c8
JH
7362 break;
7363 case ANYOF_NLOWER:
7364 if (LOC)
936ed897 7365 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7366 else {
7367 for (value = 0; value < 256; value++)
7368 if (!isLOWER(value))
936ed897 7369 ANYOF_BITMAP_SET(ret, value);
73b437c8 7370 }
c49a72a9
NC
7371 yesno = '!';
7372 what = "Lower";
73b437c8
JH
7373 break;
7374 case ANYOF_PRINT:
7375 if (LOC)
936ed897 7376 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7377 else {
7378 for (value = 0; value < 256; value++)
7379 if (isPRINT(value))
936ed897 7380 ANYOF_BITMAP_SET(ret, value);
73b437c8 7381 }
c49a72a9
NC
7382 yesno = '+';
7383 what = "Print";
73b437c8
JH
7384 break;
7385 case ANYOF_NPRINT:
7386 if (LOC)
936ed897 7387 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7388 else {
7389 for (value = 0; value < 256; value++)
7390 if (!isPRINT(value))
936ed897 7391 ANYOF_BITMAP_SET(ret, value);
73b437c8 7392 }
c49a72a9
NC
7393 yesno = '!';
7394 what = "Print";
73b437c8 7395 break;
aaa51d5e
JF
7396 case ANYOF_PSXSPC:
7397 if (LOC)
7398 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7399 else {
7400 for (value = 0; value < 256; value++)
7401 if (isPSXSPC(value))
7402 ANYOF_BITMAP_SET(ret, value);
7403 }
c49a72a9
NC
7404 yesno = '+';
7405 what = "Space";
aaa51d5e
JF
7406 break;
7407 case ANYOF_NPSXSPC:
7408 if (LOC)
7409 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7410 else {
7411 for (value = 0; value < 256; value++)
7412 if (!isPSXSPC(value))
7413 ANYOF_BITMAP_SET(ret, value);
7414 }
c49a72a9
NC
7415 yesno = '!';
7416 what = "Space";
aaa51d5e 7417 break;
73b437c8
JH
7418 case ANYOF_PUNCT:
7419 if (LOC)
936ed897 7420 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7421 else {
7422 for (value = 0; value < 256; value++)
7423 if (isPUNCT(value))
936ed897 7424 ANYOF_BITMAP_SET(ret, value);
73b437c8 7425 }
c49a72a9
NC
7426 yesno = '+';
7427 what = "Punct";
73b437c8
JH
7428 break;
7429 case ANYOF_NPUNCT:
7430 if (LOC)
936ed897 7431 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7432 else {
7433 for (value = 0; value < 256; value++)
7434 if (!isPUNCT(value))
936ed897 7435 ANYOF_BITMAP_SET(ret, value);
73b437c8 7436 }
c49a72a9
NC
7437 yesno = '!';
7438 what = "Punct";
ffc61ed2
JH
7439 break;
7440 case ANYOF_SPACE:
7441 if (LOC)
7442 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7443 else {
7444 for (value = 0; value < 256; value++)
7445 if (isSPACE(value))
7446 ANYOF_BITMAP_SET(ret, value);
7447 }
c49a72a9
NC
7448 yesno = '+';
7449 what = "SpacePerl";
ffc61ed2
JH
7450 break;
7451 case ANYOF_NSPACE:
7452 if (LOC)
7453 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7454 else {
7455 for (value = 0; value < 256; value++)
7456 if (!isSPACE(value))
7457 ANYOF_BITMAP_SET(ret, value);
7458 }
c49a72a9
NC
7459 yesno = '!';
7460 what = "SpacePerl";
73b437c8
JH
7461 break;
7462 case ANYOF_UPPER:
7463 if (LOC)
936ed897 7464 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7465 else {
7466 for (value = 0; value < 256; value++)
7467 if (isUPPER(value))
936ed897 7468 ANYOF_BITMAP_SET(ret, value);
73b437c8 7469 }
c49a72a9
NC
7470 yesno = '+';
7471 what = "Upper";
73b437c8
JH
7472 break;
7473 case ANYOF_NUPPER:
7474 if (LOC)
936ed897 7475 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7476 else {
7477 for (value = 0; value < 256; value++)
7478 if (!isUPPER(value))
936ed897 7479 ANYOF_BITMAP_SET(ret, value);
73b437c8 7480 }
c49a72a9
NC
7481 yesno = '!';
7482 what = "Upper";
73b437c8
JH
7483 break;
7484 case ANYOF_XDIGIT:
7485 if (LOC)
936ed897 7486 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7487 else {
7488 for (value = 0; value < 256; value++)
7489 if (isXDIGIT(value))
936ed897 7490 ANYOF_BITMAP_SET(ret, value);
73b437c8 7491 }
c49a72a9
NC
7492 yesno = '+';
7493 what = "XDigit";
73b437c8
JH
7494 break;
7495 case ANYOF_NXDIGIT:
7496 if (LOC)
936ed897 7497 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7498 else {
7499 for (value = 0; value < 256; value++)
7500 if (!isXDIGIT(value))
936ed897 7501 ANYOF_BITMAP_SET(ret, value);
73b437c8 7502 }
c49a72a9
NC
7503 yesno = '!';
7504 what = "XDigit";
73b437c8 7505 break;
f81125e2
JP
7506 case ANYOF_MAX:
7507 /* this is to handle \p and \P */
7508 break;
73b437c8 7509 default:
b45f050a 7510 vFAIL("Invalid [::] class");
73b437c8 7511 break;
b8c5462f 7512 }
c49a72a9
NC
7513 if (what) {
7514 /* Strings such as "+utf8::isWord\n" */
7515 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7516 }
b8c5462f 7517 if (LOC)
936ed897 7518 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7519 continue;
a0d0e21e 7520 }
ffc61ed2
JH
7521 } /* end of namedclass \blah */
7522
a0d0e21e 7523 if (range) {
eb160463 7524 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7525 const int w = RExC_parse - rangebegin;
7526 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7527 range = 0; /* not a valid range */
73b437c8 7528 }
a0d0e21e
LW
7529 }
7530 else {
3568d838 7531 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7532 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7533 RExC_parse[1] != ']') {
7534 RExC_parse++;
ffc61ed2
JH
7535
7536 /* a bad range like \w-, [:word:]- ? */
7537 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7538 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7539 const int w =
afd78fd5
JH
7540 RExC_parse >= rangebegin ?
7541 RExC_parse - rangebegin : 0;
830247a4 7542 vWARN4(RExC_parse,
b45f050a 7543 "False [] range \"%*.*s\"",
097eb12c 7544 w, w, rangebegin);
afd78fd5 7545 }
73b437c8 7546 if (!SIZE_ONLY)
936ed897 7547 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7548 } else
ffc61ed2
JH
7549 range = 1; /* yeah, it's a range! */
7550 continue; /* but do it the next time */
a0d0e21e 7551 }
a687059c 7552 }
ffc61ed2 7553
93733859 7554 /* now is the next time */
07be1b83 7555 /*stored += (value - prevvalue + 1);*/
ae5c130c 7556 if (!SIZE_ONLY) {
3568d838 7557 if (prevvalue < 256) {
1df70142 7558 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7559 IV i;
3568d838 7560#ifdef EBCDIC
1b2d223b
JH
7561 /* In EBCDIC [\x89-\x91] should include
7562 * the \x8e but [i-j] should not. */
7563 if (literal_endpoint == 2 &&
7564 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7565 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7566 {
3568d838
JH
7567 if (isLOWER(prevvalue)) {
7568 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7569 if (isLOWER(i))
7570 ANYOF_BITMAP_SET(ret, i);
7571 } else {
3568d838 7572 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7573 if (isUPPER(i))
7574 ANYOF_BITMAP_SET(ret, i);
7575 }
8ada0baa 7576 }
ffc61ed2 7577 else
8ada0baa 7578#endif
07be1b83
YO
7579 for (i = prevvalue; i <= ceilvalue; i++) {
7580 if (!ANYOF_BITMAP_TEST(ret,i)) {
7581 stored++;
7582 ANYOF_BITMAP_SET(ret, i);
7583 }
7584 }
3568d838 7585 }
a5961de5 7586 if (value > 255 || UTF) {
1df70142
AL
7587 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7588 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7589 stored+=2; /* can't optimize this class */
ffc61ed2 7590 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7591 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7592 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7593 prevnatvalue, natvalue);
7594 }
7595 else if (prevnatvalue == natvalue) {
7596 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7597 if (FOLD) {
89ebb4a3 7598 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7599 STRLEN foldlen;
1df70142 7600 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7601
e294cc5d
JH
7602#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7603 if (RExC_precomp[0] == ':' &&
7604 RExC_precomp[1] == '[' &&
7605 (f == 0xDF || f == 0x92)) {
7606 f = NATIVE_TO_UNI(f);
7607 }
7608#endif
c840d2a2
JH
7609 /* If folding and foldable and a single
7610 * character, insert also the folded version
7611 * to the charclass. */
9e55ce06 7612 if (f != value) {
e294cc5d
JH
7613#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7614 if ((RExC_precomp[0] == ':' &&
7615 RExC_precomp[1] == '[' &&
7616 (f == 0xA2 &&
7617 (value == 0xFB05 || value == 0xFB06))) ?
7618 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7619 foldlen == (STRLEN)UNISKIP(f) )
7620#else
eb160463 7621 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7622#endif
9e55ce06
JH
7623 Perl_sv_catpvf(aTHX_ listsv,
7624 "%04"UVxf"\n", f);
7625 else {
7626 /* Any multicharacter foldings
7627 * require the following transform:
7628 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7629 * where E folds into "pq" and F folds
7630 * into "rst", all other characters
7631 * fold to single characters. We save
7632 * away these multicharacter foldings,
7633 * to be later saved as part of the
7634 * additional "s" data. */
7635 SV *sv;
7636
7637 if (!unicode_alternate)
7638 unicode_alternate = newAV();
7639 sv = newSVpvn((char*)foldbuf, foldlen);
7640 SvUTF8_on(sv);
7641 av_push(unicode_alternate, sv);
7642 }
7643 }
254ba52a 7644
60a8b682
JH
7645 /* If folding and the value is one of the Greek
7646 * sigmas insert a few more sigmas to make the
7647 * folding rules of the sigmas to work right.
7648 * Note that not all the possible combinations
7649 * are handled here: some of them are handled
9e55ce06
JH
7650 * by the standard folding rules, and some of
7651 * them (literal or EXACTF cases) are handled
7652 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7653 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7654 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7655 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7656 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7657 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7658 }
7659 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7660 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7661 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7662 }
7663 }
ffc61ed2 7664 }
1b2d223b
JH
7665#ifdef EBCDIC
7666 literal_endpoint = 0;
7667#endif
8ada0baa 7668 }
ffc61ed2
JH
7669
7670 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7671 }
ffc61ed2 7672
936ed897 7673 if (need_class) {
4f66b38d 7674 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7675 if (SIZE_ONLY)
830247a4 7676 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7677 else
830247a4 7678 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7679 }
ffc61ed2 7680
7f6f358c
YO
7681
7682 if (SIZE_ONLY)
7683 return ret;
7684 /****** !SIZE_ONLY AFTER HERE *********/
7685
7686 if( stored == 1 && value < 256
7687 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7688 ) {
7689 /* optimize single char class to an EXACT node
7690 but *only* when its not a UTF/high char */
07be1b83
YO
7691 const char * cur_parse= RExC_parse;
7692 RExC_emit = (regnode *)orig_emit;
7693 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7694 ret = reg_node(pRExC_state,
7695 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7696 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7697 *STRING(ret)= (char)value;
7698 STR_LEN(ret)= 1;
7699 RExC_emit += STR_SZ(1);
7700 return ret;
7701 }
ae5c130c 7702 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7703 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7704 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7705 ) {
a0ed51b3 7706 for (value = 0; value < 256; ++value) {
936ed897 7707 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7708 UV fold = PL_fold[value];
ffc61ed2
JH
7709
7710 if (fold != value)
7711 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7712 }
7713 }
936ed897 7714 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7715 }
ffc61ed2 7716
ae5c130c 7717 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7718 if (optimize_invert &&
ffc61ed2
JH
7719 /* If the only flag is inversion. */
7720 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7721 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7722 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7723 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7724 }
7f6f358c 7725 {
097eb12c 7726 AV * const av = newAV();
ffc61ed2 7727 SV *rv;
9e55ce06 7728 /* The 0th element stores the character class description
6a0407ee 7729 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7730 * to initialize the appropriate swash (which gets stored in
7731 * the 1st element), and also useful for dumping the regnode.
7732 * The 2nd element stores the multicharacter foldings,
6a0407ee 7733 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7734 av_store(av, 0, listsv);
7735 av_store(av, 1, NULL);
9e55ce06 7736 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7737 rv = newRV_noinc((SV*)av);
19860706 7738 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7739 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7740 ARG_SET(ret, n);
a0ed51b3 7741 }
a0ed51b3
LW
7742 return ret;
7743}
7744
76e3520e 7745STATIC char*
830247a4 7746S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7747{
097eb12c 7748 char* const retval = RExC_parse++;
a0d0e21e 7749
4633a7c4 7750 for (;;) {
830247a4
IZ
7751 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7752 RExC_parse[2] == '#') {
e994fd66
AE
7753 while (*RExC_parse != ')') {
7754 if (RExC_parse == RExC_end)
7755 FAIL("Sequence (?#... not terminated");
830247a4 7756 RExC_parse++;
e994fd66 7757 }
830247a4 7758 RExC_parse++;
4633a7c4
LW
7759 continue;
7760 }
bbe252da 7761 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7762 if (isSPACE(*RExC_parse)) {
7763 RExC_parse++;
748a9306
LW
7764 continue;
7765 }
830247a4 7766 else if (*RExC_parse == '#') {
e994fd66
AE
7767 while (RExC_parse < RExC_end)
7768 if (*RExC_parse++ == '\n') break;
748a9306
LW
7769 continue;
7770 }
748a9306 7771 }
4633a7c4 7772 return retval;
a0d0e21e 7773 }
a687059c
LW
7774}
7775
7776/*
c277df42 7777- reg_node - emit a node
a0d0e21e 7778*/
76e3520e 7779STATIC regnode * /* Location. */
830247a4 7780S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7781{
97aff369 7782 dVAR;
c277df42 7783 register regnode *ptr;
504618e9 7784 regnode * const ret = RExC_emit;
07be1b83 7785 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7786
c277df42 7787 if (SIZE_ONLY) {
830247a4
IZ
7788 SIZE_ALIGN(RExC_size);
7789 RExC_size += 1;
a0d0e21e
LW
7790 return(ret);
7791 }
e2e6a0f1
YO
7792#ifdef DEBUGGING
7793 if (OP(RExC_emit) == 255)
7794 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7795 reg_name[op], OP(RExC_emit));
7796#endif
c277df42 7797 NODE_ALIGN_FILL(ret);
a0d0e21e 7798 ptr = ret;
c277df42 7799 FILL_ADVANCE_NODE(ptr, op);
fac92740 7800 if (RExC_offsets) { /* MJD */
07be1b83 7801 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7802 "reg_node", __LINE__,
7803 reg_name[op],
07be1b83
YO
7804 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7805 ? "Overwriting end of array!\n" : "OK",
7806 (UV)(RExC_emit - RExC_emit_start),
7807 (UV)(RExC_parse - RExC_start),
7808 (UV)RExC_offsets[0]));
ccb2c380 7809 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7810 }
07be1b83 7811
830247a4 7812 RExC_emit = ptr;
a0d0e21e 7813 return(ret);
a687059c
LW
7814}
7815
7816/*
a0d0e21e
LW
7817- reganode - emit a node with an argument
7818*/
76e3520e 7819STATIC regnode * /* Location. */
830247a4 7820S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7821{
97aff369 7822 dVAR;
c277df42 7823 register regnode *ptr;
504618e9 7824 regnode * const ret = RExC_emit;
07be1b83 7825 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7826
c277df42 7827 if (SIZE_ONLY) {
830247a4
IZ
7828 SIZE_ALIGN(RExC_size);
7829 RExC_size += 2;
6bda09f9
YO
7830 /*
7831 We can't do this:
7832
7833 assert(2==regarglen[op]+1);
7834
7835 Anything larger than this has to allocate the extra amount.
7836 If we changed this to be:
7837
7838 RExC_size += (1 + regarglen[op]);
7839
7840 then it wouldn't matter. Its not clear what side effect
7841 might come from that so its not done so far.
7842 -- dmq
7843 */
a0d0e21e
LW
7844 return(ret);
7845 }
e2e6a0f1
YO
7846#ifdef DEBUGGING
7847 if (OP(RExC_emit) == 255)
7848 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7849#endif
c277df42 7850 NODE_ALIGN_FILL(ret);
a0d0e21e 7851 ptr = ret;
c277df42 7852 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7853 if (RExC_offsets) { /* MJD */
07be1b83 7854 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7855 "reganode",
ccb2c380
MP
7856 __LINE__,
7857 reg_name[op],
07be1b83 7858 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7859 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7860 (UV)(RExC_emit - RExC_emit_start),
7861 (UV)(RExC_parse - RExC_start),
7862 (UV)RExC_offsets[0]));
ccb2c380 7863 Set_Cur_Node_Offset;
fac92740
MJD
7864 }
7865
830247a4 7866 RExC_emit = ptr;
a0d0e21e 7867 return(ret);
fe14fcc3
LW
7868}
7869
7870/*
cd439c50 7871- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7872*/
71207a34
AL
7873STATIC STRLEN
7874S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7875{
97aff369 7876 dVAR;
71207a34 7877 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7878}
7879
7880/*
a0d0e21e
LW
7881- reginsert - insert an operator in front of already-emitted operand
7882*
7883* Means relocating the operand.
7884*/
76e3520e 7885STATIC void
6bda09f9 7886S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7887{
97aff369 7888 dVAR;
c277df42
IZ
7889 register regnode *src;
7890 register regnode *dst;
7891 register regnode *place;
504618e9 7892 const int offset = regarglen[(U8)op];
6bda09f9 7893 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7894 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7895/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7896 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7897 if (SIZE_ONLY) {
6bda09f9 7898 RExC_size += size;
a0d0e21e
LW
7899 return;
7900 }
a687059c 7901
830247a4 7902 src = RExC_emit;
6bda09f9 7903 RExC_emit += size;
830247a4 7904 dst = RExC_emit;
40d049e4 7905 if (RExC_open_parens) {
6bda09f9 7906 int paren;
6d99fb9b 7907 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7908 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7909 if ( RExC_open_parens[paren] >= opnd ) {
7910 DEBUG_PARSE_FMT("open"," - %d",size);
7911 RExC_open_parens[paren] += size;
7912 } else {
7913 DEBUG_PARSE_FMT("open"," - %s","ok");
7914 }
7915 if ( RExC_close_parens[paren] >= opnd ) {
7916 DEBUG_PARSE_FMT("close"," - %d",size);
7917 RExC_close_parens[paren] += size;
7918 } else {
7919 DEBUG_PARSE_FMT("close"," - %s","ok");
7920 }
7921 }
6bda09f9 7922 }
40d049e4 7923
fac92740 7924 while (src > opnd) {
c277df42 7925 StructCopy(--src, --dst, regnode);
fac92740 7926 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7927 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7928 "reg_insert",
ccb2c380
MP
7929 __LINE__,
7930 reg_name[op],
07be1b83
YO
7931 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7932 ? "Overwriting end of array!\n" : "OK",
7933 (UV)(src - RExC_emit_start),
7934 (UV)(dst - RExC_emit_start),
7935 (UV)RExC_offsets[0]));
ccb2c380
MP
7936 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7937 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7938 }
7939 }
7940
a0d0e21e
LW
7941
7942 place = opnd; /* Op node, where operand used to be. */
fac92740 7943 if (RExC_offsets) { /* MJD */
07be1b83 7944 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7945 "reginsert",
ccb2c380
MP
7946 __LINE__,
7947 reg_name[op],
07be1b83 7948 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7949 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7950 (UV)(place - RExC_emit_start),
7951 (UV)(RExC_parse - RExC_start),
786e8c11 7952 (UV)RExC_offsets[0]));
ccb2c380 7953 Set_Node_Offset(place, RExC_parse);
45948336 7954 Set_Node_Length(place, 1);
fac92740 7955 }
c277df42
IZ
7956 src = NEXTOPER(place);
7957 FILL_ADVANCE_NODE(place, op);
7958 Zero(src, offset, regnode);
a687059c
LW
7959}
7960
7961/*
c277df42 7962- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7963- SEE ALSO: regtail_study
a0d0e21e 7964*/
097eb12c 7965/* TODO: All three parms should be const */
76e3520e 7966STATIC void
3dab1dad 7967S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7968{
97aff369 7969 dVAR;
c277df42 7970 register regnode *scan;
72f13be8 7971 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7972#ifndef DEBUGGING
7973 PERL_UNUSED_ARG(depth);
7974#endif
a0d0e21e 7975
c277df42 7976 if (SIZE_ONLY)
a0d0e21e
LW
7977 return;
7978
7979 /* Find last node. */
7980 scan = p;
7981 for (;;) {
504618e9 7982 regnode * const temp = regnext(scan);
3dab1dad
YO
7983 DEBUG_PARSE_r({
7984 SV * const mysv=sv_newmortal();
7985 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7986 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7987 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7988 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7989 (temp == NULL ? "->" : ""),
7990 (temp == NULL ? reg_name[OP(val)] : "")
7991 );
3dab1dad
YO
7992 });
7993 if (temp == NULL)
7994 break;
7995 scan = temp;
7996 }
7997
7998 if (reg_off_by_arg[OP(scan)]) {
7999 ARG_SET(scan, val - scan);
8000 }
8001 else {
8002 NEXT_OFF(scan) = val - scan;
8003 }
8004}
8005
07be1b83 8006#ifdef DEBUGGING
3dab1dad
YO
8007/*
8008- regtail_study - set the next-pointer at the end of a node chain of p to val.
8009- Look for optimizable sequences at the same time.
8010- currently only looks for EXACT chains.
07be1b83
YO
8011
8012This is expermental code. The idea is to use this routine to perform
8013in place optimizations on branches and groups as they are constructed,
8014with the long term intention of removing optimization from study_chunk so
8015that it is purely analytical.
8016
8017Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8018to control which is which.
8019
3dab1dad
YO
8020*/
8021/* TODO: All four parms should be const */
07be1b83 8022
3dab1dad
YO
8023STATIC U8
8024S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8025{
8026 dVAR;
8027 register regnode *scan;
07be1b83
YO
8028 U8 exact = PSEUDO;
8029#ifdef EXPERIMENTAL_INPLACESCAN
8030 I32 min = 0;
8031#endif
8032
3dab1dad
YO
8033 GET_RE_DEBUG_FLAGS_DECL;
8034
07be1b83 8035
3dab1dad
YO
8036 if (SIZE_ONLY)
8037 return exact;
8038
8039 /* Find last node. */
8040
8041 scan = p;
8042 for (;;) {
8043 regnode * const temp = regnext(scan);
07be1b83
YO
8044#ifdef EXPERIMENTAL_INPLACESCAN
8045 if (PL_regkind[OP(scan)] == EXACT)
8046 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8047 return EXACT;
8048#endif
3dab1dad
YO
8049 if ( exact ) {
8050 switch (OP(scan)) {
8051 case EXACT:
8052 case EXACTF:
8053 case EXACTFL:
8054 if( exact == PSEUDO )
8055 exact= OP(scan);
07be1b83
YO
8056 else if ( exact != OP(scan) )
8057 exact= 0;
3dab1dad
YO
8058 case NOTHING:
8059 break;
8060 default:
8061 exact= 0;
8062 }
8063 }
8064 DEBUG_PARSE_r({
8065 SV * const mysv=sv_newmortal();
8066 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8067 regprop(RExC_rx, mysv, scan);
eaf3ca90 8068 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8069 SvPV_nolen_const(mysv),
eaf3ca90
YO
8070 REG_NODE_NUM(scan),
8071 reg_name[exact]);
3dab1dad 8072 });
a0d0e21e
LW
8073 if (temp == NULL)
8074 break;
8075 scan = temp;
8076 }
07be1b83
YO
8077 DEBUG_PARSE_r({
8078 SV * const mysv_val=sv_newmortal();
8079 DEBUG_PARSE_MSG("");
8080 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8081 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8082 SvPV_nolen_const(mysv_val),
8083 (IV)REG_NODE_NUM(val),
8084 (IV)(val - scan)
07be1b83
YO
8085 );
8086 });
c277df42
IZ
8087 if (reg_off_by_arg[OP(scan)]) {
8088 ARG_SET(scan, val - scan);
a0ed51b3
LW
8089 }
8090 else {
c277df42
IZ
8091 NEXT_OFF(scan) = val - scan;
8092 }
3dab1dad
YO
8093
8094 return exact;
a687059c 8095}
07be1b83 8096#endif
a687059c
LW
8097
8098/*
a687059c
LW
8099 - regcurly - a little FSA that accepts {\d+,?\d*}
8100 */
79072805 8101STATIC I32
5f66b61c 8102S_regcurly(register const char *s)
a687059c
LW
8103{
8104 if (*s++ != '{')
8105 return FALSE;
f0fcb552 8106 if (!isDIGIT(*s))
a687059c 8107 return FALSE;
f0fcb552 8108 while (isDIGIT(*s))
a687059c
LW
8109 s++;
8110 if (*s == ',')
8111 s++;
f0fcb552 8112 while (isDIGIT(*s))
a687059c
LW
8113 s++;
8114 if (*s != '}')
8115 return FALSE;
8116 return TRUE;
8117}
8118
a687059c
LW
8119
8120/*
fd181c75 8121 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8122 */
8123void
097eb12c 8124Perl_regdump(pTHX_ const regexp *r)
a687059c 8125{
35ff7856 8126#ifdef DEBUGGING
97aff369 8127 dVAR;
c445ea15 8128 SV * const sv = sv_newmortal();
ab3bbdeb 8129 SV *dsv= sv_newmortal();
f8fc2ecf 8130 RXi_GET_DECL(r,ri);
a687059c 8131
f8fc2ecf 8132 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8133
8134 /* Header fields of interest. */
ab3bbdeb
YO
8135 if (r->anchored_substr) {
8136 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8137 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8138 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8139 "anchored %s%s at %"IVdf" ",
8140 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8141 (IV)r->anchored_offset);
ab3bbdeb
YO
8142 } else if (r->anchored_utf8) {
8143 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8144 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8145 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8146 "anchored utf8 %s%s at %"IVdf" ",
8147 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8148 (IV)r->anchored_offset);
ab3bbdeb
YO
8149 }
8150 if (r->float_substr) {
8151 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8152 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8153 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8154 "floating %s%s at %"IVdf"..%"UVuf" ",
8155 s, RE_SV_TAIL(r->float_substr),
7b0972df 8156 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8157 } else if (r->float_utf8) {
8158 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8159 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8160 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8161 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8162 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8163 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8164 }
33b8afdf 8165 if (r->check_substr || r->check_utf8)
b81d288d 8166 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8167 (const char *)
8168 (r->check_substr == r->float_substr
8169 && r->check_utf8 == r->float_utf8
8170 ? "(checking floating" : "(checking anchored"));
bbe252da 8171 if (r->extflags & RXf_NOSCAN)
c277df42 8172 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8173 if (r->extflags & RXf_CHECK_ALL)
c277df42 8174 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8175 if (r->check_substr || r->check_utf8)
c277df42
IZ
8176 PerlIO_printf(Perl_debug_log, ") ");
8177
f8fc2ecf
YO
8178 if (ri->regstclass) {
8179 regprop(r, sv, ri->regstclass);
1de06328 8180 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8181 }
bbe252da 8182 if (r->extflags & RXf_ANCH) {
774d564b 8183 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8184 if (r->extflags & RXf_ANCH_BOL)
774d564b 8185 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8186 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8187 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8188 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8189 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8190 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8191 PerlIO_printf(Perl_debug_log, "(GPOS)");
8192 PerlIO_putc(Perl_debug_log, ' ');
8193 }
bbe252da 8194 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8195 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8196 if (r->intflags & PREGf_SKIP)
760ac839 8197 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8198 if (r->intflags & PREGf_IMPLICIT)
760ac839 8199 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8200 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8201 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8202 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8203 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8204#else
96a5add6 8205 PERL_UNUSED_CONTEXT;
65e66c80 8206 PERL_UNUSED_ARG(r);
17c3b450 8207#endif /* DEBUGGING */
a687059c
LW
8208}
8209
8210/*
a0d0e21e
LW
8211- regprop - printable representation of opcode
8212*/
46fc3d4c 8213void
32fc9b6a 8214Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8215{
35ff7856 8216#ifdef DEBUGGING
97aff369 8217 dVAR;
9b155405 8218 register int k;
f8fc2ecf 8219 RXi_GET_DECL(prog,progi);
1de06328 8220 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8221
a0d0e21e 8222
54dc92de 8223 sv_setpvn(sv, "", 0);
8aa23a47 8224
03363afd 8225 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8226 /* It would be nice to FAIL() here, but this may be called from
8227 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8228 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8229 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8230
3dab1dad 8231 k = PL_regkind[OP(o)];
9b155405 8232
2a782b5b 8233 if (k == EXACT) {
396482e1 8234 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8235 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8236 * is a crude hack but it may be the best for now since
8237 * we have no flag "this EXACTish node was UTF-8"
8238 * --jhi */
8239 const char * const s =
ddc5bc0f 8240 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8241 PL_colors[0], PL_colors[1],
8242 PERL_PV_ESCAPE_UNI_DETECT |
8243 PERL_PV_PRETTY_ELIPSES |
8244 PERL_PV_PRETTY_LTGT
8245 );
8246 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8247 } else if (k == TRIE) {
3dab1dad 8248 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8249 * progi->data isn't available here */
1de06328
YO
8250 const char op = OP(o);
8251 const I32 n = ARG(o);
8252 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8253 (reg_ac_data *)progi->data->data[n] :
1de06328 8254 NULL;
3251b653
NC
8255 const reg_trie_data * const trie
8256 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8257
8258 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8259 DEBUG_TRIE_COMPILE_r(
8260 Perl_sv_catpvf(aTHX_ sv,
8261 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8262 (UV)trie->startstate,
1e2e3d02 8263 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8264 (UV)trie->wordcount,
8265 (UV)trie->minlen,
8266 (UV)trie->maxlen,
8267 (UV)TRIE_CHARCOUNT(trie),
8268 (UV)trie->uniquecharcount
8269 )
8270 );
8271 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8272 int i;
8273 int rangestart = -1;
f46cb337 8274 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8275 Perl_sv_catpvf(aTHX_ sv, "[");
8276 for (i = 0; i <= 256; i++) {
8277 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8278 if (rangestart == -1)
8279 rangestart = i;
8280 } else if (rangestart != -1) {
8281 if (i <= rangestart + 3)
8282 for (; rangestart < i; rangestart++)
8283 put_byte(sv, rangestart);
8284 else {
8285 put_byte(sv, rangestart);
8286 sv_catpvs(sv, "-");
8287 put_byte(sv, i - 1);
8288 }
8289 rangestart = -1;
8290 }
8291 }
8292 Perl_sv_catpvf(aTHX_ sv, "]");
8293 }
8294
a3621e74 8295 } else if (k == CURLY) {
cb434fcc 8296 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8297 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8298 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8299 }
2c2d71f5
JH
8300 else if (k == WHILEM && o->flags) /* Ordinal/of */
8301 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8302 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8303 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8304 else if (k == GOSUB)
6bda09f9 8305 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8306 else if (k == VERB) {
8307 if (!o->flags)
8308 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
f8fc2ecf 8309 (SV*)progi->data->data[ ARG( o ) ]);
e2e6a0f1 8310 } else if (k == LOGICAL)
04ebc1ab 8311 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8312 else if (k == ANYOF) {
8313 int i, rangestart = -1;
2d03de9c 8314 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8315
8316 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8317 static const char * const anyofs[] = {
653099ff
GS
8318 "\\w",
8319 "\\W",
8320 "\\s",
8321 "\\S",
8322 "\\d",
8323 "\\D",
8324 "[:alnum:]",
8325 "[:^alnum:]",
8326 "[:alpha:]",
8327 "[:^alpha:]",
8328 "[:ascii:]",
8329 "[:^ascii:]",
8330 "[:ctrl:]",
8331 "[:^ctrl:]",
8332 "[:graph:]",
8333 "[:^graph:]",
8334 "[:lower:]",
8335 "[:^lower:]",
8336 "[:print:]",
8337 "[:^print:]",
8338 "[:punct:]",
8339 "[:^punct:]",
8340 "[:upper:]",
aaa51d5e 8341 "[:^upper:]",
653099ff 8342 "[:xdigit:]",
aaa51d5e
JF
8343 "[:^xdigit:]",
8344 "[:space:]",
8345 "[:^space:]",
8346 "[:blank:]",
8347 "[:^blank:]"
653099ff
GS
8348 };
8349
19860706 8350 if (flags & ANYOF_LOCALE)
396482e1 8351 sv_catpvs(sv, "{loc}");
19860706 8352 if (flags & ANYOF_FOLD)
396482e1 8353 sv_catpvs(sv, "{i}");
653099ff 8354 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8355 if (flags & ANYOF_INVERT)
396482e1 8356 sv_catpvs(sv, "^");
ffc61ed2
JH
8357 for (i = 0; i <= 256; i++) {
8358 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8359 if (rangestart == -1)
8360 rangestart = i;
8361 } else if (rangestart != -1) {
8362 if (i <= rangestart + 3)
8363 for (; rangestart < i; rangestart++)
653099ff 8364 put_byte(sv, rangestart);
ffc61ed2
JH
8365 else {
8366 put_byte(sv, rangestart);
396482e1 8367 sv_catpvs(sv, "-");
ffc61ed2 8368 put_byte(sv, i - 1);
653099ff 8369 }
ffc61ed2 8370 rangestart = -1;
653099ff 8371 }
847a199f 8372 }
ffc61ed2
JH
8373
8374 if (o->flags & ANYOF_CLASS)
bb7a0f54 8375 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8376 if (ANYOF_CLASS_TEST(o,i))
8377 sv_catpv(sv, anyofs[i]);
8378
8379 if (flags & ANYOF_UNICODE)
396482e1 8380 sv_catpvs(sv, "{unicode}");
1aa99e6b 8381 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8382 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8383
8384 {
8385 SV *lv;
32fc9b6a 8386 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8387
ffc61ed2
JH
8388 if (lv) {
8389 if (sw) {
89ebb4a3 8390 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8391
ffc61ed2 8392 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8393 uvchr_to_utf8(s, i);
ffc61ed2 8394
3568d838 8395 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8396 if (rangestart == -1)
8397 rangestart = i;
8398 } else if (rangestart != -1) {
ffc61ed2
JH
8399 if (i <= rangestart + 3)
8400 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8401 const U8 * const e = uvchr_to_utf8(s,rangestart);
8402 U8 *p;
8403 for(p = s; p < e; p++)
ffc61ed2
JH
8404 put_byte(sv, *p);
8405 }
8406 else {
2d03de9c
AL
8407 const U8 *e = uvchr_to_utf8(s,rangestart);
8408 U8 *p;
8409 for (p = s; p < e; p++)
ffc61ed2 8410 put_byte(sv, *p);
396482e1 8411 sv_catpvs(sv, "-");
2d03de9c
AL
8412 e = uvchr_to_utf8(s, i-1);
8413 for (p = s; p < e; p++)
1df70142 8414 put_byte(sv, *p);
ffc61ed2
JH
8415 }
8416 rangestart = -1;
8417 }
19860706 8418 }
ffc61ed2 8419
396482e1 8420 sv_catpvs(sv, "..."); /* et cetera */
19860706 8421 }
fde631ed 8422
ffc61ed2 8423 {
2e0de35c 8424 char *s = savesvpv(lv);
c445ea15 8425 char * const origs = s;
b81d288d 8426
3dab1dad
YO
8427 while (*s && *s != '\n')
8428 s++;
b81d288d 8429
ffc61ed2 8430 if (*s == '\n') {
2d03de9c 8431 const char * const t = ++s;
ffc61ed2
JH
8432
8433 while (*s) {
8434 if (*s == '\n')
8435 *s = ' ';
8436 s++;
8437 }
8438 if (s[-1] == ' ')
8439 s[-1] = 0;
8440
8441 sv_catpv(sv, t);
fde631ed 8442 }
b81d288d 8443
ffc61ed2 8444 Safefree(origs);
fde631ed
JH
8445 }
8446 }
653099ff 8447 }
ffc61ed2 8448
653099ff
GS
8449 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8450 }
9b155405 8451 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8452 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8453#else
96a5add6 8454 PERL_UNUSED_CONTEXT;
65e66c80
SP
8455 PERL_UNUSED_ARG(sv);
8456 PERL_UNUSED_ARG(o);
f9049ba1 8457 PERL_UNUSED_ARG(prog);
17c3b450 8458#endif /* DEBUGGING */
35ff7856 8459}
a687059c 8460
cad2e5aa
JH
8461SV *
8462Perl_re_intuit_string(pTHX_ regexp *prog)
8463{ /* Assume that RE_INTUIT is set */
97aff369 8464 dVAR;
a3621e74 8465 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8466 PERL_UNUSED_CONTEXT;
8467
a3621e74 8468 DEBUG_COMPILE_r(
cfd0369c 8469 {
2d03de9c 8470 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8471 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8472
8473 if (!PL_colorset) reginitcolors();
8474 PerlIO_printf(Perl_debug_log,
a0288114 8475 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8476 PL_colors[4],
8477 prog->check_substr ? "" : "utf8 ",
8478 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8479 s,
8480 PL_colors[1],
8481 (strlen(s) > 60 ? "..." : ""));
8482 } );
8483
33b8afdf 8484 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8485}
8486
84da74a7 8487/*
f8149455 8488 pregfree()
84da74a7 8489
f8149455
YO
8490 handles refcounting and freeing the perl core regexp structure. When
8491 it is necessary to actually free the structure the first thing it
8492 does is call the 'free' method of the regexp_engine associated to to
8493 the regexp, allowing the handling of the void *pprivate; member
8494 first. (This routine is not overridable by extensions, which is why
8495 the extensions free is called first.)
8496
8497 See regdupe and regdupe_internal if you change anything here.
84da74a7 8498*/
f8149455 8499#ifndef PERL_IN_XSUB_RE
2b69d0c2 8500void
864dbfa3 8501Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8502{
27da23d5 8503 dVAR;
fc32ee4a 8504 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8505
7821416a
IZ
8506 if (!r || (--r->refcnt > 0))
8507 return;
f8149455
YO
8508
8509 CALLREGFREE_PVT(r); /* free the private data */
8510
43c5f42d
NC
8511 /* gcov results gave these as non-null 100% of the time, so there's no
8512 optimisation in checking them before calling Safefree */
8513 Safefree(r->precomp);
ed252734 8514 RX_MATCH_COPY_FREE(r);
f8c7b90f 8515#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8516 if (r->saved_copy)
8517 SvREFCNT_dec(r->saved_copy);
8518#endif
a193d654
GS
8519 if (r->substrs) {
8520 if (r->anchored_substr)
8521 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8522 if (r->anchored_utf8)
8523 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8524 if (r->float_substr)
8525 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8526 if (r->float_utf8)
8527 SvREFCNT_dec(r->float_utf8);
2779dcf1 8528 Safefree(r->substrs);
a193d654 8529 }
81714fb9
YO
8530 if (r->paren_names)
8531 SvREFCNT_dec(r->paren_names);
f8149455
YO
8532
8533 Safefree(r->startp);
8534 Safefree(r->endp);
8535 Safefree(r);
8536}
8537#endif
8538
8539/* regfree_internal()
8540
8541 Free the private data in a regexp. This is overloadable by
8542 extensions. Perl takes care of the regexp structure in pregfree(),
8543 this covers the *pprivate pointer which technically perldoesnt
8544 know about, however of course we have to handle the
8545 regexp_internal structure when no extension is in use.
8546
8547 Note this is called before freeing anything in the regexp
8548 structure.
8549 */
8550
8551void
8552Perl_regfree_internal(pTHX_ struct regexp *r)
8553{
8554 dVAR;
8555 RXi_GET_DECL(r,ri);
8556 GET_RE_DEBUG_FLAGS_DECL;
8557
8558 DEBUG_COMPILE_r({
8559 if (!PL_colorset)
8560 reginitcolors();
8561 {
8562 SV *dsv= sv_newmortal();
8563 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8564 dsv, r->precomp, r->prelen, 60);
8565 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8566 PL_colors[4],PL_colors[5],s);
8567 }
8568 });
8569
8570 Safefree(ri->offsets); /* 20010421 MJD */
f8fc2ecf
YO
8571 if (ri->data) {
8572 int n = ri->data->count;
f3548bdc
DM
8573 PAD* new_comppad = NULL;
8574 PAD* old_comppad;
4026c95a 8575 PADOFFSET refcnt;
dfad63ad 8576
c277df42 8577 while (--n >= 0) {
261faec3 8578 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8579 switch (ri->data->what[n]) {
c277df42 8580 case 's':
81714fb9 8581 case 'S':
55eed653 8582 case 'u':
f8fc2ecf 8583 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8584 break;
653099ff 8585 case 'f':
f8fc2ecf 8586 Safefree(ri->data->data[n]);
653099ff 8587 break;
dfad63ad 8588 case 'p':
f8fc2ecf 8589 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8590 break;
c277df42 8591 case 'o':
dfad63ad 8592 if (new_comppad == NULL)
cea2e8a9 8593 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8594 PAD_SAVE_LOCAL(old_comppad,
8595 /* Watch out for global destruction's random ordering. */
c445ea15 8596 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8597 );
b34c0dd4 8598 OP_REFCNT_LOCK;
f8fc2ecf 8599 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8600 OP_REFCNT_UNLOCK;
8601 if (!refcnt)
f8fc2ecf 8602 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8603
f3548bdc 8604 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8605 SvREFCNT_dec((SV*)new_comppad);
8606 new_comppad = NULL;
c277df42
IZ
8607 break;
8608 case 'n':
9e55ce06 8609 break;
07be1b83 8610 case 'T':
be8e71aa
YO
8611 { /* Aho Corasick add-on structure for a trie node.
8612 Used in stclass optimization only */
07be1b83 8613 U32 refcount;
f8fc2ecf 8614 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8615 OP_REFCNT_LOCK;
8616 refcount = --aho->refcount;
8617 OP_REFCNT_UNLOCK;
8618 if ( !refcount ) {
446bd890
NC
8619 PerlMemShared_free(aho->states);
8620 PerlMemShared_free(aho->fail);
446bd890
NC
8621 /* do this last!!!! */
8622 PerlMemShared_free(ri->data->data[n]);
8623 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8624 }
8625 }
8626 break;
a3621e74 8627 case 't':
07be1b83 8628 {
be8e71aa 8629 /* trie structure. */
07be1b83 8630 U32 refcount;
f8fc2ecf 8631 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8632 OP_REFCNT_LOCK;
8633 refcount = --trie->refcount;
8634 OP_REFCNT_UNLOCK;
8635 if ( !refcount ) {
446bd890 8636 PerlMemShared_free(trie->charmap);
446bd890
NC
8637 PerlMemShared_free(trie->states);
8638 PerlMemShared_free(trie->trans);
07be1b83 8639 if (trie->bitmap)
446bd890 8640 PerlMemShared_free(trie->bitmap);
07be1b83 8641 if (trie->wordlen)
446bd890 8642 PerlMemShared_free(trie->wordlen);
786e8c11 8643 if (trie->jump)
446bd890 8644 PerlMemShared_free(trie->jump);
786e8c11 8645 if (trie->nextword)
446bd890 8646 PerlMemShared_free(trie->nextword);
446bd890
NC
8647 /* do this last!!!! */
8648 PerlMemShared_free(ri->data->data[n]);
a3621e74 8649 }
07be1b83
YO
8650 }
8651 break;
c277df42 8652 default:
f8fc2ecf 8653 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8654 }
8655 }
f8fc2ecf
YO
8656 Safefree(ri->data->what);
8657 Safefree(ri->data);
a0d0e21e 8658 }
f8fc2ecf
YO
8659 if (ri->swap) {
8660 Safefree(ri->swap->startp);
8661 Safefree(ri->swap->endp);
8662 Safefree(ri->swap);
c74340f9 8663 }
f8fc2ecf 8664 Safefree(ri);
a687059c 8665}
c277df42 8666
84da74a7
YO
8667#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8668#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8669#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8670#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8671
8672/*
8673 regdupe - duplicate a regexp.
8674
8675 This routine is called by sv.c's re_dup and is expected to clone a
8676 given regexp structure. It is a no-op when not under USE_ITHREADS.
8677 (Originally this *was* re_dup() for change history see sv.c)
8678
f8149455
YO
8679 After all of the core data stored in struct regexp is duplicated
8680 the regexp_engine.dupe method is used to copy any private data
8681 stored in the *pprivate pointer. This allows extensions to handle
8682 any duplication it needs to do.
8683
8684 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8685*/
a3c0e9ca 8686#if defined(USE_ITHREADS)
f8149455 8687#ifndef PERL_IN_XSUB_RE
84da74a7 8688regexp *
f8149455 8689Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8690{
84da74a7 8691 dVAR;
f8fc2ecf 8692 regexp *ret;
f8149455 8693 int i, npar;
84da74a7 8694 struct reg_substr_datum *s;
644c02aa 8695
84da74a7
YO
8696 if (!r)
8697 return (REGEXP *)NULL;
8698
8699 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8700 return ret;
8701
f8149455 8702
84da74a7 8703 npar = r->nparens+1;
f8fc2ecf 8704 Newxz(ret, 1, regexp);
84da74a7
YO
8705 Newx(ret->startp, npar, I32);
8706 Copy(r->startp, ret->startp, npar, I32);
8707 Newx(ret->endp, npar, I32);
f8149455 8708 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8709
785a26d5
YO
8710 if (ret->substrs) {
8711 Newx(ret->substrs, 1, struct reg_substr_data);
8712 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8713 s->min_offset = r->substrs->data[i].min_offset;
8714 s->max_offset = r->substrs->data[i].max_offset;
8715 s->end_shift = r->substrs->data[i].end_shift;
8716 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8717 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8718 }
8719 } else
8720 ret->substrs = NULL;
f8149455
YO
8721
8722 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8723 ret->refcnt = r->refcnt;
8724 ret->minlen = r->minlen;
8725 ret->minlenret = r->minlenret;
8726 ret->prelen = r->prelen;
8727 ret->nparens = r->nparens;
8728 ret->lastparen = r->lastparen;
8729 ret->lastcloseparen = r->lastcloseparen;
8730 ret->intflags = r->intflags;
8731 ret->extflags = r->extflags;
8732
8733 ret->sublen = r->sublen;
8734
8735 ret->engine = r->engine;
8736
8737 ret->paren_names = hv_dup_inc(r->paren_names, param);
8738
8739 if (RX_MATCH_COPIED(ret))
8740 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8741 else
8742 ret->subbeg = NULL;
8743#ifdef PERL_OLD_COPY_ON_WRITE
8744 ret->saved_copy = NULL;
8745#endif
8746
8747 ret->pprivate = r->pprivate;
785a26d5
YO
8748 if (ret->pprivate)
8749 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8750
8751 ptr_table_store(PL_ptr_table, r, ret);
8752 return ret;
8753}
8754#endif /* PERL_IN_XSUB_RE */
8755
8756/*
8757 regdupe_internal()
8758
8759 This is the internal complement to regdupe() which is used to copy
8760 the structure pointed to by the *pprivate pointer in the regexp.
8761 This is the core version of the extension overridable cloning hook.
8762 The regexp structure being duplicated will be copied by perl prior
8763 to this and will be provided as the regexp *r argument, however
8764 with the /old/ structures pprivate pointer value. Thus this routine
8765 may override any copying normally done by perl.
8766
8767 It returns a pointer to the new regexp_internal structure.
8768*/
8769
8770void *
8771Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8772{
8773 dVAR;
8774 regexp_internal *reti;
8775 int len, npar;
8776 RXi_GET_DECL(r,ri);
8777
8778 npar = r->nparens+1;
8779 len = ri->offsets[0];
8780
8781 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8782 Copy(ri->program, reti->program, len+1, regnode);
8783
8784 if(ri->swap) {
8785 Newx(reti->swap, 1, regexp_paren_ofs);
8786 /* no need to copy these */
8787 Newx(reti->swap->startp, npar, I32);
8788 Newx(reti->swap->endp, npar, I32);
8789 } else {
8790 reti->swap = NULL;
8791 }
8792
84da74a7 8793
f8fc2ecf
YO
8794 reti->regstclass = NULL;
8795 if (ri->data) {
84da74a7 8796 struct reg_data *d;
f8fc2ecf 8797 const int count = ri->data->count;
84da74a7
YO
8798 int i;
8799
8800 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8801 char, struct reg_data);
8802 Newx(d->what, count, U8);
8803
8804 d->count = count;
8805 for (i = 0; i < count; i++) {
f8fc2ecf 8806 d->what[i] = ri->data->what[i];
84da74a7 8807 switch (d->what[i]) {
55eed653 8808 /* legal options are one of: sSfpontTu
84da74a7
YO
8809 see also regcomp.h and pregfree() */
8810 case 's':
81714fb9 8811 case 'S':
0536c0a7 8812 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8813 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8814 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8815 break;
84da74a7
YO
8816 case 'f':
8817 /* This is cheating. */
8818 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8819 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8820 struct regnode_charclass_class);
f8fc2ecf 8821 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8822 break;
8823 case 'o':
bbe252da
YO
8824 /* Compiled op trees are readonly and in shared memory,
8825 and can thus be shared without duplication. */
84da74a7 8826 OP_REFCNT_LOCK;
f8fc2ecf 8827 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8828 OP_REFCNT_UNLOCK;
8829 break;
23eab42c
NC
8830 case 'T':
8831 /* Trie stclasses are readonly and can thus be shared
8832 * without duplication. We free the stclass in pregfree
8833 * when the corresponding reg_ac_data struct is freed.
8834 */
8835 reti->regstclass= ri->regstclass;
8836 /* Fall through */
84da74a7 8837 case 't':
84da74a7 8838 OP_REFCNT_LOCK;
0536c0a7 8839 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8840 OP_REFCNT_UNLOCK;
0536c0a7
NC
8841 /* Fall through */
8842 case 'n':
8843 d->data[i] = ri->data->data[i];
84da74a7 8844 break;
84da74a7 8845 default:
f8fc2ecf 8846 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8847 }
8848 }
8849
f8fc2ecf 8850 reti->data = d;
84da74a7
YO
8851 }
8852 else
f8fc2ecf 8853 reti->data = NULL;
84da74a7 8854
f8fc2ecf
YO
8855 Newx(reti->offsets, 2*len+1, U32);
8856 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
81714fb9 8857
f8149455 8858 return (void*)reti;
84da74a7 8859}
f8149455
YO
8860
8861#endif /* USE_ITHREADS */
84da74a7 8862
de8c5301
YO
8863/*
8864 reg_stringify()
8865
8866 converts a regexp embedded in a MAGIC struct to its stringified form,
8867 caching the converted form in the struct and returns the cached
8868 string.
8869
8870 If lp is nonnull then it is used to return the length of the
8871 resulting string
8872
8873 If flags is nonnull and the returned string contains UTF8 then
f8149455 8874 (*flags & 1) will be true.
de8c5301
YO
8875
8876 If haseval is nonnull then it is used to return whether the pattern
8877 contains evals.
8878
8879 Normally called via macro:
8880
f8149455 8881 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8882
8883 And internally with
8884
f8149455 8885 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8886
8887 See sv_2pv_flags() in sv.c for an example of internal usage.
8888
8889 */
f8149455 8890#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8891char *
8892Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8893 dVAR;
8894 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8895
de8c5301
YO
8896 if (!mg->mg_ptr) {
8897 const char *fptr = "msix";
8898 char reflags[6];
8899 char ch;
8900 int left = 0;
8901 int right = 4;
8902 bool need_newline = 0;
bbe252da 8903 U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12);
de8c5301
YO
8904
8905 while((ch = *fptr++)) {
8906 if(reganch & 1) {
8907 reflags[left++] = ch;
8908 }
8909 else {
8910 reflags[right--] = ch;
8911 }
8912 reganch >>= 1;
8913 }
8914 if(left != 4) {
8915 reflags[left] = '-';
8916 left = 5;
8917 }
8918
8919 mg->mg_len = re->prelen + 4 + left;
8920 /*
8921 * If /x was used, we have to worry about a regex ending with a
8922 * comment later being embedded within another regex. If so, we don't
8923 * want this regex's "commentization" to leak out to the right part of
8924 * the enclosing regex, we must cap it with a newline.
8925 *
8926 * So, if /x was used, we scan backwards from the end of the regex. If
8927 * we find a '#' before we find a newline, we need to add a newline
8928 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8929 * we don't need to add anything. -jfriedl
8930 */
bbe252da 8931 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8932 const char *endptr = re->precomp + re->prelen;
8933 while (endptr >= re->precomp) {
8934 const char c = *(endptr--);
8935 if (c == '\n')
8936 break; /* don't need another */
8937 if (c == '#') {
8938 /* we end while in a comment, so we need a newline */
8939 mg->mg_len++; /* save space for it */
8940 need_newline = 1; /* note to add it */
8941 break;
8942 }
8943 }
8944 }
8945
8946 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8947 mg->mg_ptr[0] = '(';
8948 mg->mg_ptr[1] = '?';
8949 Copy(reflags, mg->mg_ptr+2, left, char);
8950 *(mg->mg_ptr+left+2) = ':';
8951 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8952 if (need_newline)
8953 mg->mg_ptr[mg->mg_len - 2] = '\n';
8954 mg->mg_ptr[mg->mg_len - 1] = ')';
8955 mg->mg_ptr[mg->mg_len] = 0;
8956 }
8957 if (haseval)
f8149455 8958 *haseval = re->seen_evals;
de8c5301 8959 if (flags)
bbe252da 8960 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8961
8962 if (lp)
8963 *lp = mg->mg_len;
8964 return mg->mg_ptr;
8965}
8966
c277df42
IZ
8967/*
8968 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8969 */
8970regnode *
864dbfa3 8971Perl_regnext(pTHX_ register regnode *p)
c277df42 8972{
97aff369 8973 dVAR;
c277df42
IZ
8974 register I32 offset;
8975
f8fc2ecf 8976 if (!p)
c277df42
IZ
8977 return(NULL);
8978
8979 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8980 if (offset == 0)
8981 return(NULL);
8982
c277df42 8983 return(p+offset);
c277df42 8984}
76234dfb 8985#endif
c277df42 8986
01f988be 8987STATIC void
cea2e8a9 8988S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8989{
8990 va_list args;
8991 STRLEN l1 = strlen(pat1);
8992 STRLEN l2 = strlen(pat2);
8993 char buf[512];
06bf62c7 8994 SV *msv;
73d840c0 8995 const char *message;
c277df42
IZ
8996
8997 if (l1 > 510)
8998 l1 = 510;
8999 if (l1 + l2 > 510)
9000 l2 = 510 - l1;
9001 Copy(pat1, buf, l1 , char);
9002 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9003 buf[l1 + l2] = '\n';
9004 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9005#ifdef I_STDARG
9006 /* ANSI variant takes additional second argument */
c277df42 9007 va_start(args, pat2);
8736538c
AS
9008#else
9009 va_start(args);
9010#endif
5a844595 9011 msv = vmess(buf, &args);
c277df42 9012 va_end(args);
cfd0369c 9013 message = SvPV_const(msv,l1);
c277df42
IZ
9014 if (l1 > 512)
9015 l1 = 512;
9016 Copy(message, buf, l1 , char);
197cf9b9 9017 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9018 Perl_croak(aTHX_ "%s", buf);
c277df42 9019}
a0ed51b3
LW
9020
9021/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9022
76234dfb 9023#ifndef PERL_IN_XSUB_RE
a0ed51b3 9024void
864dbfa3 9025Perl_save_re_context(pTHX)
b81d288d 9026{
97aff369 9027 dVAR;
1ade1aa1
NC
9028
9029 struct re_save_state *state;
9030
9031 SAVEVPTR(PL_curcop);
9032 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9033
9034 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9035 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9036 SSPUSHINT(SAVEt_RE_STATE);
9037
46ab3289 9038 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9039
a0ed51b3 9040 PL_reg_start_tmp = 0;
a0ed51b3 9041 PL_reg_start_tmpl = 0;
c445ea15 9042 PL_reg_oldsaved = NULL;
a5db57d6 9043 PL_reg_oldsavedlen = 0;
a5db57d6 9044 PL_reg_maxiter = 0;
a5db57d6 9045 PL_reg_leftiter = 0;
c445ea15 9046 PL_reg_poscache = NULL;
a5db57d6 9047 PL_reg_poscache_size = 0;
1ade1aa1
NC
9048#ifdef PERL_OLD_COPY_ON_WRITE
9049 PL_nrs = NULL;
9050#endif
ada6e8a9 9051
c445ea15
AL
9052 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9053 if (PL_curpm) {
9054 const REGEXP * const rx = PM_GETRE(PL_curpm);
9055 if (rx) {
1df70142 9056 U32 i;
ada6e8a9 9057 for (i = 1; i <= rx->nparens; i++) {
1df70142 9058 char digits[TYPE_CHARS(long)];
d9fad198 9059 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9060 GV *const *const gvp
9061 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9062
b37c2d43
AL
9063 if (gvp) {
9064 GV * const gv = *gvp;
9065 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9066 save_scalar(gv);
49f27e4b 9067 }
ada6e8a9
AMS
9068 }
9069 }
9070 }
a0ed51b3 9071}
76234dfb 9072#endif
51371543 9073
51371543 9074static void
acfe0abc 9075clear_re(pTHX_ void *r)
51371543 9076{
97aff369 9077 dVAR;
51371543
GS
9078 ReREFCNT_dec((regexp *)r);
9079}
ffbc6a93 9080
a28509cc
AL
9081#ifdef DEBUGGING
9082
9083STATIC void
9084S_put_byte(pTHX_ SV *sv, int c)
9085{
9086 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9087 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9088 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9089 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9090 else
9091 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9092}
9093
786e8c11 9094
3dab1dad
YO
9095#define CLEAR_OPTSTART \
9096 if (optstart) STMT_START { \
70685ca0 9097 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9098 optstart=NULL; \
9099 } STMT_END
9100
786e8c11 9101#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9102
b5a2f8d8
NC
9103STATIC const regnode *
9104S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9105 const regnode *last, const regnode *plast,
9106 SV* sv, I32 indent, U32 depth)
a28509cc 9107{
97aff369 9108 dVAR;
786e8c11 9109 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9110 register const regnode *next;
3dab1dad 9111 const regnode *optstart= NULL;
f8fc2ecf 9112 RXi_GET_DECL(r,ri);
3dab1dad 9113 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 9114
786e8c11
YO
9115#ifdef DEBUG_DUMPUNTIL
9116 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9117 last ? last-start : 0,plast ? plast-start : 0);
9118#endif
9119
9120 if (plast && plast < last)
9121 last= plast;
9122
9123 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
9124 /* While that wasn't END last time... */
9125
9126 NODE_ALIGN(node);
9127 op = OP(node);
de734bd5 9128 if (op == CLOSE || op == WHILEM)
786e8c11 9129 indent--;
b5a2f8d8 9130 next = regnext((regnode *)node);
07be1b83 9131
a28509cc 9132 /* Where, what. */
8e11feef 9133 if (OP(node) == OPTIMIZED) {
e68ec53f 9134 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9135 optstart = node;
3dab1dad 9136 else
8e11feef 9137 goto after_print;
3dab1dad
YO
9138 } else
9139 CLEAR_OPTSTART;
07be1b83 9140
32fc9b6a 9141 regprop(r, sv, node);
a28509cc 9142 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9143 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
9144
9145 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
9146 if (next == NULL) /* Next ptr. */
9147 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
9148 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9149 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
9150 else
9151 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 9152
1de06328 9153 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 9154 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
9155 }
9156
a28509cc
AL
9157 after_print:
9158 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9159 assert(next);
9160 {
9161 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9162 ? regnext((regnode *)next)
9163 : next);
be8e71aa
YO
9164 if (last && nnode > last)
9165 nnode = last;
786e8c11 9166 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9167 }
a28509cc
AL
9168 }
9169 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9170 assert(next);
786e8c11 9171 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9172 }
9173 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9174 const regnode *this_trie = node;
1de06328 9175 const char op = OP(node);
a28509cc 9176 const I32 n = ARG(node);
1de06328 9177 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9178 (reg_ac_data *)ri->data->data[n] :
1de06328 9179 NULL;
3251b653
NC
9180 const reg_trie_data * const trie =
9181 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9182#ifdef DEBUGGING
9183 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9184#endif
786e8c11 9185 const regnode *nextbranch= NULL;
a28509cc 9186 I32 word_idx;
1de06328 9187 sv_setpvn(sv, "", 0);
786e8c11 9188 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9189 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9190
9191 PerlIO_printf(Perl_debug_log, "%*s%s ",
9192 (int)(2*(indent+3)), "",
9193 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9194 PL_colors[0], PL_colors[1],
9195 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9196 PERL_PV_PRETTY_ELIPSES |
7f69552c 9197 PERL_PV_PRETTY_LTGT
786e8c11
YO
9198 )
9199 : "???"
9200 );
9201 if (trie->jump) {
40d049e4 9202 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9203 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9204 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9205 if (dist) {
9206 if (!nextbranch)
24b23f37 9207 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9208 DUMPUNTIL(this_trie + dist, nextbranch);
9209 }
786e8c11
YO
9210 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9211 nextbranch= regnext((regnode *)nextbranch);
9212 } else {
9213 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9214 }
786e8c11
YO
9215 }
9216 if (last && next > last)
9217 node= last;
9218 else
9219 node= next;
a28509cc 9220 }
786e8c11
YO
9221 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9222 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9223 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9224 }
9225 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9226 assert(next);
786e8c11 9227 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9228 }
9229 else if ( op == PLUS || op == STAR) {
786e8c11 9230 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9231 }
9232 else if (op == ANYOF) {
9233 /* arglen 1 + class block */
9234 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9235 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9236 node = NEXTOPER(node);
9237 }
9238 else if (PL_regkind[(U8)op] == EXACT) {
9239 /* Literal string, where present. */
9240 node += NODE_SZ_STR(node) - 1;
9241 node = NEXTOPER(node);
9242 }
9243 else {
9244 node = NEXTOPER(node);
9245 node += regarglen[(U8)op];
9246 }
9247 if (op == CURLYX || op == OPEN)
786e8c11 9248 indent++;
a28509cc 9249 }
3dab1dad 9250 CLEAR_OPTSTART;
786e8c11 9251#ifdef DEBUG_DUMPUNTIL
70685ca0 9252 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9253#endif
1de06328 9254 return node;
a28509cc
AL
9255}
9256
9257#endif /* DEBUGGING */
9258
241d1a3b
NC
9259/*
9260 * Local variables:
9261 * c-indentation-style: bsd
9262 * c-basic-offset: 4
9263 * indent-tabs-mode: t
9264 * End:
9265 *
37442d52
RGS
9266 * ex: set ts=8 sts=4 sw=4 noet:
9267 */